In [16]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import corr
# Obtain the Spark session for this notebook (reuses an active one if present).
spark = (
    SparkSession.builder
    .appName('lrex')
    .getOrCreate()
)

In [17]:
# Load the cleaned dataset; the first row holds column names and column
# types are inferred from the data.
# NOTE(review): the absolute path is machine-specific -- consider a
# configurable data directory.
dtFrame = spark.read.csv(
    path='/Users/manoj/desktop/cleaneddata.csv',
    header=True,
    inferSchema=True,
)

In [18]:
# Print the frame's shape as a (rows, columns) tuple.
row_count = dtFrame.count()
column_count = len(dtFrame.columns)
print((row_count, column_count))

(30162, 10)


In [19]:
# Print the column names, inferred types and nullability of the loaded frame.
dtFrame.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Sector: string (nullable = true)
 |-- Educational_Qualification: string (nullable = true)
 |-- Marital_Status: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Relationship: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Income: integer (nullable = true)



In [37]:
# Summary statistics for every column of the loaded frame.
# DataFrame.show() only prints and returns None, so the summary frame is bound
# first and displayed in a separate step; previously dfFrame1 was always None.
dfFrame1 = dtFrame.describe()
dfFrame1.show(5)

+-------+------------------+-----------+-------------------------+--------------+----------------+------------+------------------+------+----------+-------------------+
|summary|               Age|     Sector|Educational_Qualification|Marital_Status|      Occupation|Relationship|              Race|Gender|   Country|             Income|
+-------+------------------+-----------+-------------------------+--------------+----------------+------------+------------------+------+----------+-------------------+
|  count|             30162|      30162|                    30162|         30162|           30162|       30162|             30162| 30162|     30162|              30162|
|   mean|38.437901995888865|       null|                     null|          null|            null|        null|              null|  null|      null|0.24892248524633645|
| stddev|13.134664776856031|       null|                     null|          null|            null|        null|              null|  null|      null| 0.4323

In [38]:
# Correlation between Age and Income.
# Uses dtFrame (the frame loaded from the CSV): `dtFrame1` is not assigned
# anywhere in the visible notebook, so the original line raised NameError on a
# fresh kernel (Restart & Run All).
dtFrame.select(corr('Age', 'Income')).show()

+-------------------+
|  corr(Age, Income)|
+-------------------+
|0.24199813626611658|
+-------------------+



In [29]:
# Pack the single predictor column (Age) into the vector column 'features'.
assembler = VectorAssembler(
    inputCols=['Age'],
    outputCol='features',
)
dtFrame2 = assembler.transform(dtFrame)

In [31]:
# Keep only the assembled feature vector and the label column.
dtFrame2 = dtFrame2.select('features', 'Income')

In [32]:
# Configure a linear regression with Income as the label (at most 10
# iterations) and fit it on the assembled frame.
lr = LinearRegression(
    labelCol='Income',
    maxIter=10,
)
lrModel = lr.fit(dtFrame2)

In [33]:
# Report the fitted intercept and the coefficient array (one entry for Age).
print(f'Intercept: {lrModel.intercept}\nCoefficient: {lrModel.coefficients.values}')

Intercept: -0.057298075321775326
Coefficient: [0.00796663]


In [34]:
# Keep a handle on the fitted model's summary; the cells below read its
# metrics, residuals and predictions.
modelsummary = lrModel.summary

In [35]:
# Goodness-of-fit and error metrics from the model summary.
print(f'Explained Variance: {modelsummary.explainedVariance}\nR Squared: {modelsummary.r2}')
# NOTE(review): standard errors and p-values print two entries for the single
# feature -- presumably the last entry belongs to the intercept; confirm
# against the Spark documentation.
print(f'Std. Error: {modelsummary.coefficientStandardErrors}\nRoot Mean Squared Err: {modelsummary.rootMeanSquaredError}')
print(f'Mean Absolute Err: {modelsummary.meanAbsoluteError}\nMean Squared Err: {modelsummary.meanSquaredError}')
print(f'P-value: {modelsummary.pValues}')
# Preview the first five residuals.
modelsummary.residuals.show(5)
# Solver diagnostics: number of iterations and the recorded objective history.
print(f'Num Iterations: {modelsummary.totalIterations}\nObjective History: {modelsummary.objectiveHistory}')

Explained Variance: 0.01094896157178559
R Squared: 0.058563097956275234
Std. Error: [0.00018392577341383085, 0.007471068558553936]
Root Mean Squared Err: 0.4195367922046183
Mean Absolute Err: 0.35202224002668153
Mean Squared Err: 0.17601112001334107
P-value: [0.0, 1.7763568394002505e-14]
+--------------------+
|           residuals|
+--------------------+
| -0.5959656217777912|
|-0.37289996910964657|
|-0.26933377322800794|
|-0.21356736006097177|
|-0.24543388187070675|
+--------------------+
only showing top 5 rows

Num Iterations: 1
Objective History: [0.0]


In [36]:
# Preview the first five rows of the summary's predictions frame
# (features, label and predicted value).
modelsummary.predictions.show(5)

+--------+------+-------------------+
|features|Income|         prediction|
+--------+------+-------------------+
|  [82.0]|   0.0| 0.5959656217777912|
|  [54.0]|   0.0|0.37289996910964657|
|  [41.0]|   0.0|0.26933377322800794|
|  [34.0]|   0.0|0.21356736006097177|
|  [38.0]|   0.0|0.24543388187070675|
+--------+------+-------------------+
only showing top 5 rows

