In [43]:
import os

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import corr
# Reuse the active Spark session if one exists; otherwise start one named 'lrex'.
spark = (
    SparkSession.builder
    .appName('lrex')
    .getOrCreate()
)

In [44]:
# Location of the cleaned CSV. Defaults to the original author's local path but can be
# overridden with the CLEANED_DATA_PATH environment variable so the notebook runs elsewhere.
data_path = os.environ.get('CLEANED_DATA_PATH', '/Users/manoj/desktop/cleaneddata.csv')

# First row is the header; column types are inferred from the data.
dtFrame = spark.read.csv(path=data_path, header=True, inferSchema=True)

In [45]:
# Report the frame's shape as a (rows, columns) tuple.
print(
    (
        dtFrame.count(),
        len(dtFrame.columns),
    )
)

(30162, 10)


In [46]:
# Print each column's name, inferred type and nullability for the loaded frame.
dtFrame.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Sector: string (nullable = true)
 |-- Educational_Qualification: string (nullable = true)
 |-- Marital_Status: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Relationship: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Income: integer (nullable = true)



In [47]:
# describe() returns a DataFrame of summary statistics (count, mean, stddev, ...).
# show() only prints and returns None, so keep the frame itself in dtFrame1 and
# display it as a separate step instead of binding dtFrame1 to None.
dtFrame1 = dtFrame.describe()
dtFrame1.show(5)

+-------+------------------+-----------+-------------------------+--------------+----------------+------------+------------------+------+----------+-------------------+
|summary|               Age|     Sector|Educational_Qualification|Marital_Status|      Occupation|Relationship|              Race|Gender|   Country|             Income|
+-------+------------------+-----------+-------------------------+--------------+----------------+------------+------------------+------+----------+-------------------+
|  count|             30162|      30162|                    30162|         30162|           30162|       30162|             30162| 30162|     30162|              30162|
|   mean|38.437901995888865|       null|                     null|          null|            null|        null|              null|  null|      null|0.24892248524633645|
| stddev|13.134664776856031|       null|                     null|          null|            null|        null|              null|  null|      null| 0.4323

In [48]:
# Correlation coefficient between Age and Income over the whole frame.
(
    dtFrame
    .select(corr('Age', 'Income'))
    .show()
)

+-------------------+
|  corr(Age, Income)|
+-------------------+
|0.24199813626611658|
+-------------------+



In [49]:
# Pack the single predictor column (Income) into the vector column 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['Income'],
)
# transform() appends 'features' alongside the existing columns of dtFrame.
dtFrame2 = assembler.transform(dtFrame)

In [50]:
# Keep only the feature vector and the Age column for modelling.
dtFrame2 = dtFrame2.select('features', 'Age')

In [86]:
# 70/30 train/test split. A fixed seed keeps the split, and every metric computed
# from it further down, the same across kernel restarts and re-runs.
train, test = dtFrame2.randomSplit([0.7, 0.3], seed=42)

In [87]:
# Preview the first five rows of the training split.
train.show(n=5)

+--------+---+
|features|Age|
+--------+---+
|   [0.0]| 17|
|   [0.0]| 17|
|   [0.0]| 17|
|   [0.0]| 17|
|   [0.0]| 17|
+--------+---+
only showing top 5 rows



In [88]:
# Preview the first five rows of the held-out test split.
test.show(n=5)

+--------+---+
|features|Age|
+--------+---+
|   [0.0]| 17|
|   [0.0]| 17|
|   [0.0]| 17|
|   [0.0]| 17|
|   [0.0]| 17|
+--------+---+
only showing top 5 rows



In [89]:
# Linear regression with default hyper-parameters: Age is the label and the
# assembled 'features' vector is the input.
lr = LinearRegression(
    labelCol='Age',
    featuresCol='features',
)
# Fit on the training split only.
lrModel = lr.fit(train)

In [90]:
# Show the fitted intercept and the coefficient array of the model.
print(
    f'Intercept: {lrModel.intercept}\nCoefficient: {lrModel.coefficients.values}'
)

Intercept: 36.690519456106266
Coefficient: [7.23600961]


In [91]:
# Training-set fit metrics taken from the model's summary object.
trainSummary = lrModel.summary
train_rmse, train_r2 = trainSummary.rootMeanSquaredError, trainSummary.r2
print("RMSE: %f" % train_rmse)
print("\nr2: %f" % train_r2)

RMSE: 12.770703

r2: 0.056583


In [92]:
# Import Spark's column-wise abs under an alias so Python's builtin abs() is not
# shadowed for the rest of the kernel session.
from pyspark.sql.functions import abs as spark_abs

# Score the held-out split; transform() appends a 'prediction' column.
predictions = lrModel.transform(test)

# Signed error as a percentage of the actual Age.
x = ((predictions['Age'] - predictions['prediction']) / predictions['Age']) * 100

# NOTE: despite its name, 'Accuracy' holds the absolute percentage error per row,
# so larger values mean a worse prediction.
predictions = predictions.withColumn('Accuracy', spark_abs(x))
predictions.select("prediction", "Age", "Accuracy", "features").show(5)

+------------------+---+------------------+--------+
|        prediction|Age|          Accuracy|features|
+------------------+---+------------------+--------+
|36.690519456106266| 17|115.82658503591921|   [0.0]|
|36.690519456106266| 17|115.82658503591921|   [0.0]|
|36.690519456106266| 17|115.82658503591921|   [0.0]|
|36.690519456106266| 17|115.82658503591921|   [0.0]|
|36.690519456106266| 17|115.82658503591921|   [0.0]|
+------------------+---+------------------+--------+
only showing top 5 rows



In [93]:
from pyspark.ml.evaluation import RegressionEvaluator
# R² of the predictions against the actual Age on the test split.
pred_evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="Age",
    predictionCol="prediction",
)
test_r2 = pred_evaluator.evaluate(predictions)
print("R Squared (R2) on test data = %g" % test_r2)

R Squared (R2) on test data = 0.0630872


In [94]:
# Adjusted R² for the training fit: 1 - (1 - R²) * (n - 1) / (n - p - 1).
# n and p must describe the data and model that produced r2, so they come from the
# training summary and the fitted model rather than from the raw, unsplit frame.
r2 = trainSummary.r2
n = trainSummary.numInstances  # rows the model was fitted on (training split only)
p = lrModel.numFeatures        # predictors in the feature vector, not the raw column count
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
# NOTE(review): recent Spark versions also expose trainSummary.r2adj, which can be used to cross-check.

In [95]:
# Second model: same label and features, with elastic-net regularisation
# and a cap of 50 iterations.
lin_reg = LinearRegression(
    featuresCol='features',
    labelCol='Age',
    maxIter=50,
    regParam=0.12,
    elasticNetParam=0.2,
)
# Fit on the same training split as the unregularised model.
linear_model = lin_reg.fit(train)

In [96]:
# Training RMSE of the regularised model (linear_model), shown as the cell's output.
linear_model.summary.rootMeanSquaredError

12.77078802265797