# Linear Regression with PySparkML

In [2]:
import findspark
findspark.init()

import pyspark
import pyspark.sql as sql
ss = sql.SparkSession.builder.appName('ML').getOrCreate()


In [3]:
dataset = ss.read.csv('/home/garvit/BigData Training Stuff/PySpark/practice/Salary_Data.csv', inferSchema= True, header=True)

In [4]:
dataset.columns

['YearsExperience', 'Salary']

In [5]:
dataset.printSchema()

root
 |-- YearsExperience: double (nullable = true)
 |-- Salary: double (nullable = true)



In [6]:
dataset.select('*').show()

+---------------+-------+
|YearsExperience| Salary|
+---------------+-------+
|            1.1|39343.0|
|            1.3|46205.0|
|            1.5|37731.0|
|            2.0|43525.0|
|            2.2|39891.0|
|            2.9|56642.0|
|            3.0|60150.0|
|            3.2|54445.0|
|            3.2|64445.0|
|            3.7|57189.0|
|            3.9|63218.0|
|            4.0|55794.0|
|            4.0|56957.0|
|            4.1|57081.0|
|            4.5|61111.0|
|            4.9|67938.0|
|            5.1|66029.0|
|            5.3|83088.0|
|            5.9|81363.0|
|            6.0|93940.0|
+---------------+-------+
only showing top 20 rows



In [7]:
df2_pandas = dataset.toPandas()

In [8]:
df2_pandas.head(5)

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [9]:
df3 = ss.createDataFrame(df2_pandas)

In [10]:
df2_pandas.describe()

Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
25%,3.2,56720.75
50%,4.7,65237.0
75%,7.7,100544.75
max,10.5,122391.0


In [11]:
df2_pandas[df2_pandas.isnull()].count()

YearsExperience    0
Salary             0
dtype: int64

In [None]:
df3.describe()

In [26]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as fun

### measuring correlation btw "dependent" and "independent" variables

In [27]:
df3.select(fun.corr('YearsExperience','Salary')).show()

+-----------------------------+
|corr(YearsExperience, Salary)|
+-----------------------------+
|           0.9782416184887598|
+-----------------------------+



In [28]:
df3.columns

['YearsExperience', 'Salary']

### converting columns of independent variables into features

In [39]:
assembler = VectorAssembler(inputCols=['YearsExperience'], outputCol='features')

In [40]:
features_df = assembler.transform(df3)

In [41]:
features_df.show(5)

+---------------+-------+--------+
|YearsExperience| Salary|features|
+---------------+-------+--------+
|            1.1|39343.0|   [1.1]|
|            1.3|46205.0|   [1.3]|
|            1.5|37731.0|   [1.5]|
|            2.0|43525.0|   [2.0]|
|            2.2|39891.0|   [2.2]|
+---------------+-------+--------+
only showing top 5 rows



##### -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [42]:
features_df.printSchema()

root
 |-- YearsExperience: double (nullable = true)
 |-- Salary: double (nullable = true)
 |-- features: vector (nullable = true)



In [48]:
features_df.select('features','Salary').show(5)

+--------+-------+
|features| Salary|
+--------+-------+
|   [1.1]|39343.0|
|   [1.3]|46205.0|
|   [1.5]|37731.0|
|   [2.0]|43525.0|
|   [2.2]|39891.0|
+--------+-------+
only showing top 5 rows



In [49]:
model_df = features_df.select('features','Salary')

In [56]:
print([model_df.count(),len(model_df.columns)])

[30, 2]


### Test & Train split

In [57]:
train, test = model_df.randomSplit([0.7,0.3])

### Build and Train Linear Regression Model

In [99]:
from pyspark.ml.regression import LinearRegression
regressor = LinearRegression(labelCol='Salary')

### predicting train data 

In [110]:
lr_model_train = regressor.fit(train)
result_train = lr_model_train.transform(train)
result_train.show()

+--------+--------+------------------+
|features|  Salary|        prediction|
+--------+--------+------------------+
|   [1.3]| 46205.0|37466.574534332656|
|   [1.5]| 37731.0| 39392.00542819356|
|   [2.2]| 39891.0|46131.013556706734|
|   [2.9]| 56642.0|  52870.0216852199|
|   [3.2]| 54445.0| 55758.16802601126|
|   [3.2]| 64445.0| 55758.16802601126|
|   [4.0]| 55794.0| 63459.89160145488|
|   [4.0]| 56957.0| 63459.89160145488|
|   [4.9]| 67938.0| 72124.33062382895|
|   [5.9]| 81363.0|  81751.4850931335|
|   [7.9]|101302.0|101005.79403174254|
|   [8.2]|113812.0|103893.94037253389|
|   [8.7]|109431.0|108707.51760718615|
|   [9.5]|116969.0|116409.24118262978|
|   [9.6]|112635.0|117371.95662956023|
+--------+--------+------------------+



In [127]:
evaluation_train = lr_model_train.evaluate(train)
evaluation_train.r2

0.9614190328800436

### predicting test data 

In [123]:
lr_model_test = regressor.fit(test)
result_test = lr_model_test.transform(test)
result_test.show()

+--------+--------+------------------+
|features|  Salary|        prediction|
+--------+--------+------------------+
|   [1.1]| 39343.0|36947.681478251536|
|   [2.0]| 43525.0| 45282.78466555964|
|   [3.0]| 60150.0|54544.010429235306|
|   [3.7]| 57189.0| 61026.86846380828|
|   [3.9]| 63218.0| 62879.11361654341|
|   [4.1]| 57081.0| 64731.35876927854|
|   [4.5]| 61111.0| 68435.84907474881|
|   [5.1]| 66029.0|  73992.5845329542|
|   [5.3]| 83088.0| 75844.82968568934|
|   [6.0]| 93940.0| 82327.68772026231|
|   [6.8]| 91738.0| 89736.66833120285|
|   [7.1]| 98273.0| 92515.03606030555|
|   [9.0]|105582.0|110111.36501128932|
|  [10.3]|122391.0| 122150.9585040677|
|  [10.5]|121872.0|124003.20365680283|
+--------+--------+------------------+



In [126]:
evaluation_test = lr_model_test.evaluate(test)
evaluation_test.r2

0.9522252158415931