## Spark MLlib


In [147]:
## Goal: predict a person's salary from their age and years of experience.

from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one,
# so re-executing this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)
spark

In [148]:
import os

# Location of the training CSV. Set the SPARK_TUTORIAL_CSV environment variable
# to run the notebook on another machine; the original local path stays as the
# default so existing behaviour is unchanged when the variable is not set.
TRAINING_CSV_PATH = os.environ.get(
    "SPARK_TUTORIAL_CSV",
    r'C:\Users\harish\git\Problem_Solving\spark_tutorial.csv',
)

# header=True     -> the first row supplies the column names
# inferSchema=True -> Spark derives column types from the data instead of
#                     reading every column as a string
training = spark.read.csv(TRAINING_CSV_PATH, header=True, inferSchema=True)

training.show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Harry| 27|         2| 25000|
|  Jerry| 27|         4| 40000|
|  param| 65|        40|200000|
|  Laksh| 54|        30| 30000|
|  Babji| 33|        10| 50000|
|   Anni| 32|        10| 55000|
| vishwa|  1|         0|105000|
|karthik| 30|         5|100000|
|   arun| 31|         4| 45000|
+-------+---+----------+------+



In [149]:
# Print the column names and the types chosen by inferSchema, to confirm the
# numeric columns were not read as strings.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [150]:
# Column names of the loaded frame; the feature columns are picked from these.
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [151]:
# Plan: combine the [Age, Experience] columns into one new vector column, which becomes the independent (input) feature for the model.

In [152]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the Age and Experience columns into a single vector
# column named "Independent Features".
featureassembler = VectorAssembler(
    outputCol="Independent Features",
    inputCols=["Age", "Experience"],
)

In [153]:
# Apply the assembler: `output` is `training` plus the assembled vector column.
output = featureassembler.transform(training)

In [154]:
# Display the assembled frame to check that the new vector column was added.
output.show()

+-------+---+----------+------+--------------------+
|   Name|Age|Experience|Salary|Independent Features|
+-------+---+----------+------+--------------------+
|  Harry| 27|         2| 25000|          [27.0,2.0]|
|  Jerry| 27|         4| 40000|          [27.0,4.0]|
|  param| 65|        40|200000|         [65.0,40.0]|
|  Laksh| 54|        30| 30000|         [54.0,30.0]|
|  Babji| 33|        10| 50000|         [33.0,10.0]|
|   Anni| 32|        10| 55000|         [32.0,10.0]|
| vishwa|  1|         0|105000|           [1.0,0.0]|
|karthik| 30|         5|100000|          [30.0,5.0]|
|   arun| 31|         4| 45000|          [31.0,4.0]|
+-------+---+----------+------+--------------------+



In [155]:
# Column names after assembling, including the new feature-vector column.
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Features']

In [156]:
# Narrow the frame to the two columns used for modelling: the assembled
# feature vector and the Salary column.
finalized_data = output.select(
    "Independent Features",
    "Salary",
)

finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [27.0,2.0]| 25000|
|          [27.0,4.0]| 40000|
|         [65.0,40.0]|200000|
|         [54.0,30.0]| 30000|
|         [33.0,10.0]| 50000|
|         [32.0,10.0]| 55000|
|           [1.0,0.0]|105000|
|          [30.0,5.0]|100000|
|          [31.0,4.0]| 45000|
+--------------------+------+



In [185]:
from pyspark.ml.regression import LinearRegression

# Fixed seed so the train/test split, and therefore the fitted coefficients and
# the error metrics further down, are reproducible across runs. Without a seed
# every execution of this cell produces a different split.
# NOTE: randomSplit samples rows by weight, so the split is only approximately
# 75/25; on a dataset this small the test split may hold very few rows.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Keep the unfitted estimator and the fitted model under separate names.
# `regressor` is the fitted model that the following cells read.
lr_estimator = LinearRegression(featuresCol='Independent Features', labelCol="Salary")
regressor = lr_estimator.fit(train_data)

In [186]:
### Coefficients
# Fitted weights of the linear model, one per entry of the feature vector.

regressor.coefficients

DenseVector([-3055.9206, 7244.4369])

In [187]:
### Intercept
# Constant term of the fitted linear model.

regressor.intercept

102523.98733378021

In [188]:
### Prediction
# Evaluate the fitted model on the held-out split. The returned summary exposes
# the per-row predictions and the error metrics read in the cells below.
pred_results = regressor.evaluate(test_data)

In [189]:
# Show the actual Salary next to the model's prediction for each test row.
pred_results.predictions.show()

+--------------------+------+-----------------+
|Independent Features|Salary|       prediction|
+--------------------+------+-----------------+
|          [31.0,4.0]| 45000|36768.19570301179|
|         [54.0,30.0]| 30000|154837.3819734943|
+--------------------+------+-----------------+



In [190]:
# NOTE(review): this cell repeats the previous cell verbatim and shows the same
# table; it is a candidate for removal.
pred_results.predictions.show()

+--------------------+------+-----------------+
|Independent Features|Salary|       prediction|
+--------------------+------+-----------------+
|          [31.0,4.0]| 45000|36768.19570301179|
|         [54.0,30.0]| 30000|154837.3819734943|
+--------------------+------+-----------------+



In [191]:
# Error on the held-out split, displayed as a (mean absolute error, mean squared error) tuple.
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(66534.59313524126, 7826067269.990018)