# PySpark tutorial

Reference video: https://www.youtube.com/watch?v=_C8kWso4ne4&t=10s

In [111]:
# Create (or reuse) the SparkSession used by every cell below

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('ML')
    .getOrCreate()
)
spark

In [112]:
# Load the CSV: the first row supplies column names, and column types are inferred
training = spark.read.csv(
    'pyspark_tuto_files/test1.csv',
    header=True,
    inferSchema=True,
)

In [113]:
# Preview the loaded rows to confirm the header row was used for column names
training.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [114]:
# Check the column types that inferSchema assigned
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [115]:
from pyspark.ml.feature import VectorAssembler

# Pack the numeric predictors (age, Experience) into a single vector column,
# which is the column the regressor later reads through featuresCol.
featureassembler = VectorAssembler(
    inputCols=['age', 'Experience'],
    outputCol="Independent feature",
)

In [116]:
# Append the assembled feature vector as a new column next to the original ones
output = featureassembler.transform(training)

In [117]:
# Inspect the new "Independent feature" vector column
output.show()

+---------+---+----------+------+-------------------+
|     Name|age|Experience|Salary|Independent feature|
+---------+---+----------+------+-------------------+
|    Krish| 31|        10| 30000|        [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|         [30.0,8.0]|
|    Sunny| 29|         4| 20000|         [29.0,4.0]|
|     Paul| 24|         3| 20000|         [24.0,3.0]|
|   Harsha| 21|         1| 15000|         [21.0,1.0]|
|  Shubham| 23|         2| 18000|         [23.0,2.0]|
+---------+---+----------+------+-------------------+



In [118]:
# Keep only the label (Salary) and the assembled feature vector for modelling
final_df = output.select('Salary', 'Independent feature')

In [119]:
# Confirm that only the label and the feature vector remain
final_df.show()

+------+-------------------+
|Salary|Independent feature|
+------+-------------------+
| 30000|        [31.0,10.0]|
| 25000|         [30.0,8.0]|
| 20000|         [29.0,4.0]|
| 20000|         [24.0,3.0]|
| 15000|         [21.0,1.0]|
| 18000|         [23.0,2.0]|
+------+-------------------+



In [120]:
from pyspark.ml.regression import LinearRegression

# Fix the seed so the train/test split, and therefore every coefficient and
# metric in the cells below, is reproducible across kernel restarts.
# randomSplit assigns rows at random according to the weights, so with only
# six rows the partition sizes are not guaranteed to be an exact 75/25 split.
train_data, test_data = final_df.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `regressor` always
# refers to one kind of object: the fitted model that the later cells inspect.
lr_estimator = LinearRegression(featuresCol="Independent feature", labelCol='Salary')
regressor = lr_estimator.fit(train_data)

24/12/06 16:45:17 WARN Instrumentation: [cec388da] regParam is zero, which might cause numerical instability and overfitting.


In [121]:
# Fitted weights, one per assembled feature, in inputCols order (age, Experience)
regressor.coefficients

DenseVector([5000.0, -5000.0])

In [122]:
# Fitted intercept (bias) term of the linear model
regressor.intercept

-84999.9999995608

In [123]:
# Score the held-out rows; the returned summary carries the predictions and
# the error metrics read in the following cells
pred_results = regressor.evaluate(test_data)

In [124]:
# Compare the actual Salary with the model's prediction for each test row
pred_results.predictions.show()

+------+-------------------+------------------+
|Salary|Independent feature|        prediction|
+------+-------------------+------------------+
| 18000|         [23.0,2.0]|19999.999999988155|
| 20000|         [29.0,4.0]|39999.999999911844|
| 30000|        [31.0,10.0]|20000.000000035623|
+------+-------------------+------------------+



In [125]:
# Mean absolute error and mean squared error on the test rows, shown as a tuple
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(10666.666666621459, 167999999.99857134)