### PySpark Machine Learning


In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName('missing').getOrCreate()

In [3]:
## read dataset
training = spark.read.csv('mockdataML.csv',header=True,inferSchema=True)

In [4]:
training.show()

+----------+--------------------+---+------+----------+
|      name|         departament|age|salary|experience|
+----------+--------------------+---+------+----------+
|     Karyl|           Marketing| 42|  1641|        22|
|   Dee dee|          Accounting| 42|  1282|        22|
|   Chrisse|             Support| 38|  1053|        18|
|     Prudi|         Engineering| 38|  1537|        18|
|     Manon|            Training| 37|  1967|        17|
|   Jillian|            Training| 31|  1855|        11|
|  Merrilee|  Product Management| 26|  1372|         6|
|   Tamqrah|  Product Management| 28|  1345|         8|
|    Bronny|  Product Management| 42|  1959|        22|
|    Connie|          Accounting| 44|  1266|        24|
|  Packston|  Product Management| 31|  1638|        11|
|    Catlin|          Accounting| 36|  1011|        16|
|    Marris|             Support| 23|  1963|         3|
|  Winfield|           Marketing| 35|  1567|        15|
|    Shaina|               Legal| 32|  1129|    

In [5]:
training.printSchema()

root
 |-- name: string (nullable = true)
 |-- departament: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [10]:
training.columns

['name', 'departament', 'age', 'salary', 'experience']

In [12]:
from pyspark.ml.feature import VectorAssembler

In [18]:
featureassemble=VectorAssembler(inputCols=['age', 'experience'],outputCol='independent')

In [19]:
output=featureassemble.transform(training)

In [20]:
output.show()

+----------+--------------------+---+------+----------+-----------+
|      name|         departament|age|salary|experience|independent|
+----------+--------------------+---+------+----------+-----------+
|     Karyl|           Marketing| 42|  1641|        22|[42.0,22.0]|
|   Dee dee|          Accounting| 42|  1282|        22|[42.0,22.0]|
|   Chrisse|             Support| 38|  1053|        18|[38.0,18.0]|
|     Prudi|         Engineering| 38|  1537|        18|[38.0,18.0]|
|     Manon|            Training| 37|  1967|        17|[37.0,17.0]|
|   Jillian|            Training| 31|  1855|        11|[31.0,11.0]|
|  Merrilee|  Product Management| 26|  1372|         6| [26.0,6.0]|
|   Tamqrah|  Product Management| 28|  1345|         8| [28.0,8.0]|
|    Bronny|  Product Management| 42|  1959|        22|[42.0,22.0]|
|    Connie|          Accounting| 44|  1266|        24|[44.0,24.0]|
|  Packston|  Product Management| 31|  1638|        11|[31.0,11.0]|
|    Catlin|          Accounting| 36|  1011|    

In [21]:
output.columns

['name', 'departament', 'age', 'salary', 'experience', 'independent']

In [22]:
finalized_data=output.select('independent','salary')

In [23]:
finalized_data.show()

+-----------+------+
|independent|salary|
+-----------+------+
|[42.0,22.0]|  1641|
|[42.0,22.0]|  1282|
|[38.0,18.0]|  1053|
|[38.0,18.0]|  1537|
|[37.0,17.0]|  1967|
|[31.0,11.0]|  1855|
| [26.0,6.0]|  1372|
| [28.0,8.0]|  1345|
|[42.0,22.0]|  1959|
|[44.0,24.0]|  1266|
|[31.0,11.0]|  1638|
|[36.0,16.0]|  1011|
| [23.0,3.0]|  1963|
|[35.0,15.0]|  1567|
|[32.0,12.0]|  1129|
|[40.0,20.0]|  1559|
|[42.0,22.0]|  1715|
| [20.0,0.0]|  1810|
|[32.0,12.0]|  1563|
| [24.0,4.0]|  1187|
+-----------+------+
only showing top 20 rows



In [24]:
from pyspark.ml.regression import LinearRegression
train_data,test_data = finalized_data.randomSplit([0.75,0.25])
regressor=LinearRegression(featuresCol='independent', labelCol='salary')
regressor=regressor.fit(train_data)

In [25]:
regressor.coefficients

DenseVector([12.5984, -12.6682])

In [26]:
regressor.intercept

1211.9884779542056

In [27]:
pred_results=regressor.evaluate(test_data)

In [28]:
pred_results.predictions.show()

+-----------+------+------------------+
|independent|salary|        prediction|
+-----------+------+------------------+
| [21.0,1.0]|  1557|1463.8861806215873|
| [21.0,1.0]|  1960|1463.8861806215873|
| [23.0,3.0]|  1078|1463.7466236848413|
| [23.0,3.0]|  1963|1463.7466236848413|
| [24.0,4.0]|  1620| 1463.676845216468|
| [26.0,6.0]|  1410|1463.5372882797221|
| [26.0,6.0]|  1518|1463.5372882797221|
| [26.0,6.0]|  1676|1463.5372882797221|
| [27.0,7.0]|  1130| 1463.467509811349|
| [27.0,7.0]|  1365| 1463.467509811349|
| [27.0,7.0]|  1884| 1463.467509811349|
|[31.0,11.0]|  1638|1463.1883959378567|
|[31.0,11.0]|  1648|1463.1883959378567|
|[31.0,11.0]|  1855|1463.1883959378567|
|[33.0,13.0]|  1230|1463.0488390011105|
|[35.0,15.0]|  1567|1462.9092820643646|
|[40.0,20.0]|  1955|1462.5603897224992|
|[41.0,21.0]|  1289| 1462.490611254126|
|[42.0,22.0]|  1534| 1462.420832785753|
|[44.0,24.0]|  1377|1462.2812758490068|
+-----------+------+------------------+



In [29]:
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(235.74230243950205, 79718.84481676863)