## Examples of PySpark ML

In [1]:
# Use Spark ML to estimate the salary a person should get,
# based on their age and experience.
from pyspark.sql import SparkSession

# Reuse an active session if one exists, otherwise start a new one.
# NOTE(review): the app name 'missing' looks carried over from another
# notebook — confirm whether it should describe this example instead.
spark = (
    SparkSession.builder
    .appName('missing')
    .getOrCreate()
)

In [2]:
## Read the dataset
# The first CSV row supplies the column names, and Spark infers each
# column's type from the data instead of reading everything as strings.
training = spark.read.csv(
    'test4.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Preview the loaded rows to confirm the CSV was parsed as expected.
training.show()

+-----+---+----------+------+
| name|age|experience|salary|
+-----+---+----------+------+
|  ola| 25|         8| 10000|
|kasia| 44|        10| 12000|
| kuba| 55|         7|  5000|
| bolo| 28|         5|  6000|
| asdd| 33|         5|  4000|
+-----+---+----------+------+



In [4]:
# Check the inferred column types: the numeric columns must not be
# strings, since they are fed to the model as features and label.
training.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [5]:
# List the available column names before choosing the model inputs.
training.columns

['name', 'age', 'experience', 'salary']

The `age` and `experience` columns are combined into a single new vector column, which serves as the independent features for the model.

In [6]:
from pyspark.ml.feature import VectorAssembler

# Pack the two predictor columns into one vector column; the regression
# below reads its features from that single column.
featureassembler = VectorAssembler(
    inputCols=['age', 'experience'],
    outputCol='Independent Features',
)

In [7]:
# Apply the assembler: returns the input rows with the extra
# 'Independent Features' vector column appended.
output=featureassembler.transform(training)

In [8]:
# Inspect the assembled vectors next to the original columns.
output.show()

+-----+---+----------+------+--------------------+
| name|age|experience|salary|Independent Features|
+-----+---+----------+------+--------------------+
|  ola| 25|         8| 10000|          [25.0,8.0]|
|kasia| 44|        10| 12000|         [44.0,10.0]|
| kuba| 55|         7|  5000|          [55.0,7.0]|
| bolo| 28|         5|  6000|          [28.0,5.0]|
| asdd| 33|         5|  4000|          [33.0,5.0]|
+-----+---+----------+------+--------------------+



In [12]:
# Keep only the columns the regression uses: the feature vector and
# the 'salary' label.
finalized_data=output.select('Independent Features','salary')

In [13]:
# Confirm the modelling frame holds just the features and the label.
finalized_data.show()

+--------------------+------+
|Independent Features|salary|
+--------------------+------+
|          [25.0,8.0]| 10000|
|         [44.0,10.0]| 12000|
|          [55.0,7.0]|  5000|
|          [28.0,5.0]|  6000|
|          [33.0,5.0]|  4000|
+--------------------+------+



In [15]:
from pyspark.ml.regression import LinearRegression

## Train/test split
# A fixed seed keeps the random 75/25 split stable across re-runs on the
# same data, so the fitted coefficients and the error metrics computed
# in later cells are reproducible.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the untrained estimator under its own name. `regressor` is bound
# only to the fitted model, which is what the following cells query
# (coefficients, intercept, evaluate).
lr_estimator = LinearRegression(
    featuresCol='Independent Features',
    labelCol='salary',
)
regressor = lr_estimator.fit(train_data)

In [16]:
### Coefficients
# Fitted weights of the linear model, one per input feature
# (age, experience).
regressor.coefficients

DenseVector([-101.2658, 1962.0253])

In [17]:
### Intercept
# Constant term of the fitted linear model.
regressor.intercept

-3164.5569620250067

In [18]:
### Prediction
# Evaluate the fitted model on the held-out test split; the result
# carries the per-row predictions and the error metrics read below.
pred_results=regressor.evaluate(test_data)

In [19]:
# Show each test row's actual salary next to the model's prediction.
pred_results.predictions.show()

+--------------------+------+-----------------+
|Independent Features|salary|       prediction|
+--------------------+------+-----------------+
|          [28.0,5.0]|  6000|3810.126582278602|
|          [33.0,5.0]|  4000|3303.797468354546|
+--------------------+------+-----------------+



In [20]:
# Re-display the full dataset — presumably for comparing the predicted
# rows above against the original records.
training.show()

+-----+---+----------+------+
| name|age|experience|salary|
+-----+---+----------+------+
|  ola| 25|         8| 10000|
|kasia| 44|        10| 12000|
| kuba| 55|         7|  5000|
| bolo| 28|         5|  6000|
| asdd| 33|         5|  4000|
+-----+---+----------+------+



In [21]:
# Error metrics on the test split, displayed as a tuple:
# (mean absolute error, mean squared error).
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(1443.037974683426, 2640121.775356168)