<a href="https://colab.research.google.com/github/JarekMaleszyk/data-science-project-sandbox/blob/main/pyspark_test3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
# Name under which this notebook's Spark application is registered
# (handed to SparkSession.builder.appName when the session is created).
APP_NAME: str = "ML Practice"

In [27]:
try:
  import pyspark
except:
  !pip install pyspark -q
  import pyspark
finally:
  from pyspark.sql import SparkSession
  spark = SparkSession. \
      builder. \
      appName(APP_NAME). \
      getOrCreate()

### Check Spark version

In [28]:
# Bare expression: the notebook renders the active session object as this
# cell's output, which is what the heading above relies on.
spark

### DataFrame API

In [29]:
# Load the sample employee file into a Spark DataFrame and print its rows.
df_pyspark = spark.read.csv(
    path='/content/simple_data2.csv',
    sep=';',            # fields are semicolon-delimited
    header=True,        # first row supplies the column names
    inferSchema=True,   # let Spark derive column types from the data
)
df_pyspark.show()

+-----+---+----------+-------+----------+
| name|age|experience| salary|department|
+-----+---+----------+-------+----------+
| Kris| 31|         6| 2230.3|        IT|
| Adam| 30|         5|2230.89|        HR|
|Sunny| 39|        14|3230.21|        HR|
|  Tom| 28|         5|1890.32|        IT|
| John| 40|        18| 3400.0|  HelpDesk|
| Mark| 36|        12|4730.04|        IT|
|  Bob| 35|         7|1930.05|        IT|
|Ellen| 34|         7|2230.96|  HelpDesk|
|  Jim| 46|        21|6230.99|        IT|
| Paul| 39|        11| 2710.5|        IT|
|  Tom| 38|        17|4280.45|        IT|
+-----+---+----------+-------+----------+



## VectorAssembler - a feature transformer that merges multiple columns into a vector column.

In [30]:
from pyspark.ml.feature import VectorAssembler
# Pack the two numeric predictors into a single vector column, which is the
# input the regression cell below reads as its features column.
feature_assembler = VectorAssembler(
    inputCols=['age', 'experience'],
    outputCol='independent_feature',
)

# transform() returns a new DataFrame with the vector column appended;
# df_pyspark itself is left untouched.
output = feature_assembler.transform(df_pyspark)
output.show()

+-----+---+----------+-------+----------+-------------------+
| name|age|experience| salary|department|independent_feature|
+-----+---+----------+-------+----------+-------------------+
| Kris| 31|         6| 2230.3|        IT|         [31.0,6.0]|
| Adam| 30|         5|2230.89|        HR|         [30.0,5.0]|
|Sunny| 39|        14|3230.21|        HR|        [39.0,14.0]|
|  Tom| 28|         5|1890.32|        IT|         [28.0,5.0]|
| John| 40|        18| 3400.0|  HelpDesk|        [40.0,18.0]|
| Mark| 36|        12|4730.04|        IT|        [36.0,12.0]|
|  Bob| 35|         7|1930.05|        IT|         [35.0,7.0]|
|Ellen| 34|         7|2230.96|  HelpDesk|         [34.0,7.0]|
|  Jim| 46|        21|6230.99|        IT|        [46.0,21.0]|
| Paul| 39|        11| 2710.5|        IT|        [39.0,11.0]|
|  Tom| 38|        17|4280.45|        IT|        [38.0,17.0]|
+-----+---+----------+-------+----------+-------------------+



In [31]:
# Keep only what the model needs: the assembled feature vector and the
# target column (salary).
final_data = output.select('independent_feature', 'salary')
final_data.show()

+-------------------+-------+
|independent_feature| salary|
+-------------------+-------+
|         [31.0,6.0]| 2230.3|
|         [30.0,5.0]|2230.89|
|        [39.0,14.0]|3230.21|
|         [28.0,5.0]|1890.32|
|        [40.0,18.0]| 3400.0|
|        [36.0,12.0]|4730.04|
|         [35.0,7.0]|1930.05|
|         [34.0,7.0]|2230.96|
|        [46.0,21.0]|6230.99|
|        [39.0,11.0]| 2710.5|
|        [38.0,17.0]|4280.45|
+-------------------+-------+



### Linear Regression

In [32]:
from pyspark.ml.regression import LinearRegression
# Hold out roughly 25% of the rows for evaluation. randomSplit assigns rows at
# random, so the weights are target proportions rather than exact counts; the
# fixed seed makes the split (and every metric computed below) repeatable
# across kernel restarts.
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator and the fitted model under separate names.
# `regressor` stays bound to the fitted model because later cells read
# its coefficients, intercept and evaluation summary.
lr_estimator = LinearRegression(featuresCol='independent_feature', labelCol='salary')
regressor = lr_estimator.fit(train_data)

In [33]:
# Fitted weights of the linear model: one entry per component of the
# `independent_feature` vector (assembled from age and experience).
regressor.coefficients

DenseVector([45.1422, 181.5727])

In [34]:
# Intercept (constant term) of the fitted linear model.
regressor.intercept

-617.1880700473035

In [35]:
# Score the held-out rows. evaluate() returns a summary object that exposes
# the per-row predictions as well as the aggregate error metrics read in the
# next cell.
pred_data = regressor.evaluate(test_data)
pred_data.predictions.show()

+-------------------+-------+------------------+
|independent_feature| salary|        prediction|
+-------------------+-------+------------------+
|         [30.0,5.0]|2230.89|1644.9419675018216|
|        [36.0,12.0]|4730.04|3186.8042779291522|
|        [39.0,14.0]|3230.21|3685.3763574134414|
+-------------------+-------+------------------+



In [37]:
# Test-set error metrics, displayed as a tuple: (mean absolute error, mean squared error).
pred_data.meanAbsoluteError, pred_data.meanSquaredError

(861.4500373274891, 977362.6678616791)

In [41]:
# Persist the fitted model. A plain save() raises when the target path already
# exists, so re-running the notebook would fail on this cell; going through the
# writer with overwrite() replaces the previous copy instead.
regressor.write().overwrite().save('linear_regression_model')