# Part 5: Beginning MLlib and data science
## The following will be covered:
* Linear Regression with `pyspark.ml`: predicting `Salary` from `Age` and `Experience`

In [1]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook (or pick up an existing one),
# registered under the application name 'Linear'.
spark = (
    SparkSession.builder
    .appName('Linear')
    .getOrCreate()
)

In [2]:
# Load the CSV: the first row supplies the column names and Spark infers
# the column types instead of reading everything as strings.
training = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('pysparktest.csv')
)

In [4]:
# Print the loaded rows as a text table to sanity-check the import.
training.show()

+------+---+------------+----------+------+
|  Name|Age|  Department|Experience|Salary|
+------+---+------------+----------+------+
|Jayden| 23|Data Science|         5|102000|
| Shawn| 27|   Insurance|         7| 65000|
|   Bob| 50|   Insurance|         4| 12345|
|Jeremy| 40|Data Science|         8| 69000|
|Joseph| 23|     Medical|         4| 95000|
|  Mary| 24|Data Science|         8|105000|
|  Jake| 38|Data Science|         7|100000|
| Larry| 34|     Medical|         2| 40000|
|  Adam| 41|         IOT|         5| 75000|
+------+---+------------+----------+------+



In [7]:
# Confirm the inferred column types: Age, Experience and Salary must be numeric
# to be usable as features / label.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [8]:
# List the column names available for selecting features and the label.
training.columns

['Name', 'Age', 'Department', 'Experience', 'Salary']

In [None]:
# Input columns that will be combined into the feature vector. The names are
# quoted so this cell evaluates to a list of column names; a bare `Age` is an
# undefined Python name and raises NameError on a fresh kernel.
["Age", "Experience"]

In [11]:
from pyspark.ml.feature import VectorAssembler

# Combine the numeric predictor columns into a single vector column.
featureAssembler = VectorAssembler(
    inputCols=["Age", "Experience"],
    outputCol="Independent Features",
)

In [12]:
# Apply the assembler: `output` is `training` plus the "Independent Features" vector column.
output = featureAssembler.transform(training)

In [14]:
# Inspect the assembled frame; each feature vector should read [Age, Experience].
output.show()

+------+---+------------+----------+------+--------------------+
|  Name|Age|  Department|Experience|Salary|Independent Features|
+------+---+------------+----------+------+--------------------+
|Jayden| 23|Data Science|         5|102000|          [23.0,5.0]|
| Shawn| 27|   Insurance|         7| 65000|          [27.0,7.0]|
|   Bob| 50|   Insurance|         4| 12345|          [50.0,4.0]|
|Jeremy| 40|Data Science|         8| 69000|          [40.0,8.0]|
|Joseph| 23|     Medical|         4| 95000|          [23.0,4.0]|
|  Mary| 24|Data Science|         8|105000|          [24.0,8.0]|
|  Jake| 38|Data Science|         7|100000|          [38.0,7.0]|
| Larry| 34|     Medical|         2| 40000|          [34.0,2.0]|
|  Adam| 41|         IOT|         5| 75000|          [41.0,5.0]|
+------+---+------------+----------+------+--------------------+



In [15]:
# Keep only what the regression needs: the feature vector and the label column.
finalized_data = output.select(
    "Independent Features",
    "Salary",
)

In [17]:
# Show the two-column modelling frame (features, label).
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [23.0,5.0]|102000|
|          [27.0,7.0]| 65000|
|          [50.0,4.0]| 12345|
|          [40.0,8.0]| 69000|
|          [23.0,4.0]| 95000|
|          [24.0,8.0]|105000|
|          [38.0,7.0]|100000|
|          [34.0,2.0]| 40000|
|          [41.0,5.0]| 75000|
+--------------------+------+



In [18]:
from pyspark.ml.regression import LinearRegression

# Train/test split. randomSplit assigns rows at random, so pin the seed:
# without it every run produces a different partition, and on this 9-row
# dataset that means visibly different coefficients and error metrics.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name, so that re-running the cell
# (or reading it) never confuses the estimator with the fitted model.
linear_estimator = LinearRegression(featuresCol='Independent Features', labelCol='Salary')

# `regressor` is the fitted model that the following cells inspect and evaluate.
regressor = linear_estimator.fit(train_data)

In [19]:
# Fitted coefficients, one per input feature in assembler order (Age, Experience).
regressor.coefficients

DenseVector([-2348.3176, 1343.2728])

In [20]:
# Fitted intercept (a single scalar).
regressor.intercept

142178.96504903713

In [21]:
# Evaluate the fitted model on the held-out rows; the result exposes both the
# per-row predictions and the aggregate error metrics used in the cells below.
pred_results = regressor.evaluate(test_data)

In [22]:
# Compare the actual Salary with the model's prediction for each test row.
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [24.0,8.0]|105000| 96565.52465517903|
|          [34.0,2.0]| 40000| 65022.71197202377|
|          [38.0,7.0]|100000|62345.805396006646|
+--------------------+------+------------------+



In [23]:
# Test-set error metrics, displayed as a (mean absolute error, mean squared error) pair.
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(23703.793973612697, 705038286.6842188)