In [21]:
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("Missing").getOrCreate()

In [22]:
training = spark.read.csv("test5.csv", header=True, inferSchema=True)

training.show()

+----+---+----------+------+
|name|age|experience|salary|
+----+---+----------+------+
|   a| 30|        10| 30000|
|   b| 21|         2| 20000|
|   c| 30|         8| 34000|
|   d| 34|        12| 12000|
|   e| 23|         5| 22000|
|   f| 50|        20| 50000|
+----+---+----------+------+



In [23]:
training.printSchema()


root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [24]:
training.columns

['name', 'age', 'experience', 'salary']

In [29]:
# [Age,Experience]----> new feature ----> independent feature

from pyspark.ml.feature import VectorAssembler

featureAssembler = VectorAssembler(
    inputCols=["age", "experience"], 
    outputCol="Independent Features")

In [32]:
output = featureAssembler.transform(training)

In [35]:
output.show()


+----+---+----------+------+--------------------+
|name|age|experience|salary|Independent Features|
+----+---+----------+------+--------------------+
|   a| 30|        10| 30000|         [30.0,10.0]|
|   b| 21|         2| 20000|          [21.0,2.0]|
|   c| 30|         8| 34000|          [30.0,8.0]|
|   d| 34|        12| 12000|         [34.0,12.0]|
|   e| 23|         5| 22000|          [23.0,5.0]|
|   f| 50|        20| 50000|         [50.0,20.0]|
+----+---+----------+------+--------------------+



In [37]:
output.columns

['name', 'age', 'experience', 'salary', 'Independent Features']

In [38]:
finaldata = output.select("Independent Features", "salary")

In [39]:
finaldata.show()


+--------------------+------+
|Independent Features|salary|
+--------------------+------+
|         [30.0,10.0]| 30000|
|          [21.0,2.0]| 20000|
|          [30.0,8.0]| 34000|
|         [34.0,12.0]| 12000|
|          [23.0,5.0]| 22000|
|         [50.0,20.0]| 50000|
+--------------------+------+



In [41]:
from pyspark.ml.regression import LinearRegression
train_data, test_data = finaldata.randomSplit([0.7, 0.3])

regressor = LinearRegression(
    featuresCol="Independent Features", labelCol="salary")
regressor = regressor.fit(train_data)


In [43]:
regressor.coefficients

DenseVector([5257.1172, -7193.2874])

In [45]:
regressor.intercept


-72093.1974827742

In [47]:
pred_results = regressor.evaluate(test_data)

In [49]:
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|salary|        prediction|
+--------------------+------+------------------+
|          [30.0,8.0]| 34000| 28074.01857956279|
|         [30.0,10.0]| 30000|13687.443811806254|
+--------------------+------+------------------+



In [50]:
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(11119.268804315478, 150608372.59417257)