In [1]:
from pyspark.sql import SparkSession

# Get the active SparkSession, or start a new one if none exists yet.
# NOTE(review): the app name "Missing" does not describe this regression
# notebook -- presumably carried over from another notebook; confirm before
# renaming, since the name is what shows up in the Spark UI.
spark = (
    SparkSession.builder
    .appName("Missing")
    .getOrCreate()
)

In [2]:
# Read the CSV with the first row used as column names and with column types
# inferred from the data rather than defaulting to strings.
training = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("test5.csv")
)

# Print the loaded rows as a text table for a quick visual check.
training.show()

+----+---+----------+------+
|name|age|experience|salary|
+----+---+----------+------+
|   a| 30|        10| 30000|
|   b| 21|         2| 20000|
|   c| 30|         8| 34000|
|   d| 34|        12| 12000|
|   e| 23|         5| 22000|
|   f| 50|        20| 50000|
+----+---+----------+------+



In [3]:
# Print each column's name, inferred type, and nullability to confirm that
# schema inference produced integer columns for the numeric fields.
training.printSchema()


root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [4]:
# Display the column names as a Python list (the cell's last expression).
training.columns

['name', 'age', 'experience', 'salary']

In [5]:
# Combine the age and experience columns into a single vector column that holds the independent features.

from pyspark.ml.feature import VectorAssembler

# Transformer that packs the listed numeric columns into one vector column.
# The output column name contains a space, so later cells must refer to it
# with the exact string "Independent Features".
featureAssembler = VectorAssembler(
    outputCol="Independent Features",
    inputCols=["age", "experience"],
)

In [6]:
# Apply the assembler: `output` is `training` plus the new
# "Independent Features" vector column; the original columns are kept.
output = featureAssembler.transform(training)

In [7]:
# Print the assembled rows to check that each vector holds [age, experience].
output.show()


+----+---+----------+------+--------------------+
|name|age|experience|salary|Independent Features|
+----+---+----------+------+--------------------+
|   a| 30|        10| 30000|         [30.0,10.0]|
|   b| 21|         2| 20000|          [21.0,2.0]|
|   c| 30|         8| 34000|          [30.0,8.0]|
|   d| 34|        12| 12000|         [34.0,12.0]|
|   e| 23|         5| 22000|          [23.0,5.0]|
|   f| 50|        20| 50000|         [50.0,20.0]|
+----+---+----------+------+--------------------+



In [8]:
# Display the column names after assembly; the vector column is appended last.
output.columns

['name', 'age', 'experience', 'salary', 'Independent Features']

In [9]:
# Keep only the two columns the regression needs.
finaldata = output.select(
    "Independent Features",  # feature vector built from age and experience
    "salary",                # value the model is trained to predict
)

In [10]:
# Print the modelling table: one feature vector and one salary per row.
finaldata.show()


+--------------------+------+
|Independent Features|salary|
+--------------------+------+
|         [30.0,10.0]| 30000|
|          [21.0,2.0]| 20000|
|          [30.0,8.0]| 34000|
|         [34.0,12.0]| 12000|
|          [23.0,5.0]| 22000|
|         [50.0,20.0]| 50000|
+--------------------+------+



In [11]:
from pyspark.ml.regression import LinearRegression

# Split the rows into training and test sets with a fixed seed, so that
# "Restart Kernel -> Run All" reproduces the same partition and therefore the
# same fitted model and metrics. Without a seed, randomSplit draws a new
# random partition on every run.
# The weights are approximate proportions, not exact row counts.
# NOTE(review): the dataset shown above has only six rows, so the training
# split may hold about as few rows as the model has parameters; treat the
# fitted coefficients and error metrics as illustrative only.
train_data, test_data = finaldata.randomSplit([0.7, 0.3], seed=42)

# Keep the unfitted estimator under its own name rather than overwriting it
# with the fitted model, so `regressor` always refers to one kind of object.
lr_estimator = LinearRegression(
    featuresCol="Independent Features", labelCol="salary")

# `regressor` is the fitted model; later cells read its coefficients and
# intercept and call evaluate() on it.
regressor = lr_estimator.fit(train_data)


In [12]:
# Display the fitted coefficients: one value per entry of the feature vector.
regressor.coefficients

DenseVector([14416.6667, -24083.3333])

In [13]:
# Display the fitted intercept (constant term) of the linear model.
regressor.intercept


-189166.66666664983

In [14]:
# Evaluate the fitted model on the held-out test split. Later cells read the
# result's `predictions` DataFrame and its meanAbsoluteError /
# meanSquaredError metrics.
pred_results = regressor.evaluate(test_data)

In [15]:
# Print each test row with its actual salary next to the model's prediction.
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|salary|        prediction|
+--------------------+------+------------------+
|          [21.0,2.0]| 20000| 65416.66666666177|
|          [30.0,8.0]| 34000| 50666.66666666404|
|         [30.0,10.0]| 30000|2500.0000000018917|
+--------------------+------+------------------+





In [16]:
# Display the test-set error metrics as a (MAE, MSE) tuple.
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(29861.11111110797, 1032233796.296084)