In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start a Spark session for this notebook, or reuse one that is already running.
# NOTE(review): the app name 'Missing' looks carried over from another
# notebook -- confirm whether it should describe this regression example.
spark = (
    SparkSession.builder
    .appName('Missing')
    .getOrCreate()
)

In [4]:
import os

# Location of the training CSV. Set the BOOK2_CSV environment variable to run
# the notebook on another machine; when it is unset, the original author's
# local path is used so existing behaviour is unchanged.
BOOK2_CSV = os.environ.get(
    "BOOK2_CSV",
    r"C:\Users\JEYA KUMAR R\Downloads\DataSet\Book2.csv",
)

# header=True takes column names from the first row; inferSchema=True lets
# Spark infer the column types instead of reading everything as strings.
training = spark.read.csv(BOOK2_CSV, header=True, inferSchema=True)

In [5]:
# Display the loaded rows as a text table.
training.show()

+------+---+---+------+
|  Name|Age|Exp|Salary|
+------+---+---+------+
|  Jeya| 43| 10| 30000|
| Kumar| 44|  8| 25000|
|  Raja| 21|  3| 10000|
|Kannan| 33|  4| 15000|
|  John| 56|  1| 22000|
| peter| 32|  2| 29000|
|kamesh| 33|  2| 22200|
+------+---+---+------+



In [6]:
# Print each column's name, inferred type and nullability.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Exp: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [7]:
# List the column names of the loaded DataFrame.
training.columns

['Name', 'Age', 'Exp', 'Salary']

In [9]:
# Combine the Age and Exp columns into a single new vector column holding the independent features.

In [10]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the Age and Exp columns, in that order, into one vector
# column named "Independent Features".
featureassembler = VectorAssembler(
    inputCols=["Age", "Exp"],
    outputCol="Independent Features",
)

In [11]:
# Apply the assembler: returns `training` with the extra "Independent Features" vector column.
output = featureassembler.transform(training)

In [12]:
# Display the original columns alongside the newly assembled vector column.
output.show()

+------+---+---+------+--------------------+
|  Name|Age|Exp|Salary|Independent Features|
+------+---+---+------+--------------------+
|  Jeya| 43| 10| 30000|         [43.0,10.0]|
| Kumar| 44|  8| 25000|          [44.0,8.0]|
|  Raja| 21|  3| 10000|          [21.0,3.0]|
|Kannan| 33|  4| 15000|          [33.0,4.0]|
|  John| 56|  1| 22000|          [56.0,1.0]|
| peter| 32|  2| 29000|          [32.0,2.0]|
|kamesh| 33|  2| 22200|          [33.0,2.0]|
+------+---+---+------+--------------------+



In [13]:
# List the column names after assembly.
output.columns

['Name', 'Age', 'Exp', 'Salary', 'Independent Features']

In [14]:
# Keep only the two columns used for modelling: the assembled feature vector
# and the Salary column.
finalized_data = output.select(
    "Independent Features",
    "Salary",
)

In [15]:
# Display the feature-vector / Salary pairs.
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [43.0,10.0]| 30000|
|          [44.0,8.0]| 25000|
|          [21.0,3.0]| 10000|
|          [33.0,4.0]| 15000|
|          [56.0,1.0]| 22000|
|          [32.0,2.0]| 29000|
|          [33.0,2.0]| 22200|
+--------------------+------+



In [17]:
from pyspark.ml.regression import LinearRegression

# Fixed seed so the train/test split -- and with it the fitted coefficients and
# the error metrics -- is repeatable across Restart & Run All.
# NOTE: randomSplit samples rows at random rather than cutting exact
# fractions, so with a dataset this small the split sizes depend on the seed.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Keep the unfitted estimator and the fitted model under separate names.
# `regression` remains the fitted model because the following cells read
# regression.coefficients, regression.intercept and regression.evaluate(...).
lr_estimator = LinearRegression(featuresCol='Independent Features', labelCol='Salary')
regression = lr_estimator.fit(train_data)

In [18]:
# Coefficients of the fitted model, one per entry of the feature vector (Age, then Exp).

regression.coefficients

DenseVector([388.2212, 1168.7067])

In [19]:
# Intercept (constant term) of the fitted model.

regression.intercept

723.3443370044564

In [20]:
# Evaluate the fitted model on the held-out test split. The returned summary
# exposes the per-row predictions (.predictions) and the error metrics
# (.meanAbsoluteError, .meanSquaredError) read in the following cells.

pred_results = regression.evaluate(test_data)

In [22]:
# Show each test row's actual Salary next to the model's prediction.
pred_results.predictions.show()



+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [32.0,2.0]| 29000|15483.836288744122|
|          [44.0,8.0]| 25000|  27154.7310917005|
+--------------------+------+------------------+



In [23]:
# Mean absolute error and mean squared error on the test split, displayed as a (MAE, MSE) tuple.
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(7835.44740147819, 93664773.77350557)