In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every DataFrame operation in this notebook;
# getOrCreate() hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName('missing')
    .getOrCreate()
)

In [2]:
# Echo the session object as the cell's last expression so the notebook renders it.
spark

In [3]:
# Read the CSV into a DataFrame: the first row supplies the column names and
# Spark infers each column's type from the data instead of defaulting to strings.
csv_reader = spark.read.options(header=True, inferSchema=True)
training = csv_reader.csv('test1.csv')

In [4]:
# Print the loaded rows as a text table to sanity-check the CSV load.
training.show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|Frank| 23|         1|   900|
| Kyle|  1|         3|   400|
| Jeff| 22|         4|   200|
| Fred| 43|         3|   600|
|Karry| 11|         1|   100|
|  Min| 23|         2|   300|
+-----+---+----------+------+



In [5]:
# Print each column's inferred data type and nullability.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [6]:
# List the column names available for feature selection.
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [9]:
from pyspark.ml.feature import VectorAssembler

# Combine the two predictor columns into one vector-valued column; that single
# column is what the regression cell further down is pointed at via featuresCol.
featureassembler = VectorAssembler(
    inputCols=["Age", "Experience"],
    outputCol="Independent Features",
)

In [10]:
# Apply the assembler: the result keeps the original columns and gains the
# "Independent Features" vector column.
output = featureassembler.transform(training)

In [11]:
# Show the assembled rows, including the new feature-vector column.
output.show()

+-----+---+----------+------+--------------------+
| Name|Age|Experience|Salary|Independent Features|
+-----+---+----------+------+--------------------+
|Frank| 23|         1|   900|          [23.0,1.0]|
| Kyle|  1|         3|   400|           [1.0,3.0]|
| Jeff| 22|         4|   200|          [22.0,4.0]|
| Fred| 43|         3|   600|          [43.0,3.0]|
|Karry| 11|         1|   100|          [11.0,1.0]|
|  Min| 23|         2|   300|          [23.0,2.0]|
+-----+---+----------+------+--------------------+



In [12]:
# List the column names after assembling the feature vector.
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Features']

In [13]:
# Keep only what the model needs: the feature vector and the Salary label.
finalized_data = output.select(
    "Independent Features",
    "Salary",
)

In [14]:
# Show the two-column modelling frame (features, label).
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [23.0,1.0]|   900|
|           [1.0,3.0]|   400|
|          [22.0,4.0]|   200|
|          [43.0,3.0]|   600|
|          [11.0,1.0]|   100|
|          [23.0,2.0]|   300|
+--------------------+------+



In [15]:
from pyspark.ml.regression import LinearRegression

In [16]:
# Hold out roughly a quarter of the rows for evaluation. randomSplit assigns
# rows at random, so the seed is pinned to keep the partition -- and the
# coefficients, intercept and predictions derived from it -- repeatable across
# Restart & Run All. Outputs recorded from an earlier unseeded run may differ.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Configure the estimator on the assembled feature vector and the Salary label,
# then fit it on the training split. `regressor` is deliberately rebound to the
# fitted model because the following cells read regressor.coefficients,
# regressor.intercept and call regressor.evaluate(...).
regressor = LinearRegression(featuresCol='Independent Features', labelCol='Salary')
regressor = regressor.fit(train_data)

In [17]:
# Fitted weights, one per entry of the feature vector (assembled as Age, Experience).
regressor.coefficients

DenseVector([12.9261, -201.9744])

In [18]:
# Fitted intercept term of the linear model.
regressor.intercept

646.2369220384753

In [19]:
# Score the fitted model on the held-out split; the returned object exposes
# the per-row predictions via its `predictions` attribute.
pred_results=regressor.evaluate(test_data)

In [20]:
# Show the held-out rows next to the model's `prediction` column.
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|           [1.0,3.0]|   400|53.239959500506984|
|          [11.0,1.0]|   100| 586.4495443806958|
+--------------------+------+------------------+

