In [1]:
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.appName('snoozer').getOrCreate()

In [4]:
spark

In [15]:
df = spark.read.csv('test.csv', header=True, inferSchema=True)

In [20]:
df = df.na.drop(subset='NAME')
df.show()

+-----+----+----------+------+
| NAME| AGE|EXPERIENCE|SALARY|
+-----+----+----------+------+
| JON |  31|        10| 30000|
|  VON|  53|        20| 60000|
|  RON|  25|         2| 10000|
|BJORK|  23|         1| 10000|
| DORK|null|      null| 30000|
+-----+----+----------+------+



In [21]:
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=['AGE','EXPERIENCE'],
    outputCols=['AGE','EXPERIENCE']
).setStrategy("median")

df = imputer.fit(df).transform(df)
df.show()

+-----+---+----------+------+
| NAME|AGE|EXPERIENCE|SALARY|
+-----+---+----------+------+
| JON | 31|        10| 30000|
|  VON| 53|        20| 60000|
|  RON| 25|         2| 10000|
|BJORK| 23|         1| 10000|
| DORK| 25|         2| 30000|
+-----+---+----------+------+



In [23]:
df.printSchema()

root
 |-- NAME: string (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- EXPERIENCE: integer (nullable = true)
 |-- SALARY: integer (nullable = true)



In [25]:
df.columns

['NAME', 'AGE', 'EXPERIENCE', 'SALARY']

In [26]:
## merging features into a single independant feature.
from pyspark.ml.feature import VectorAssembler

featureAssembler = VectorAssembler(inputCols=['AGE','EXPERIENCE'], outputCol='AE')

In [27]:
output=featureAssembler.transform(df)

In [29]:
output.show()
output.columns

+-----+---+----------+------+-----------+
| NAME|AGE|EXPERIENCE|SALARY|         AE|
+-----+---+----------+------+-----------+
| JON | 31|        10| 30000|[31.0,10.0]|
|  VON| 53|        20| 60000|[53.0,20.0]|
|  RON| 25|         2| 10000| [25.0,2.0]|
|BJORK| 23|         1| 10000| [23.0,1.0]|
| DORK| 25|         2| 30000| [25.0,2.0]|
+-----+---+----------+------+-----------+



['NAME', 'AGE', 'EXPERIENCE', 'SALARY', 'AE']

In [31]:
final_data=output.select('AE','SALARY')
final_data.show()

+-----------+------+
|         AE|SALARY|
+-----------+------+
|[31.0,10.0]| 30000|
|[53.0,20.0]| 60000|
| [25.0,2.0]| 10000|
| [23.0,1.0]| 10000|
| [25.0,2.0]| 30000|
+-----------+------+



In [32]:
from pyspark.ml.regression import LinearRegression
train_data, test_data = final_data.randomSplit([0.75,0.25])

regressor=LinearRegression(featuresCol='AE', labelCol='SALARY')
regressor=regressor.fit(train_data)

In [33]:
regressor.coefficients

DenseVector([-2000.0, 4000.0])

In [34]:
regressor.intercept

52000.00000000378

In [36]:
pred=regressor.evaluate(train_data)

In [38]:
pred.predictions.show()

+-----------+------+------------------+
|         AE|SALARY|        prediction|
+-----------+------+------------------+
| [23.0,1.0]| 10000|10000.000000000087|
| [25.0,2.0]| 10000|  9999.99999999989|
|[31.0,10.0]| 30000|30000.000000000022|
+-----------+------+------------------+



In [39]:
pred.meanAbsoluteError,pred.meanSquaredError

(7.275957614183426e-11, 6.670384459627615e-21)