In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build a SparkSession named 'Dataframe', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)
# Bare last expression so the notebook displays the session object.
spark

In [4]:
# Read Test3.csv, taking column names from the first row and letting Spark
# infer each column's type instead of defaulting everything to string.
df_pyspark = spark.read.csv(
    'Test3.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the loaded rows. (A stray trailing character after the call made
# this cell a SyntaxError, so it has been removed.)
df_pyspark.show()

+--------+---+-----------+-------+
|    Name|Age|Experience | Salary|
+--------+---+-----------+-------+
|  Gaurav| 20|         10| 900000|
|  Roopam| 30|         21|1000000|
| Dhairya| 10|          9|  20000|
|Abhilash| 30|         10|  20000|
|  Naveen| 10|          9|  10000|
|  Naresh| 30|         12|  30000|
+--------+---+-----------+-------+



In [6]:
# Print each column's name, inferred type, and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience : integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [7]:
# List the column names exactly as read from the CSV header. Note that
# 'Experience ' carries a trailing space, so later references must include it.
df_pyspark.columns

['Name', 'Age', 'Experience ', 'Salary']

In [8]:
### VectorAssembler

In [9]:
## VectorAssembler groups all the independent features into a single vector column, which is then treated as one new feature.

In [10]:
from pyspark.ml.feature import VectorAssembler

In [15]:
# Combine the two predictor columns into one vector column.
# 'Experience ' keeps its trailing space because that is the column name
# as it appears in the loaded DataFrame.
featureassembler = VectorAssembler(
    inputCols=['Age', 'Experience '],
    outputCol='Independent Feature',
)

In [16]:
# Apply the assembler: the result is the original frame plus the
# 'Independent Feature' vector column.
output = featureassembler.transform(df_pyspark)

In [17]:
# Preview the frame with the assembled feature vector appended.
output.show()

+--------+---+-----------+-------+-------------------+
|    Name|Age|Experience | Salary|Independent Feature|
+--------+---+-----------+-------+-------------------+
|  Gaurav| 20|         10| 900000|        [20.0,10.0]|
|  Roopam| 30|         21|1000000|        [30.0,21.0]|
| Dhairya| 10|          9|  20000|         [10.0,9.0]|
|Abhilash| 30|         10|  20000|        [30.0,10.0]|
|  Naveen| 10|          9|  10000|         [10.0,9.0]|
|  Naresh| 30|         12|  30000|        [30.0,12.0]|
+--------+---+-----------+-------+-------------------+



In [18]:
# Confirm the new 'Independent Feature' column is present.
output.columns

['Name', 'Age', 'Experience ', 'Salary', 'Independent Feature']

In [19]:
# Keep only the feature vector and the 'Salary' column, which are the
# features and label columns the regression below is configured with.
finalized_data = output.select('Independent Feature', 'Salary')

In [20]:
# Preview the two-column frame that will be split for training and testing.
finalized_data.show()

+-------------------+-------+
|Independent Feature| Salary|
+-------------------+-------+
|        [20.0,10.0]| 900000|
|        [30.0,21.0]|1000000|
|         [10.0,9.0]|  20000|
|        [30.0,10.0]|  20000|
|         [10.0,9.0]|  10000|
|        [30.0,12.0]|  30000|
+-------------------+-------+



In [23]:
from pyspark.ml.regression import LinearRegression

# Train/test split (75% / 25%). A fixed seed keeps the split, and therefore
# the fitted model and the metrics below, repeatable across runs on the same
# input; without it every run samples different rows.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator and the fitted model under separate names.
# `regressor` remains the fitted model because later cells read
# `regressor.coefficients`, `regressor.intercept` and `regressor.evaluate`.
linear_regression = LinearRegression(featuresCol='Independent Feature', labelCol='Salary')
regressor = linear_regression.fit(train_data)

In [24]:
# Coefficients of the fitted linear model.
regressor.coefficients

DenseVector([-96082.5243, 94805.8252])

In [25]:
# Intercept term of the fitted linear model.
regressor.intercept

1873592.2330096979

In [26]:
# Prediction: evaluate the fitted model on the held-out test split.
# The result exposes the predictions and the error metrics used below.
pred_results = regressor.evaluate(test_data)

In [27]:
# Show the test rows alongside the model's predicted values.
pred_results.predictions.show()

+-------------------+------+------------------+
|Independent Feature|Salary|        prediction|
+-------------------+------+------------------+
|         [10.0,9.0]| 10000|1766019.4174757213|
|         [10.0,9.0]| 20000|1766019.4174757213|
+-------------------+------+------------------+



In [28]:
# Display (mean absolute error, mean squared error) on the test split as a tuple.
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(1751019.4174757213, 3066094000377.015)