In [2]:
from pyspark.sql import SparkSession
# Start (or reuse an already running) Spark session for this notebook.
ps = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)

In [5]:
# Load the CSV: the first row supplies the column names and the column
# types are inferred from the data.
df = ps.read.csv(
    "spark.csv",
    header=True,
    inferSchema=True,
)

In [20]:
# Preview the loaded rows.
# NOTE(review): the saved output shows an `Index` column, but that name is only
# introduced by the rename in the next cell (run earlier, as In [19]). On a
# fresh top-to-bottom run this cell displays `Sr.No` instead.
df.show()

+-----+-------+---+------+
|Index|   Name|Age|Salary|
+-----+-------+---+------+
|    1|Kaushal| 26| 42000|
|    2| Satyam| 24| 25000|
|    3| Shivam| 26| 30000|
|    4|Ranjeet| 25| 40000|
|    5|Ghanavi| 23| 50000|
|    6|Kaushal| 26| 25000|
|    7| Satyam| 24| 16000|
|    8|Ranjeet| 25| 20000|
|    9|Ghanavi| 23| 15000|
+-----+-------+---+------+



In [19]:
# Inspect the schema as loaded, i.e. before the rename below is applied.
df.printSchema()

# Rename the serial-number column to 'Index', the name the feature
# assembler further down refers to.
df = df.withColumnRenamed(existing='Sr.No', new='Index')

root
 |-- Sr.No: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [21]:
# Vector Assembler to group all independent features
from pyspark.ml.feature import VectorAssembler

# Packs the listed numeric columns, in this order, into a single vector
# column named 'independent'.
# NOTE(review): 'Index' looks like a row serial number rather than a real
# predictor -- confirm it is meant to be a model feature.
va = VectorAssembler(
    inputCols=['Index', 'Age'],
    outputCol='independent',
)

In [22]:
# Apply the assembler: keeps the original columns and appends the
# 'independent' feature-vector column.
output = va.transform(df)

In [23]:
# Check that each row's 'independent' vector holds its [Index, Age] values.
output.show()

+-----+-------+---+------+-----------+
|Index|   Name|Age|Salary|independent|
+-----+-------+---+------+-----------+
|    1|Kaushal| 26| 42000| [1.0,26.0]|
|    2| Satyam| 24| 25000| [2.0,24.0]|
|    3| Shivam| 26| 30000| [3.0,26.0]|
|    4|Ranjeet| 25| 40000| [4.0,25.0]|
|    5|Ghanavi| 23| 50000| [5.0,23.0]|
|    6|Kaushal| 26| 25000| [6.0,26.0]|
|    7| Satyam| 24| 16000| [7.0,24.0]|
|    8|Ranjeet| 25| 20000| [8.0,25.0]|
|    9|Ghanavi| 23| 15000| [9.0,23.0]|
+-----+-------+---+------+-----------+



In [24]:
# List the column names available after assembling the feature vector.
output.columns

['Index', 'Name', 'Age', 'Salary', 'independent']

In [25]:
# Keep only the feature vector and the label for modelling.
# The label is requested as lowercase 'salary' (the source column is 'Salary');
# the resulting column carries the lowercase name, which is the spelling the
# regression cell uses for labelCol, so it must stay as written here.
final = output.select('independent', 'salary')

In [26]:
# Preview the two-column (features, label) frame used for training.
final.show()

+-----------+------+
|independent|salary|
+-----------+------+
| [1.0,26.0]| 42000|
| [2.0,24.0]| 25000|
| [3.0,26.0]| 30000|
| [4.0,25.0]| 40000|
| [5.0,23.0]| 50000|
| [6.0,26.0]| 25000|
| [7.0,24.0]| 16000|
| [8.0,25.0]| 20000|
| [9.0,23.0]| 15000|
+-----------+------+



In [27]:
from pyspark.ml.regression import LinearRegression

# Split roughly 75% / 25% into train and test. A fixed seed keeps the
# partition, and therefore the fitted coefficients and the predictions,
# repeatable across runs on the same input; without it every run samples a
# different split.
# NOTE: the outputs recorded below were produced by an unseeded split and
# will not match until the notebook is re-run.
train, test = final.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so `lm` is bound exactly
# once, to the fitted model that the following cells inspect.
lr_estimator = LinearRegression(featuresCol='independent', labelCol='salary')
lm = lr_estimator.fit(train)

In [28]:
# Fitted weights, one per entry of the 'independent' vector
# (same order as the assembler's inputCols: Index, Age).
lm.coefficients

DenseVector([-5012.9397, -5415.6894])

In [29]:
# Constant term of the fitted linear model.
lm.intercept

190059.8463404566

In [30]:
# Evaluate the fitted model on the held-out test split; the returned object
# exposes the per-row predictions via `.predictions` (used in the next cell).
pred_res=lm.evaluate(test)

In [31]:
# Show the test rows with the actual 'salary' next to the model's 'prediction'.
pred_res.predictions.show()

+-----------+------+-----------------+
|independent|salary|       prediction|
+-----------+------+-----------------+
| [2.0,24.0]| 25000|50057.42013748351|
| [4.0,25.0]| 40000|34615.85119288301|
+-----------+------+-----------------+

