In [1]:
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists, under the app name 'agg'.
spark = (
    SparkSession.builder
    .appName('agg')
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [7]:
# Load test3.csv: take column names from the header row and let Spark infer each column's type.
df_pyspark = spark.read.csv(
    'test3.csv',
    header=True,
    inferSchema=True,
)

In [9]:
# Print the loaded rows as a text table.
df_pyspark.show()


+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|Rajesh| 28|         5| 75000|
| Priya| 25|         3| 60000|
|  Anil| 30|         7| 90000|
|Sunita| 27|         4| 70000|
|Vikram| 35|        10|120000|
|  Neha| 29|         6| 85000|
| Arjun| 32|         8| 95000|
| Pooja| 26|         3| 62000|
|Ramesh| 40|        15|150000|
+------+---+----------+------+



In [11]:
# Print each column's name, inferred type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [23]:
# Group the rows by name and print the total salary within each group.
(
    df_pyspark
    .groupBy('name')
    .sum('salary')
    .show()
)

+------+-----------+
|  name|sum(salary)|
+------+-----------+
|Ramesh|     150000|
| Priya|      60000|
|Vikram|     120000|
|Sunita|      70000|
|Rajesh|      75000|
| Pooja|      62000|
| Arjun|      95000|
|  Neha|      85000|
|  Anil|      90000|
+------+-----------+



In [31]:
## group by age: average salary for each distinct age

df_pyspark.groupby('age').mean('salary').show()


+---+-----------+
|age|avg(salary)|
+---+-----------+
| 28|    75000.0|
| 27|    70000.0|
| 26|    62000.0|
| 40|   150000.0|
| 35|   120000.0|
| 25|    60000.0|
| 29|    85000.0|
| 32|    95000.0|
| 30|    90000.0|
+---+-----------+



In [39]:
# Count how many rows share each age value.
(
    df_pyspark
    .groupby('age')
    .count()
    .show()
)

+---+-----+
|age|count|
+---+-----+
| 28|    1|
| 27|    1|
| 26|    1|
| 40|    1|
| 35|    1|
| 25|    1|
| 29|    1|
| 32|    1|
| 30|    1|
+---+-----+



In [41]:
# Print the highest salary recorded for each name.
(
    df_pyspark
    .groupby('name')
    .max('salary')
    .show()
)

+------+-----------+
|  name|max(salary)|
+------+-----------+
|Ramesh|     150000|
| Priya|      60000|
|Vikram|     120000|
|Sunita|      70000|
|Rajesh|      75000|
| Pooja|      62000|
| Arjun|      95000|
|  Neha|      85000|
|  Anil|      90000|
+------+-----------+



In [43]:
# Read test3.csv into the DataFrame used by the regression cells (header row as names, types inferred).
training = spark.read.csv(
    'test3.csv',
    header=True,
    inferSchema=True,
)

In [45]:
# Print the training rows as a text table.
training.show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|Rajesh| 28|         5| 75000|
| Priya| 25|         3| 60000|
|  Anil| 30|         7| 90000|
|Sunita| 27|         4| 70000|
|Vikram| 35|        10|120000|
|  Neha| 29|         6| 85000|
| Arjun| 32|         8| 95000|
| Pooja| 26|         3| 62000|
|Ramesh| 40|        15|150000|
+------+---+----------+------+



In [47]:
# Print each column's name, inferred type and nullability.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [49]:
# List the column names; the cells below choose feature and label columns from these.
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [None]:
# [Age, Experience] ----> combined into one new vector feature ----> "independent feature"

In [57]:
from pyspark.ml.feature import VectorAssembler

# Pack the Age and Experience columns into a single vector column named "independent feature".
featureassembler = VectorAssembler(
    inputCols=["Age", "Experience"],
    outputCol="independent feature",
)

In [59]:
# Apply the assembler to `training`; the result keeps the original columns and adds the vector column.
output = featureassembler.transform(training)

In [61]:
# Print the rows together with the newly assembled "independent feature" column.
output.show()

+------+---+----------+------+-------------------+
|  Name|Age|Experience|Salary|independent feature|
+------+---+----------+------+-------------------+
|Rajesh| 28|         5| 75000|         [28.0,5.0]|
| Priya| 25|         3| 60000|         [25.0,3.0]|
|  Anil| 30|         7| 90000|         [30.0,7.0]|
|Sunita| 27|         4| 70000|         [27.0,4.0]|
|Vikram| 35|        10|120000|        [35.0,10.0]|
|  Neha| 29|         6| 85000|         [29.0,6.0]|
| Arjun| 32|         8| 95000|         [32.0,8.0]|
| Pooja| 26|         3| 62000|         [26.0,3.0]|
|Ramesh| 40|        15|150000|        [40.0,15.0]|
+------+---+----------+------+-------------------+



In [63]:
# List the column names after assembly.
output.columns

['Name', 'Age', 'Experience', 'Salary', 'independent feature']

In [65]:
# Keep only the assembled feature vector and the salary column (the regression label).
final_data = output.select('independent feature', 'salary')

In [67]:
# Print the two-column modelling dataset.
final_data.show()

+-------------------+------+
|independent feature|salary|
+-------------------+------+
|         [28.0,5.0]| 75000|
|         [25.0,3.0]| 60000|
|         [30.0,7.0]| 90000|
|         [27.0,4.0]| 70000|
|        [35.0,10.0]|120000|
|         [29.0,6.0]| 85000|
|         [32.0,8.0]| 95000|
|         [26.0,3.0]| 62000|
|        [40.0,15.0]|150000|
+-------------------+------+



In [78]:
from pyspark.ml.regression import LinearRegression

# Split the rows into train/test with weights 0.75/0.25. randomSplit assigns rows
# randomly, so a fixed seed is passed to keep the split -- and the coefficients and
# metrics derived from it -- repeatable across runs.
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)

# Unfitted estimator: features come from the assembled vector column, label is salary.
lr_estimator = LinearRegression(featuresCol='independent feature', labelCol='salary')

# `regressor` is the fitted model; it exposes .coefficients, .intercept and .evaluate().
regressor = lr_estimator.fit(train_data)

In [80]:
# Fitted weights, one per entry of the feature vector (assembled from Age, Experience in that order).
regressor.coefficients

DenseVector([201.6129, 6717.7419])

In [82]:
# Fitted intercept term of the linear model.
regressor.intercept

36612.90322583728

In [86]:
# Evaluate the fitted model on the held-out rows; the returned summary carries the scored rows.
pred_results = regressor.evaluate(test_data)
# Print features, actual salary and predicted salary side by side.
pred_results.predictions.show()

+-------------------+------+------------------+
|independent feature|salary|        prediction|
+-------------------+------+------------------+
|         [28.0,5.0]| 75000| 75846.77419354836|
|        [35.0,10.0]|120000|110846.77419354694|
|        [40.0,15.0]|150000|145443.54838709673|
+-------------------+------+------------------+



In [88]:
# Mean absolute error and mean squared error on the test split, displayed as a tuple.
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(4852.150537634899, 35086606.833168864)