<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import os

import pyspark
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) a local Spark session named "Practice"; the trailing bare
# expression renders the session summary as the cell output.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Practice")
    .getOrCreate()
)
spark

In [21]:
# Directory holding the sample CSVs. Set the SAMPLE_DATA_DIR environment
# variable to run the notebook on another machine; the fallback is the
# location the notebook was originally written against.
DATA_DIR = os.environ.get(
    "SAMPLE_DATA_DIR",
    r'C:\Users\SHEHA\Downloads\GitHubRepository\SampleDatasets',
)

# Load the salary sample: the first line is the header, and column types are
# inferred from the data instead of defaulting to string.
dfspark = spark.read.csv(os.path.join(DATA_DIR, 'test1.csv'), header=True, inferSchema=True)
dfspark

DataFrame[Name: string, age: int, Experience: int, Salary: int]

In [22]:
# Print the loaded salary data as a text table.
dfspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     Jack| 27|         5| 17000|
|    Karim| 35|         7| 22000|
|     Mick| 20|         6| 13000|
+---------+---+----------+------+



In [23]:
# Combine the "age" and "Experience" columns into a single vector column,
# "Independent Features", which serves as the model's feature input.
# (VectorAssembler is imported in the notebook's import cell.)
featureassembler = VectorAssembler(
    inputCols=["age", "Experience"],
    outputCol="Independent Features",
)

In [24]:
# Append the assembled feature vector to every row of the salary data,
# then print the result to check the new column.
output = featureassembler.transform(
    dfspark
)
output.show()

+---------+---+----------+------+--------------------+
|     Name|age|Experience|Salary|Independent Features|
+---------+---+----------+------+--------------------+
|    Krish| 31|        10| 30000|         [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|          [30.0,8.0]|
|    Sunny| 29|         4| 20000|          [29.0,4.0]|
|     Paul| 24|         3| 20000|          [24.0,3.0]|
|   Harsha| 21|         1| 15000|          [21.0,1.0]|
|  Shubham| 23|         2| 18000|          [23.0,2.0]|
|     Jack| 27|         5| 17000|          [27.0,5.0]|
|    Karim| 35|         7| 22000|          [35.0,7.0]|
|     Mick| 20|         6| 13000|          [20.0,6.0]|
+---------+---+----------+------+--------------------+



In [25]:
# Keep only the two columns the regressor consumes: the feature vector and
# the Salary label.
finalized_data = output.select(["Independent Features", "Salary"])
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
|          [27.0,5.0]| 17000|
|          [35.0,7.0]| 22000|
|          [20.0,6.0]| 13000|
+--------------------+------+



In [26]:
# Train/test split: roughly 75% of the rows for fitting, 25% held out.
# randomSplit assigns rows at random, so the proportions are approximate.
# The fixed seed makes the same rows land in each split on every run, which
# keeps the fitted coefficients and the error metrics reproducible.
# (LinearRegression is imported in the notebook's import cell.)
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

In [27]:
# Print the rows that ended up in the training split.
train_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [20.0,6.0]| 13000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
|          [29.0,4.0]| 20000|
|          [30.0,8.0]| 25000|
|          [35.0,7.0]| 22000|
+--------------------+------+



In [28]:
# Print the rows that were held out for testing.
test_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [24.0,3.0]| 20000|
|          [27.0,5.0]| 17000|
|         [31.0,10.0]| 30000|
+--------------------+------+



In [29]:
# Fit a linear regression of Salary on the assembled feature vector using the
# training split. `regressor` ends up bound to the fitted model, which the
# following cells inspect and evaluate.
regressor = LinearRegression(
    featuresCol='Independent Features',
    labelCol='Salary',
).fit(train_data)

In [31]:
# Coefficients of the fitted model, one per assembled input column
# (age, Experience).
regressor.coefficients

DenseVector([612.2575, 118.109])

In [32]:
# Intercept (constant term) of the fitted model.
regressor.intercept

2159.3778872806065

In [33]:
# Prediction: evaluate the fitted model on the held-out split. The returned
# summary exposes the per-row predictions (shown here) and the error metrics
# read in the next cell.
pred_results = regressor.evaluate(test_data)
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [24.0,3.0]| 20000|17207.884200800738|
|          [27.0,5.0]| 17000|19280.874653526334|
|         [31.0,10.0]| 30000| 22320.44964582693|
+--------------------+------+------------------+



In [34]:
# Test-split error metrics as a tuple: (mean absolute error, mean squared error).
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(4250.846935632889, 23991264.487838905)

In [10]:
# Working with the tips dataset.
# Directory holding the sample CSVs. Set the SAMPLE_DATA_DIR environment
# variable to run the notebook on another machine; the fallback is the
# location the notebook was originally written against.
DATA_DIR = os.environ.get(
    "SAMPLE_DATA_DIR",
    r'C:\Users\SHEHA\Downloads\GitHubRepository\SampleDatasets',
)

# The first line is the header, and column types are inferred from the data.
df = spark.read.csv(os.path.join(DATA_DIR, 'tips.csv'), header=True, inferSchema=True)
df

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: int]

In [11]:
# Print each column's name, inferred type, and nullability.
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [12]:
# Handling Categorical Features
from pyspark.ml.feature import StringIndexer

In [13]:
# Encode the string column "sex" as a numeric column "sex_indexed": fit the
# indexer on df, then add the indexed column to the same rows.
indexer = StringIndexer(inputCol="sex",outputCol="sex_indexed")
df_r = indexer.fit(df).transform(df)
# Preview the first 5 rows to check the new column.
df_r.show(5)

+----------+----+------+------+---+------+----+-----------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|
+----------+----+------+------+---+------+----+-----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|
+----------+----+------+------+---+------+----+-----------+
only showing top 5 rows



In [14]:
# Handling multiple categorical features.
# Index every categorical column in one pass, starting from the raw `df`
# rather than from `df_r`. Feeding `df_r` back into itself made this cell fail
# on a second run, because the indexer refuses to write an output column that
# already exists. The resulting `df_r` has the same columns, in the same order,
# as before: the originals followed by sex_indexed, smoker_indexed,
# day_indexed, time_index.
indexer = StringIndexer(
    inputCols=["sex", "smoker", "day", "time"],
    outputCols=["sex_indexed", "smoker_indexed", "day_indexed", "time_index"],
)
df_r = indexer.fit(df).transform(df)
df_r.show(5)

+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_index|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|       0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|       0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|       0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|       0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|       0.0|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+
only showing top 5 rows



In [15]:
# Assemble the predictors of total_bill (tip, party size, and the indexed
# categorical columns) into one "Independent Features" vector column.
# (VectorAssembler is imported in the notebook's import cell.)
featureassembler = VectorAssembler(
    inputCols=[
        'tip',
        'size',
        'sex_indexed',
        'smoker_indexed',
        'day_indexed',
        'time_index',
    ],
    outputCol="Independent Features",
)
output = featureassembler.transform(df_r)

In [17]:
# Preview the assembled feature vectors for the first 5 rows.
output.select('Independent Features').show(5)

+--------------------+
|Independent Features|
+--------------------+
|[1.01,2.0,1.0,0.0...|
|[1.66,3.0,0.0,0.0...|
|[3.5,3.0,0.0,0.0,...|
|[3.31,2.0,0.0,0.0...|
|[3.61,4.0,1.0,0.0...|
+--------------------+
only showing top 5 rows



In [18]:
# Preview the first 5 rows with the original, indexed, and assembled columns side by side.
output.show(5)

+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_index|Independent Features|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|       0.0|[1.01,2.0,1.0,0.0...|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|       0.0|[1.66,3.0,0.0,0.0...|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|       0.0|[3.5,3.0,0.0,0.0,...|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|       0.0|[3.31,2.0,0.0,0.0...|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|       0.0|[3.61,4.0,1.0,0.0...|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+
only showing top 5 rows

In [19]:
# Keep only the two columns the regressor consumes: the feature vector and
# the total_bill label. Preview the first 5 rows.
finalized_data = output.select(["Independent Features", "total_bill"])
finalized_data.show(5)

+--------------------+----------+
|Independent Features|total_bill|
+--------------------+----------+
|[1.01,2.0,1.0,0.0...|     16.99|
|[1.66,3.0,0.0,0.0...|     10.34|
|[3.5,3.0,0.0,0.0,...|     21.01|
|[3.31,2.0,0.0,0.0...|     23.68|
|[3.61,4.0,1.0,0.0...|     24.59|
+--------------------+----------+
only showing top 5 rows



In [20]:
# Train/test split: roughly 75% of the rows for fitting, 25% held out.
# randomSplit assigns rows at random, so the proportions are approximate.
# The fixed seed makes the same rows land in each split on every run, which
# keeps the fitted coefficients and the metrics below reproducible.
# (LinearRegression is imported in the notebook's import cell.)
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Fit a linear regression of total_bill on the assembled feature vector.
# `regressor` is bound to the fitted model, which the following cells inspect
# and evaluate.
regressor = LinearRegression(
    featuresCol='Independent Features',
    labelCol='total_bill',
).fit(train_data)

In [21]:
# Coefficients of the fitted model, one per assembled input column.
regressor.coefficients

DenseVector([2.9205, 4.0674, -1.4471, 2.754, -0.4516, -0.1291])

In [24]:
# Intercept (constant term) of the fitted model.
regressor.intercept

0.8934485513167622

In [25]:
# Predictions: evaluate the fitted model on the held-out split. The returned
# summary is used by the next cells for the per-row predictions and the metrics.
pred_results = regressor.evaluate(test_data)

In [26]:
# Final comparison: actual total_bill next to the model's prediction for the
# first 5 test rows.
pred_results.predictions.show(5)

+--------------------+----------+------------------+
|Independent Features|total_bill|        prediction|
+--------------------+----------+------------------+
|(6,[0,1],[1.25,2.0])|     10.07| 12.67879029485173|
|(6,[0,1],[1.25,2.0])|     10.51| 12.67879029485173|
|(6,[0,1],[2.34,4.0])|     17.81|23.996850968581263|
| (6,[0,1],[2.5,4.0])|     18.35|24.464132038386698|
|(6,[0,1],[2.64,3.0])|     17.59| 20.80564878162644|
+--------------------+----------+------------------+
only showing top 5 rows



In [27]:
# Performance metrics on the test split, as a tuple:
# (R squared, mean absolute error, mean squared error).
pred_results.r2,pred_results.meanAbsoluteError,pred_results.meanSquaredError

(0.47196705413679585, 4.2732245187013165, 30.909413406985074)