In [1]:
import pyspark
#import pandas as pd
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression 
from pyspark.ml.feature import StringIndexer
# PySpark needs an active SparkSession before any DataFrame can be created or read

from pyspark.sql import SparkSession
# Reuse the active SparkSession if there is one; otherwise create one with the
# application name 'practice3'.
spark = SparkSession.builder.appName('practice3').getOrCreate()

# Load the tips dataset: the first row supplies the column names and the column
# types are inferred from the data.
df = spark.read.csv('tips.csv', header=True, inferSchema=True)

# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" after the schema.
df.printSchema()
df.show(5)

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)

None
+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [2]:
# Handling categorical features
# StringIndexer will ordinal encoding to change categorical into numberical values
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCols= ['sex', 'smoker', 'day', 'time'], outputCols= ['sex_indexed', 'smoked_indexed', 'day_indexed', 'time_indexed'])
df_r = indexer.fit(df).transform(df)
df_r.show(5)

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoked_indexed|day_indexed|time_indexed|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
only showing top 5 rows



In [3]:
# Pack the predictor columns into one vector column; 'total_bill' is not part of
# the vector and stays a separate column.
feature_cols = ["tip", "size", 'sex_indexed', 'smoked_indexed', 'day_indexed', 'time_indexed']
featureassembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="independent feature",
)

# Build the modelling frame: only 'total_bill' and the assembled vector are kept.
output = featureassembler.transform(df_r)
final_df = output.select('total_bill', 'independent feature')
final_df.show(5)

+----------+--------------------+
|total_bill| independent feature|
+----------+--------------------+
|     16.99|[1.01,2.0,1.0,0.0...|
|     10.34|[1.66,3.0,0.0,0.0...|
|     21.01|[3.5,3.0,0.0,0.0,...|
|     23.68|[3.31,2.0,0.0,0.0...|
|     24.59|[3.61,4.0,1.0,0.0...|
+----------+--------------------+
only showing top 5 rows



In [4]:
# Split the data into train and test sets, then fit a linear regression on the
# training part.

# A fixed seed pins the random sampling, so re-running the notebook on the same
# input yields the same split instead of different coefficients and metrics
# on every execution.
train_data, test_data = final_df.randomSplit([0.75, 0.25], seed=42)

# The unfitted estimator gets its own name; `regressor` refers only to the fitted
# model, which is the object the evaluation cells call `.evaluate()` on.
lr_estimator = LinearRegression(featuresCol="independent feature", labelCol="total_bill")
regressor = lr_estimator.fit(train_data)

# Plain string literals: the previous f-strings contained no placeholders.
print("LR Coeff: ", regressor.coefficients)
print("LR intercept: ", regressor.intercept)

LR Coeff:  [2.6066856774163667,3.7059028831099696,-0.4717825341784274,2.251695427417091,-0.8431668944334891,-0.18545269747894372]
LR intercept:  2.634702211405314


In [6]:
# Evaluate the fitted model on the held-out test split and preview the result.
# The object returned by evaluate() exposes the scored rows as `predictions`.
pred_result = regressor.evaluate(test_data)
predictions_df = pred_result.predictions
predictions_df.show(5)

+----------+--------------------+------------------+
|total_bill| independent feature|        prediction|
+----------+--------------------+------------------+
|      3.07|[1.0,1.0,1.0,1.0,...|10.727203665170313|
|      7.56|[1.44,2.0,0.0,0.0...|11.928348866758899|
|      8.52|[1.48,2.0,0.0,0.0...|12.032616293855554|
|      8.58|[1.92,1.0,0.0,1.0...|10.882183641792388|
|      8.77|[2.0,2.0,0.0,0.0,...|14.416712438024497|
+----------+--------------------+------------------+
only showing top 5 rows



In [7]:
# Performance metrics read from the test-set evaluation result.
metric_rows = (
    ("R-Squared :", pred_result.r2),
    ("Mean Absolute Error :", pred_result.meanAbsoluteError),
    ("Mean Squared Error :", pred_result.meanSquaredError),
)
for metric_label, metric_value in metric_rows:
    print(metric_label, metric_value)

R-Squared : 0.6782067649182508
Mean Absolute Error : 4.5688578480495
Mean Squared Error : 33.91810933942235
