# Ice-cream Revenue Prediction
- Independant variable X: Outside Air Temperature
- Dependant variable Y: Overall daily revenue generated in dollars

In [0]:
dbutils.fs.ls("dbfs:/FileStore/tables/")

[FileInfo(path='dbfs:/FileStore/tables/Admission_Chance.csv', name='Admission_Chance.csv', size=12905, modificationTime=1720190058000),
 FileInfo(path='dbfs:/FileStore/tables/Cancer.csv', name='Cancer.csv', size=125204, modificationTime=1720190099000),
 FileInfo(path='dbfs:/FileStore/tables/Credit_Default.csv', name='Credit_Default.csv', size=101152, modificationTime=1720190106000),
 FileInfo(path='dbfs:/FileStore/tables/Customer_Purchase.csv', name='Customer_Purchase.csv', size=1489, modificationTime=1720190113000),
 FileInfo(path='dbfs:/FileStore/tables/Fish.csv', name='Fish.csv', size=6349, modificationTime=1720190119000),
 FileInfo(path='dbfs:/FileStore/tables/Ice_Cream.csv', name='Ice_Cream.csv', size=4872, modificationTime=1720190124000),
 FileInfo(path='dbfs:/FileStore/tables/Test1.csv', name='Test1.csv', size=108, modificationTime=1720158698000),
 FileInfo(path='dbfs:/FileStore/tables/Test2.csv', name='Test2.csv', size=192, modificationTime=1720158698000),
 FileInfo(path='dbfs:

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, abs
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
spark = SparkSession.builder.appName('Ice-cream Revenue Prediction').getOrCreate() 

In [0]:
spark

In [0]:
df_pyspark = spark.read.csv('dbfs:/FileStore/tables/Ice_Cream.csv',header=True,inferSchema=True)

In [0]:
df_pyspark.printSchema()

root
 |-- Temperature: double (nullable = true)
 |-- Revenue: integer (nullable = true)



In [0]:
df_pyspark

DataFrame[Temperature: double, Revenue: int]

In [0]:
df_pyspark.show()

+-----------+-------+
|Temperature|Revenue|
+-----------+-------+
|       24.6|    535|
|       26.1|    626|
|       27.8|    661|
|       20.6|    488|
|       11.6|    317|
|       14.4|    368|
|       13.8|    309|
|       30.9|    697|
|        1.0|     56|
|       31.7|    738|
|       11.5|    326|
|        3.7|     72|
|       18.9|    468|
|       13.7|    290|
|       39.6|    906|
|       18.5|    470|
|       26.0|    649|
|       42.6|    922|
|       29.6|    650|
|       21.8|    535|
+-----------+-------+
only showing top 20 rows



# 1. Clean the DataFrame

In [0]:
# Handle missing values if necessary
df_pyspark = df_pyspark.na.drop()

# Prepare the DataFrame

In [0]:
# Define the feature columns
feature_columns = ["Temperature"]

# Assemble features into a single vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_pyspark = assembler.transform(df_pyspark)

# Select only the features and target column
df_pyspark = df_pyspark.select("features", "Revenue")

In [0]:
df_pyspark.show()

+--------+-------+
|features|Revenue|
+--------+-------+
|  [24.6]|    535|
|  [26.1]|    626|
|  [27.8]|    661|
|  [20.6]|    488|
|  [11.6]|    317|
|  [14.4]|    368|
|  [13.8]|    309|
|  [30.9]|    697|
|   [1.0]|     56|
|  [31.7]|    738|
|  [11.5]|    326|
|   [3.7]|     72|
|  [18.9]|    468|
|  [13.7]|    290|
|  [39.6]|    906|
|  [18.5]|    470|
|  [26.0]|    649|
|  [42.6]|    922|
|  [29.6]|    650|
|  [21.8]|    535|
+--------+-------+
only showing top 20 rows



# 3. Split the DataFrame

In [0]:
# Split the data into training and testing sets
train_data, test_data = df_pyspark.randomSplit([0.8, 0.2], seed=42)

# 4. Train the Model

In [0]:
# Initialize the linear regression model
lr = LinearRegression(featuresCol="features", labelCol="Revenue")

# Fit the model on the training data
lr_model = lr.fit(train_data)

# 5. Evaluate the Model

In [0]:
# Make predictions on the test data
predictions = lr_model.transform(test_data)

# Evaluate the model using RMSE
rmse_evaluator = RegressionEvaluator(labelCol="Revenue", predictionCol="prediction", metricName="rmse")
rmse = rmse_evaluator.evaluate(predictions)

# Evaluate the model using MAE
mae_evaluator = RegressionEvaluator(labelCol="Revenue", predictionCol="prediction", metricName="mae")
mae = mae_evaluator.evaluate(predictions)

# Evaluate the model using MSE
mse_evaluator = RegressionEvaluator(labelCol="Revenue", predictionCol="prediction", metricName="mse")
mse = mse_evaluator.evaluate(predictions)

# Calculate MAPE (Mean Absolute Percentage Error)
predictions = predictions.withColumn("absolute_error", abs(col("prediction") - col("Revenue")))
predictions = predictions.withColumn("percentage_error", col("absolute_error") / col("Revenue"))
mape = predictions.selectExpr("mean(percentage_error) as MAPE").collect()[0]["MAPE"] * 100

# Print the coefficients and intercept for linear regression
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")

# Print the evaluation metrics
print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")
print(f"Mean Absolute Error (MAE) on test data: {mae}")
print(f"Mean Squared Error (MSE) on test data: {mse}")
print(f"Mean Absolute Percentage Error (MAPE) on test data: {mape}%")

Coefficients: [21.539685239639077]
Intercept: 41.612746064834965
Root Mean Squared Error (RMSE) on test data: 26.228082441634957
Mean Absolute Error (MAE) on test data: 19.41581472970844
Mean Squared Error (MSE) on test data: 687.9123085652
Mean Absolute Percentage Error (MAPE) on test data: 4.645202492023024%


In [0]:
# Show some sample predictions
predictions.select("prediction", "Revenue", "features").show(5)

+------------------+-------+--------+
|        prediction|Revenue|features|
+------------------+-------+--------+
| 63.15243130447404|     56|   [1.0]|
|147.15720373906646|    189|   [4.9]|
|  157.927046358886|    243|   [5.4]|
|179.46673159852506|    192|   [6.4]|
|201.00641683816414|    193|   [7.4]|
+------------------+-------+--------+
only showing top 5 rows



In [0]:
# Calculate Confusion Matrix:

from pyspark.sql.functions import expr

predictions.groupBy("Revenue", "prediction").count().show()

+-------+------------------+-----+
|Revenue|        prediction|count|
+-------+------------------+-----+
|    289| 282.8572207487926|    1|
|    498| 498.2540731451834|    1|
|    527| 569.3350344359923|    1|
|    251|235.46991322158667|    1|
|    402|399.17152104284366|    1|
|    703| 709.3429884936463|    1|
|    189|147.15720373906646|    1|
|    407|394.86358399491576|    1|
|    547|  530.563601004642|    1|
|    661| 640.4159957268013|    1|
|    507|465.94454528572476|    1|
|    570| 565.0270973880645|    1|
|    307|313.01278008428733|    1|
|    534|504.71597871707513|    1|
|    600| 571.4890029599562|    1|
|    357|371.16993023131283|    1|
|    572| 584.4128141037396|    1|
|    416|  414.249300710591|    1|
|    660| 683.4953662060794|    1|
|    564| 605.9524993433787|    1|
+-------+------------------+-----+
only showing top 20 rows



In [0]:
# Save the trained Linear regression model
model_path = "./Internship_Sem-6_models/Ice_cream_Revenue_Prediction_model"
lr_model.save(model_path)

In [0]:
dbutils.fs.ls("dbfs:/Internship_Sem-6_models/Ice_cream_Revenue_Prediction_model")

[FileInfo(path='dbfs:/Internship_Sem-6_models/Ice_cream_Revenue_Prediction_model/data/', name='data/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/Internship_Sem-6_models/Ice_cream_Revenue_Prediction_model/metadata/', name='metadata/', size=0, modificationTime=0)]