<a href="https://colab.research.google.com/github/JosephHobbs9292/Project_One/blob/main/Project_1_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
## Importing Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.tree import plot_tree

from sklearn import set_config
set_config(display='diagram')

In [2]:
## Mounting Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
## Load in dataset
df_sales = pd.read_csv("/content/sales_predictions.csv")
df_sales.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
## Checking duplicates
df_sales.duplicated().sum()

0

In [5]:
## Displaying summary statistics of numeric features

df_sales.describe().round(2)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.86,0.07,140.99,1997.83,2181.29
std,4.64,0.05,62.28,8.37,1706.5
min,4.56,0.0,31.29,1985.0,33.29
25%,8.77,0.03,93.83,1987.0,834.25
50%,12.6,0.05,143.01,1999.0,1794.33
75%,16.85,0.09,185.64,2004.0,3101.3
max,21.35,0.33,266.89,2009.0,13086.96


---

Finding and Replacing Bad Values

---

In [6]:
## Checking for bad values
df_sales["Item_Fat_Content"].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [7]:
df_sales.replace(["LF", "reg", "low fat"], ["Low Fat", "Regular", "Low Fat"], inplace = True)

In [8]:
df_sales["Outlet_Size"].value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [9]:
df_sales["Outlet_Location_Type"].value_counts()

Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: Outlet_Location_Type, dtype: int64

In [10]:
df_sales["Outlet_Type"].value_counts()

Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64

In [11]:
df_sales["Item_Type"].value_counts()

Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64

In [12]:
## Spliting the data and running the train test split
X = df_sales[["Item_Weight", "Outlet_Establishment_Year", "Outlet_Size", "Outlet_Location_Type", "Outlet_Type", "Item_Fat_Content", "Item_Visibility", "Item_Type", "Item_MRP"]].copy()
y = df_sales["Item_Outlet_Sales"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [13]:
## Creating the ordinal lists in order of least to most
size_labels = ["Small", "Medium", "High"]
outlet_location_type_labels = ["Tier 1", "Tier 2", "Tier 3"]
ordered_labels = [size_labels, outlet_location_type_labels]

ordinal = OrdinalEncoder(categories = ordered_labels)

In [14]:
## Organizing columns
ordinal_columns = ["Outlet_Size", "Outlet_Location_Type"]
numerical_columns = ["Item_Weight", "Outlet_Establishment_Year", "Item_Visibility", "Item_MRP"]
nominal_columns = ["Item_Fat_Content", "Outlet_Type", "Item_Fat_Content", "Item_Type"]

In [15]:
## Creating transformers 
scaler = StandardScaler()
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')
missing_imputer = SimpleImputer(strategy='constant', fill_value='missing')

In [16]:
## Creating pipelines
num_pipeline = make_pipeline(mean_imputer, scaler)
ord_pipeline = make_pipeline(freq_imputer, ordinal)
nom_pipelne = make_pipeline(missing_imputer, ohe)

In [17]:
## Creating tuples 
ordinal_tuple = (ord_pipeline, ordinal_columns)
numerical_tuple = (num_pipeline, numerical_columns)
nominal_tuple = (nom_pipelne, nominal_columns)

In [18]:
## Column transformer
preprocessor = make_column_transformer(ordinal_tuple, numerical_tuple, nominal_tuple, remainder='drop')
preprocessor.fit(X_train)

In [19]:
## Linear regression model
reg = LinearRegression()
regression_pipeline = make_pipeline(preprocessor, reg)
regression_pipeline.fit(X_train, y_train)

In [20]:
## Predictions 
regression_train = regression_pipeline.predict(X_train)
regression_test = regression_pipeline.predict(X_test)

In [21]:
## Finding the RMSE
train_MSE = mean_squared_error(y_train, regression_train) 
test_MSE = mean_squared_error(y_test, regression_test)

train_RMSE = np.sqrt(train_MSE) 
test_RMSE = np.sqrt(test_MSE)
print(f'Model Training RMSE is: {train_RMSE}')
print(f'Model Testing RMSE is: {test_RMSE}')


Model Training RMSE is: 1140.6036644547933
Model Testing RMSE is: 1094.1948182670665


In [22]:
## Finding the r^2 value
train_r2 = r2_score(y_train, regression_train)
test_r2 = r2_score(y_test, regression_test)
print(f'Model Training r2 is: {train_r2}')
print(f'Model Testing r2 is: {test_r2}')

Model Training r2 is: 0.5604000028188199
Model Testing r2 is: 0.5660485116663269


---
## Liner Regression Results

The model had extremely low variance, but unfortunately didn’t have low enough bias to be effective for making predictions. The results are likely overfitted and the model needs to introduce more variation in order to improve bias. 

---

In [33]:
## Creating a decision tree model
dec_tree = DecisionTreeRegressor(max_depth = None, random_state = 42)
dec_tree_pipe = make_pipeline(preprocessor, dec_tree)
dec_tree_pipe.fit(X_train, y_train)

In [34]:
max_depth = dec_tree_pipe['decisiontreeregressor'].get_depth()

In [35]:
depths = range(1, max_depth+1)
scores = pd.DataFrame(columns=['Train Score', 'Test Score'], index=depths)

for depth in depths:

  dec_tree = DecisionTreeRegressor(max_depth=depth, random_state = 42)
  dec_tree_pipe = make_pipeline(preprocessor, dec_tree)
  dec_tree_pipe.fit(X_train, y_train)

  train_pred = dec_tree_pipe.predict(X_train)
  test_pred = dec_tree_pipe.predict(X_test)

  train_r2score = r2_score(y_train, train_pred)
  test_r2score = r2_score(y_test, test_pred)

  scores.loc[depth, 'Train Score'] = train_r2score
  scores.loc[depth, 'Test Score'] = test_r2score

In [36]:
## Displaying 5 best scores
sorted_scores = scores.sort_values(by='Test Score', ascending=False)

sorted_scores.head()

Unnamed: 0,Train Score,Test Score
5,0.60394,0.59471
4,0.582625,0.584005
6,0.615072,0.582356
7,0.626454,0.576544
8,0.642714,0.560843


In [41]:
## Creating a tuned decision tree model
dec_tree_tuned = DecisionTreeRegressor(max_depth = 5, random_state = 42)
dec_tree_pipe_tuned = make_pipeline(preprocessor, dec_tree_tuned)
dec_tree_pipe_tuned.fit(X_train, y_train)

In [42]:
## Predictions for the tuned decision tree
dec_tree_train = dec_tree_pipe_tuned.predict(X_train) 
dec_tree_test = dec_tree_pipe_tuned.predict(X_test)


In [43]:
## Calculating RMSE for the decision tree
tree_train_MSE = mean_squared_error(y_train, dec_tree_train) 
tree_test_MSE = mean_squared_error(y_test, dec_tree_test)

tree_train_RMSE = np.sqrt(tree_train_MSE) 
tree_test_RMSE = np.sqrt(tree_test_MSE)
print(f'Model Training RMSE is: {tree_train_MSE}')
print(f'Model Testing RMSE is: {tree_test_MSE}')

Model Training RMSE is: 1172122.7729098853
Model Testing RMSE is: 1118185.973077762


In [44]:
tree_train_r2 = r2_score(y_train, dec_tree_train)
tree_test_r2 = r2_score(y_test, dec_tree_test)
print(f'Model Training r2 is: {tree_train_r2}')
print(f'Model Testing r2 is: {tree_test_r2}')

Model Training r2 is: 0.6039397477322956
Model Testing r2 is: 0.5947099753159972


---
## Regression Tree Results

The model had extremely low variance, but had slightly better bias than the regression analysis model. At 60% the perdictive power of the model is still relatively low, but can be useful. 

---

--- 

## Model Selection 

Overall, the Regression Tree is the model I would reccomend. It's bias is 5% better while sacrificing only a increase in the amount of variance. 

---