<a href="https://colab.research.google.com/github/JQ100/project1/blob/main/project1final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
# import required packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn import set_config

path = '/content/sales_predictions.csv'

df = pd.read_csv(path)
df.head() 

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [41]:
# Drop all object (esp. string) type columns
df = df.select_dtypes(exclude=['object'])

In [42]:
# Drop the other bad columns
df = df.drop(['Outlet_Establishment_Year'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Item_Weight        7060 non-null   float64
 1   Item_Visibility    8523 non-null   float64
 2   Item_MRP           8523 non-null   float64
 3   Item_Outlet_Sales  8523 non-null   float64
dtypes: float64(4)
memory usage: 266.5 KB


In [43]:
# Fix all NaN Values
df['Item_Weight'].fillna(-1, inplace=True)

In [44]:
# Define X,y
target = 'Item_Outlet_Sales'
X = df.drop(columns=target)
y = df[target]

## Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)
X_train.shape

(6392, 3)

In [45]:
# Check missing values after train test split
print(df.isna().sum().sum(), 'missing values')

0 missing values


In [46]:
df[df.isna().any(axis=1)].shape # 3873 rows missing at least one value

(0, 4)

In [47]:
#instantiate the selectors to for numeric and categorical data types
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')
#select the numeric columns of each type
num_columns = num_selector(X_train)
cat_columns = cat_selector(X_train)
#check our lists
print('numeric columns are', num_columns)
print('categorical columns are', cat_columns)

numeric columns are ['Item_Weight', 'Item_Visibility', 'Item_MRP']
categorical columns are []


In [48]:
X_train.isna().any() # find all columns that are missing data

Item_Weight        False
Item_Visibility    False
Item_MRP           False
dtype: bool

In [49]:
# Checking Datatypes 

display(X_train.info())
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Item_Weight      6392 non-null   float64
 1   Item_Visibility  6392 non-null   float64
 2   Item_MRP         6392 non-null   float64
dtypes: float64(3)
memory usage: 199.8 KB


None

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP
4776,16.35,0.029565,256.4646
7510,15.25,0.0,179.766
5828,12.35,0.158716,157.2946
5327,7.975,0.014628,82.325
4810,19.35,0.016645,120.9098


In [50]:
#instantiate the selectors to for numeric and categorical data types
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

#instantiate SimpleImputers with most_frequent and median strategies
freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')

# Build the transformer
# create tuples of (imputer, selector) for each datatype
num_tuple = (mean_imputer, num_selector)
cat_tuple = (freq_imputer, cat_selector)
# instantiate ColumnTransformer
col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder='passthrough')

# Impute missing values with column transformer
# fit ColumnTransformer on the training data
col_transformer.fit(X_train)
# transform both the training and testing data (this will output a NumPy array)
X_train_imputed = col_transformer.transform(X_train)
X_test_imputed = col_transformer.transform(X_test)
# change the result back to a dataframe
X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns)
X_train_imputed.isna().any()

Item_Weight        False
Item_Visibility    False
Item_MRP           False
dtype: bool

In [51]:
## Instantiate & fit One Hot Encoder the nominal/categorical features
ohe_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

X_train_cat = X_train[cat_columns].copy()
X_test_cat = X_test[cat_columns].copy()

ohe_encoder.fit(X_train_cat)

## Transform the categorical data
X_train_ohe = ohe_encoder.transform(X_train_cat)
X_test_ohe = ohe_encoder.transform(X_test_cat)
X_train_ohe

array([], shape=(6392, 0), dtype=float64)

In [52]:
## To get the column names use one of the following (depending if using sklearn >1.0)

ohe_col_names = ohe_encoder.get_feature_names_out(cat_columns) # new sklearn v1 way
ohe_col_names

array([], dtype=object)

In [53]:
## Make a DataFrame with the correct column names 

# Train Data
X_train_ohe_df = pd.DataFrame(X_train_ohe, columns = ohe_col_names)
X_train_ohe_df.reset_index(drop=True, inplace=True)

# Test Data
X_test_ohe_df = pd.DataFrame(X_test_ohe, columns = ohe_col_names)
X_test_ohe_df.reset_index(drop=True, inplace=True)

X_train_ohe_df

0
1
2
3
4
...
6387
6388
6389
6390
6391


In [54]:
# now focus on scaling the numeric data

## Make a column selector for categories & test it /sanity check
num_selector = make_column_selector(dtype_include='number')

num_cols = num_selector(X_train)
num_cols

['Item_Weight', 'Item_Visibility', 'Item_MRP']

In [55]:
## Scale the data with StandardScaler
scaler = StandardScaler()

X_train_num = X_train[num_cols]
X_test_num = X_test[num_cols]

scaler.fit(X_train_num)

## Transform the data.
num_train_scaled = scaler.transform(X_train_num)
num_test_scaled = scaler.transform(X_test_num)

num_train_scaled

array([[ 0.86818714, -0.71277507,  1.82810922],
       [ 0.70503837, -1.29105225,  0.60336888],
       [ 0.27491887,  1.81331864,  0.24454056],
       ...,
       [ 1.05358348, -0.92052713,  1.52302674],
       [ 1.46145542, -0.2277552 , -0.38377708],
       [ 0.86818714, -0.95867683, -0.73836105]])

In [56]:
# Change numeric data into a dataframe.

# Train Data
num_train_scaled = pd.DataFrame(num_train_scaled, columns=num_cols)
num_train_scaled.reset_index(drop=True, inplace=True)

# Test Data
num_test_scaled = pd.DataFrame(num_test_scaled, columns=num_cols)
num_test_scaled.reset_index(drop=True, inplace=True)

In [57]:
## Concat the numeric and ohe data
X_train_df = pd.concat([num_train_scaled, X_train_ohe_df], axis=1)
X_train_df

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP
0,0.868187,-0.712775,1.828109
1,0.705038,-1.291052,0.603369
2,0.274919,1.813319,0.244541
3,-0.373968,-1.004931,-0.952591
4,1.313138,-0.965484,-0.336460
...,...,...,...
6387,-0.163358,4.309657,-0.044657
6388,0.756949,1.008625,-1.058907
6389,1.053583,-0.920527,1.523027
6390,1.461455,-0.227755,-0.383777


In [58]:
# Q1 - Linear Regression Model
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

from sklearn.pipeline import make_pipeline
reg_pipe = make_pipeline(scaler, reg)

reg_pipe.fit(X_train,y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

In [59]:
df.duplicated().sum()

0

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Item_Weight        8523 non-null   float64
 1   Item_Visibility    8523 non-null   float64
 2   Item_MRP           8523 non-null   float64
 3   Item_Outlet_Sales  8523 non-null   float64
dtypes: float64(4)
memory usage: 266.5 KB


In [61]:
# Create model predictions
train_pred = reg_pipe.predict(X_train)
test_pred = reg_pipe.predict(X_test)

In [62]:
# Calculating R2

train_r2 = np.corrcoef(y_train, train_pred)[0][1]**2
test_r2 = np.corrcoef(y_test, test_pred)[0][1]**2

print(f'Model Training R2: {train_r2}')
print(f'Model Testing R2: {test_r2}')

Model Training R2: 0.34585732096900645
Model Testing R2: 0.33907376491112157


In [63]:
# Calculating RMSE

train_RMSE = np.sqrt(np.mean(np.abs(train_pred - y_train)**2))
test_RMSE = np.sqrt(np.mean(np.abs(test_pred - y_test)**2))

print(f'Model Training RMSE: {train_RMSE}')
print(f'Model Testing RMSE: {test_RMSE}')

Model Training RMSE: 1391.3684928991886
Model Testing RMSE: 1351.5959185664144


In [64]:
# Q2 - Building Simple Regression Tree Model

from sklearn.tree import DecisionTreeRegressor
dec_tree = DecisionTreeRegressor(random_state = 42)
dec_tree.fit(X_train, y_train)
train_preds = dec_tree.predict(X_train)
test_preds = dec_tree.predict(X_test)

In [65]:
# Evaluating performance based on R2

train_score = dec_tree.score(X_train, y_train)
test_score = dec_tree.score(X_test, y_test)
print(train_score)
print(test_score)

1.0
-0.31576006093513964


In [66]:
# Evaluating performance based on RMSE

train_RMSE = np.sqrt(np.mean(np.abs(train_preds - y_train)**2))
test_RMSE = np.sqrt(np.mean(np.abs(test_preds - y_test)**2))

print(f'Model Training RMSE: {train_RMSE}')
print(f'Model Testing RMSE: {test_RMSE}')

Model Training RMSE: 0.0
Model Testing RMSE: 1905.2958101888405


Q3 - I recommend the linear regression model because the evaluations betwen the training and testing data are close to each other. From the results of that model, we can see that our model can account for about 34% of the variation in y_test using the features in X_test. The results for R2 are on a consistent scale. Also, from its RMSE data, we can see larger errors getting punished.

Check README for Q4