In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor



In [3]:
train_set = pd.read_csv("/Users/manumitha/Desktop/UNT/ML/train.csv")



In [68]:
train_set.apply(lambda x: len(x.unique()))



Item_Identifier              1
Item_Weight                  1
Item_Fat_Content             2
Item_Visibility              2
Item_Type                    1
Item_MRP                     2
Outlet_Identifier            2
Outlet_Establishment_Year    2
Outlet_Size                  2
Outlet_Location_Type         2
Outlet_Type                  1
Item_Outlet_Sales            1
dtype: int64

In [9]:
# make a copy of the data
df = train_set.copy()
# format column names to be lower-case
new_col_names = [col.lower() for col in df.columns]
df.columns = new_col_names

print(df.columns)

Index(['item_identifier', 'item_weight', 'item_fat_content', 'item_visibility',
       'item_type', 'item_mrp', 'outlet_identifier',
       'outlet_establishment_year', 'outlet_size', 'outlet_location_type',
       'outlet_type', 'item_outlet_sales'],
      dtype='object')


In [10]:
df['item_weight'].fillna(df['item_weight'].mean(), inplace=True)
df.isna().sum()

item_identifier                 0
item_weight                     0
item_fat_content                0
item_visibility                 0
item_type                       0
item_mrp                        0
outlet_identifier               0
outlet_establishment_year       0
outlet_size                  2410
outlet_location_type            0
outlet_type                     0
item_outlet_sales               0
dtype: int64

In [11]:
outlet_size_mode_pt = df.pivot_table(values='outlet_size',
                                                      columns='outlet_type',
                                                      aggfunc=lambda x: x.mode())
outlet_size_mode_pt

outlet_type,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
outlet_size,Small,Small,Medium,Medium


In [12]:
missing_values = df['outlet_size'].isnull()
df.loc[missing_values, 'outlet_size'] = df.loc[missing_values, 'outlet_type'].apply(lambda x: outlet_size_mode_pt[x].outlet_size)
df.isna().sum()

item_identifier              0
item_weight                  0
item_fat_content             0
item_visibility              0
item_type                    0
item_mrp                     0
outlet_identifier            0
outlet_establishment_year    0
outlet_size                  0
outlet_location_type         0
outlet_type                  0
item_outlet_sales            0
dtype: int64

In [13]:
# print total number of 0s
print('Total of 0s before replace: ', sum(df['item_visibility'] == 0))

# replace 0s with the mean
df.loc[:,'item_visibility'].replace(to_replace=0,
                                            value=df['item_visibility'].mean(),
                                            inplace=True)

# print total number of 0s after the replace
print('Total of 0s after replace: ', sum(df['item_visibility'] == 0))

Total of 0s before replace:  526
Total of 0s after replace:  0


In [14]:
# print values
print(df['item_fat_content'].unique())

# replace the repetitive values
df['item_fat_content'].replace({'low fat':'Low Fat', 'LF':'Low Fat', 'reg':'Regular'}, inplace=True)
df['item_fat_content'].value_counts()

['Low Fat' 'Regular' 'low fat' 'LF' 'reg']


Low Fat    5517
Regular    3006
Name: item_fat_content, dtype: int64

In [15]:
# create a new feature using the two first letters of the item_identifier
df['item_category'] = df['item_identifier'].apply(lambda x: x[:2])
df['item_category'] = df['item_category'].replace({'FD':'Food', 'DR':'Drink', 'NC':'Non-Consumable'})
df['item_category'].value_counts()

Food              6125
Non-Consumable    1599
Drink              799
Name: item_category, dtype: int64

In [16]:
# adding a new item item_fat_content category for non-consumable items
df.loc[df['item_category'] == 'Non-Consumable', 'item_fat_content'] = 'No Edible'
df['item_fat_content'].value_counts()

Low Fat      3918
Regular      3006
No Edible    1599
Name: item_fat_content, dtype: int64

In [17]:
# the new feature tell us how old is the outlet
df['outlet_years'] = 2013 - df['outlet_establishment_year']
df['outlet_years']


0       14
1        4
2       14
3       15
4       26
        ..
8518    26
8519    11
8520     9
8521     4
8522    16
Name: outlet_years, Length: 8523, dtype: int64

In [18]:
# how the data looks like
df.head()

Unnamed: 0,item_identifier,item_weight,item_fat_content,item_visibility,item_type,item_mrp,outlet_identifier,outlet_establishment_year,outlet_size,outlet_location_type,outlet_type,item_outlet_sales,item_category,outlet_years
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,Food,14
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,Drink,4
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,Food,14
3,FDX07,19.2,Regular,0.066132,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38,Food,15
4,NCD19,8.93,No Edible,0.066132,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,Non-Consumable,26


In [19]:
ms=df.loc[df['item_identifier'] == 'FDW58'].mean()

print(ms)

item_weight                    20.750000
item_visibility                 0.007549
item_mrp                      105.662200
outlet_establishment_year    1995.500000
item_outlet_sales            1693.795200
outlet_years                   17.500000
dtype: float64


  ms=df.loc[df['item_identifier'] == 'FDW58'].mean()


In [20]:
# applying label encoding to some features
encoder = LabelEncoder()

cols_to_encode = ['item_identifier', 'item_type', 'outlet_identifier']

for col in cols_to_encode:
    df[col] = encoder.fit_transform(df[col])

# applying one-hot encoding to some features
df = pd.get_dummies(df, columns=['item_fat_content', 'outlet_size', 'outlet_location_type', 'outlet_type', 'item_category'])

# how the data looks like
df.head()

Unnamed: 0,item_identifier,item_weight,item_visibility,item_type,item_mrp,outlet_identifier,outlet_establishment_year,item_outlet_sales,outlet_years,item_fat_content_Low Fat,...,outlet_location_type_Tier 1,outlet_location_type_Tier 2,outlet_location_type_Tier 3,outlet_type_Grocery Store,outlet_type_Supermarket Type1,outlet_type_Supermarket Type2,outlet_type_Supermarket Type3,item_category_Drink,item_category_Food,item_category_Non-Consumable
0,156,9.3,0.016047,4,249.8092,9,1999,3735.138,14,1,...,1,0,0,0,1,0,0,0,1,0
1,8,5.92,0.019278,14,48.2692,3,2009,443.4228,4,0,...,0,0,1,0,0,1,0,1,0,0
2,662,17.5,0.01676,10,141.618,9,1999,2097.27,14,1,...,1,0,0,0,1,0,0,0,1,0
3,1121,19.2,0.066132,6,182.095,0,1998,732.38,15,0,...,0,0,1,1,0,0,0,0,1,0
4,1297,8.93,0.066132,9,53.8614,1,1987,994.7052,26,0,...,0,0,1,0,1,0,0,0,0,1


In [21]:
X = df.drop(columns=['outlet_establishment_year', 'item_outlet_sales'])
y = df['item_outlet_sales']
X

Unnamed: 0,item_identifier,item_weight,item_visibility,item_type,item_mrp,outlet_identifier,outlet_years,item_fat_content_Low Fat,item_fat_content_No Edible,item_fat_content_Regular,...,outlet_location_type_Tier 1,outlet_location_type_Tier 2,outlet_location_type_Tier 3,outlet_type_Grocery Store,outlet_type_Supermarket Type1,outlet_type_Supermarket Type2,outlet_type_Supermarket Type3,item_category_Drink,item_category_Food,item_category_Non-Consumable
0,156,9.300,0.016047,4,249.8092,9,14,1,0,0,...,1,0,0,0,1,0,0,0,1,0
1,8,5.920,0.019278,14,48.2692,3,4,0,0,1,...,0,0,1,0,0,1,0,1,0,0
2,662,17.500,0.016760,10,141.6180,9,14,1,0,0,...,1,0,0,0,1,0,0,0,1,0
3,1121,19.200,0.066132,6,182.0950,0,15,0,0,1,...,0,0,1,1,0,0,0,0,1,0
4,1297,8.930,0.066132,9,53.8614,1,26,0,1,0,...,0,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,370,6.865,0.056783,13,214.5218,1,26,1,0,0,...,0,0,1,0,1,0,0,0,1,0
8519,897,8.380,0.046982,0,108.1570,7,11,0,0,1,...,0,1,0,0,1,0,0,0,1,0
8520,1357,10.600,0.035186,8,85.1224,6,9,0,1,0,...,0,1,0,0,1,0,0,0,0,1
8521,681,7.210,0.145221,13,103.1332,3,4,0,0,1,...,0,0,1,0,0,1,0,0,1,0


In [23]:
y

0       3735.1380
1        443.4228
2       2097.2700
3        732.3800
4        994.7052
          ...    
8518    2778.3834
8519     549.2850
8520    1193.1136
8521    1845.5976
8522     765.6700
Name: item_outlet_sales, Length: 8523, dtype: float64

In [64]:
# linear regression (Baseline model)
# Evaluation metrices used are RMSE, R Square
X_train, X_valid, y_train, y_valid= train_test_split(X,y,test_size=0.3, random_state=42) 
model1=LinearRegression()
model1.fit(X_train, y_train)
y_pred=model1.predict(X_valid) 
rmse=np.sqrt(mean_squared_error(y_pred,y_valid)) 
r2=r2_score(y_pred,y_valid)
print("RMSE Score : ", rmse)
print("R2 Square score : ", r2)

RMSE Score :  1097.7772119493848
R2 Square score :  0.300496915058017


In [43]:
# ridge regression
# Evaluation metrices used are RMSE, R Square
X_train, X_valid, y_train, y_valid= train_test_split(X,y,test_size=0.3, random_state=42) 
model2=Ridge()
model2.fit(X_train, y_train)
y_pred=model2.predict(X_valid) 
rmse=np.sqrt(mean_squared_error(y_pred,y_valid)) 
r2=r2_score(y_pred,y_valid)
print("RMSE Score : ", rmse)
print("R2 Square score : ", r2)

RMSE Score :  1098.7819683577766
R2 Square score :  0.29775180949467595


In [44]:
# Tuning Ridge regression model
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

alphas = [0.01, 0.1, 1, 10, 100]


grid_search = GridSearchCV(model2, param_grid={'alpha': alphas}, cv=5)

grid_search.fit(X_train, y_train)

best_alpha = grid_search.best_params_['alpha']

model_2_tuned = Ridge(alpha=best_alpha)
model_2_tuned.fit(X_train, y_train)

y_pred = model_2_tuned.predict(X_valid)

rmse = np.sqrt(mean_squared_error(y_pred,y_valid)) 
r2 = r2_score(y_pred,y_valid)

print("Best Alpha: ", best_alpha)
print("RMSE Score : ", rmse)
print("R2 Square score : ", r2)

Best Alpha:  0.1
RMSE Score :  1097.975093976352
R2 Square score :  0.2998977574953239


In [46]:
# Lasso regression
# Evaluation metrices used are RMSE, R Square
X_train, X_valid, y_train, y_valid= train_test_split(X,y,test_size=0.3, random_state=42) 
model3=Lasso()
model3.fit(X_train, y_train)
y_pred=model3.predict(X_valid) 
rmse=np.sqrt(mean_squared_error(y_pred,y_valid)) 
r2=r2_score(y_pred,y_valid)
print("RMSE Score : ", rmse)
print("R2 Square score : ", r2)

RMSE Score :  1098.7577415665482
R2 Square score :  0.294980684812694


  model = cd_fast.enet_coordinate_descent(


In [69]:
# Tuning Lasso regression model

from sklearn.linear_model import LassoCV

alphas = [0.1, 1, 10] # different alpha values to test
model3_tuned = LassoCV(alphas=alphas, cv=5) # perform 5-fold cross-validation
model3_tuned.fit(X_train, y_train)
best_alpha = model3_tuned.alpha_
y_pred = model3_tuned.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_pred, y_valid))
r2 = r2_score(y_pred, y_valid)
print("Best alpha: ", best_alpha)
print("RMSE Score : ", rmse)
print("R2 Square score : ", r2)

Best alpha:  0.1
RMSE Score :  1098.745166977685
R2 Square score :  0.29834039429431924


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


In [47]:
# Tuning Lasso regression model

from sklearn.linear_model import LassoCV

alphas = [0.1, 1, 10] # different alpha values to test
model3_tuned = LassoCV(alphas=alphas, cv=5) # perform 5-fold cross-validation
model3_tuned.fit(X_train, y_train)
best_alpha = model3_tuned.alpha_
y_pred = model3_tuned.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_pred, y_valid))
r2 = r2_score(y_pred, y_valid)
print("Best alpha: ", best_alpha)
print("RMSE Score : ", rmse)
print("R2 Square score : ", r2)

Best alpha:  0.1
RMSE Score :  1098.745166977685
R2 Square score :  0.29834039429431924


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


In [49]:
# Decision tree model and tuning od decision tree model

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV


model5 = DecisionTreeRegressor(random_state=42)

model5.fit(X_train, y_train)

y_pred = model5.predict(X_valid)

rmse = np.sqrt(mean_squared_error(y_pred, y_valid))
r2 = r2_score(y_pred, y_valid)
print("RMSE Score: ", rmse)
print("R2 Square Score: ", r2)

# Tuning decision tree model

param_grid = {
    'max_depth': [3, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(model5, param_grid, scoring='neg_mean_squared_error', cv=5)

grid_search.fit(X_train, y_train)

print("Best Parameters: ", grid_search.best_params_)

model5_tuned = DecisionTreeRegressor(**grid_search.best_params_, random_state=42)
model5_tuned.fit(X_train, y_train)

y_pred = model5_tuned.predict(X_valid)

rmse = np.sqrt(mean_squared_error(y_pred, y_valid))
r2 = r2_score(y_pred, y_valid)
print("RMSE Score (best model): ", rmse)
print("R2 Square Score (best model): ", r2)

RMSE Score:  1497.8262278494126
R2 Square Score:  0.2936163739335719
Best Parameters:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
RMSE Score (best model):  1363.1693067945448
R2 Square Score (best model):  -0.04664528647482058


In [28]:
model_scores

Unnamed: 0,model,rmse,r2_score
0,Linear Regression,1067.724237,0.580556
1,Ridge,1067.609102,0.580647
2,Lasso,1068.860697,0.579663
3,Decision Tree,1502.576823,0.16933


In [40]:
# loading the test data for Item_Identifier == FDW58
test_data=pd.read_csv('/Users/manumitha/Desktop/UNT/ML/test.csv')
test_data=test_data.loc[test_data['Item_Identifier'] == 'FDW58']
test_data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
253,FDW58,20.75,Low Fat,0.007596,Snack Foods,104.4622,OUT017,2007,,Tier 2,Supermarket Type1
1625,FDW58,20.75,Low Fat,0.007584,Snack Foods,107.0622,OUT018,2009,Medium,Tier 3,Supermarket Type2
1892,FDW58,20.75,Low Fat,0.0,Snack Foods,105.9622,OUT046,1997,Small,Tier 1,Supermarket Type1
2166,FDW58,20.75,Low Fat,0.007568,Snack Foods,105.8622,OUT045,2002,,Tier 2,Supermarket Type1
2434,FDW58,,Low Fat,0.007517,Snack Foods,107.6622,OUT027,1985,Medium,Tier 3,Supermarket Type3
2830,FDW58,,Low Fat,0.013224,Snack Foods,106.4622,OUT019,1985,Small,Tier 1,Grocery Store


In [41]:
# Feature engineering by filling null values, missing values
new_col_names = [col.lower() for col in test_data.columns]
test_data.columns = new_col_names

results = test_data[['item_identifier', 'outlet_identifier']]

test_data['item_weight'].fillna(test_data['item_weight'].mean(), inplace=True)

missing_values = test_data['outlet_size'].isnull()
test_data.loc[missing_values, 'outlet_size'] = test_data.loc[missing_values, 'outlet_type'].apply(lambda x: outlet_size_mode_pt[x].outlet_size)

test_data.loc[:,'item_visibility'].replace(to_replace=0,
                                            value=test_data['item_visibility'].mean(),
                                            inplace=True)

test_data['item_category'] = test_data['item_identifier'].apply(lambda x: x[:2])
test_data['item_category'] = test_data['item_category'].replace({'FD':'Food', 'DR':'Drink', 'NC':'Non-Consumable'})

test_data['outlet_years'] = 2013 - test_data['outlet_establishment_year']

# applying one-hot encoding to some features
test_data = pd.get_dummies(test_data, columns=['item_fat_content', 'outlet_size', 'outlet_location_type', 'outlet_type', 'item_category'])

test_data['item_identifier'] = test_data['item_identifier'].replace({'FDW58': 1114})
test_data['item_type'] = test_data['item_type'].replace({'Snack Foods': 13})
test_data['outlet_identifier'] = test_data['outlet_identifier'].replace({'OUT049': 9,
                                                                         'OUT017': 2,
                                                                         'OUT018': 3,
                                                                         'OUT046': 8,
                                                                         'OUT045': 7,
                                                                         'OUT027': 5,
                                                                         'OUT019': 4})

# adding missing features
test_data['item_fat_content_No Edible'] = 0
test_data['item_fat_content_Regular'] = 0
test_data['outlet_size_High'] = 0
test_data['item_category_Drink'] = 0
test_data['item_category_Non-Consumable'] = 0

test_data = test_data.drop(columns=['outlet_establishment_year'])

# re-ordering columns same as training set
test_data = test_data[['item_identifier',
                       'item_weight',
                       'item_visibility',
                       'item_type',
                       'item_mrp',
                       'outlet_identifier',
                       'outlet_years',
                       'item_fat_content_Low Fat',
                       'item_fat_content_No Edible',
                       'item_fat_content_Regular',
                       'outlet_size_High',
                       'outlet_size_Medium',
                       'outlet_size_Small',
                       'outlet_location_type_Tier 1',
                       'outlet_location_type_Tier 2',
                       'outlet_location_type_Tier 3',
                       'outlet_type_Grocery Store',
                       'outlet_type_Supermarket Type1',
                       'outlet_type_Supermarket Type2',
                       'outlet_type_Supermarket Type3',
                       'item_category_Drink',
                       'item_category_Food',
                       'item_category_Non-Consumable']]



In [50]:
# predicting the testing data using Tuned Decision tree as it has given better results for training set
y_hat = model5_tuned.predict(test_data)

results['prediction'] = y_hat
results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['prediction'] = y_hat


Unnamed: 0,item_identifier,outlet_identifier,prediction
0,FDW58,OUT049,1870.082395
253,FDW58,OUT017,2012.103083
1625,FDW58,OUT018,1207.005773
1892,FDW58,OUT046,1389.565447
2166,FDW58,OUT045,1008.770225
2434,FDW58,OUT027,1661.659253
2830,FDW58,OUT019,131.8284


In [None]:
# The above results are the predicted sales for item identifier - 'FDW58' at various outlets.