In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler

In [3]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

df_train['Source'] = 'train'
df_test['Source'] = 'test'
df_test['Item_Outlet_Sales'] = 0

In [4]:
df = pd.concat([df_train,df_test],axis=0,ignore_index=True)
print(df_train.shape)
print(df_test.shape)
print(df.shape)
df.head(2)

(8523, 13)
(5681, 13)
(14204, 13)


Unnamed: 0,Item_Fat_Content,Item_Identifier,Item_MRP,Item_Outlet_Sales,Item_Type,Item_Visibility,Item_Weight,Outlet_Establishment_Year,Outlet_Identifier,Outlet_Location_Type,Outlet_Size,Outlet_Type,Source
0,Low Fat,FDA15,249.8092,3735.138,Dairy,0.016047,9.3,1999,OUT049,Tier 1,Medium,Supermarket Type1,train
1,Regular,DRC01,48.2692,443.4228,Soft Drinks,0.019278,5.92,2009,OUT018,Tier 3,Medium,Supermarket Type2,train


##  Part 1: Data Cleansing and Preparation

### 1. Handling casing errors in Fat Content feature

In [5]:
def prepare_item_fat_content(df):
    df['Item_Fat_Content'] = df['Item_Fat_Content'].astype(str)
    df['Item_Fat_Content'] = np.where(df.Item_Fat_Content == 'low fat','Low Fat',df.Item_Fat_Content)
    df['Item_Fat_Content'] = np.where(df.Item_Fat_Content == 'LF','Low Fat',df.Item_Fat_Content)
    df['Item_Fat_Content'] = np.where(df.Item_Fat_Content == 'reg','Regular',df.Item_Fat_Content)
    return df

### 2. Handling missing values in Item Weight feature

In [None]:
def prepare_item_weight(df):
    missing_weigths = df[df.Item_Weight.isnull()]
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Dairy',13.4,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Soft Drinks',11.8,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Meat',12.8,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Fruits and Vegetables',13.10,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Household',10,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Baking Goods',12.2,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Snack Foods',13,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Breakfast',9,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Health and Hygiene',13.14,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Hard Drinks',10.5,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Canned',12.3,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Starchy Foods',13.1,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Others',13.85,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Seafood',11.55,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Frozen Foods',12.86,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Breads',11.34,missing_weigths['Item_Weight'])
    df.loc[missing_weigths.index,'Item_Weight'] = missing_weigths['Item_Weight']
    return df

In [6]:
#Another method of imputing
def fill_item_weights(df):
    item_avg_weight = df.pivot_table(values='Item_Weight', index='Item_Identifier')
    miss_bool = df['Item_Weight'].isnull()
    df.loc[miss_bool,'Item_Weight'] = df.loc[miss_bool,'Item_Identifier'].apply(lambda x: item_avg_weight.loc[x])
    return df

### 3. Finding Outlet Age

In [7]:
def find_outlet_age(df):
    df['Outlet_Age'] = 2013 - df['Outlet_Establishment_Year']
    df.drop('Outlet_Establishment_Year',axis=1,inplace=True)
    return df

### 4. Handling Missing Values in Outlet Size

In [8]:
def prepare_outlet_size(df):
    df['Outlet_Size'] = np.where(df.Outlet_Type == 'Grocery Store','Small',df.Outlet_Size)
    df['Outlet_Size'] = np.where((df.Outlet_Type == 'Supermarket Type1') & (df.Outlet_Location_Type == 'Tier 2'),'Small',df.Outlet_Size)
    return df

### 5. Removing Unique Identifiers

In [9]:
def drop_ids(df,ids):
    df = df.drop(ids,axis=1)
    return df

### 6. Converting Dummy Variables

In [10]:
def encode_variables(df,columns):
    df_dummies = pd.get_dummies(df[columns], drop_first=True)
    df = df.drop(columns,axis=1)
    return pd.concat([df,df_dummies],axis=1)

### 7. Handling Skewness in Target Variable

In [11]:
def handle_skewness(df):
    df['Item_Outlet_Sales'] = np.sqrt(df_train['Item_Outlet_Sales'])
    return df

In [12]:
#Handling Item visibilty
def handle_visibilty(df):
    avg_visibility = df.pivot_table(values='Item_Visibility',index='Item_Identifier')
    miss_bool = (df['Item_Visibility'] == 0)
    df.loc[miss_bool,'Item_Visibility'] = df.loc[miss_bool,'Item_Identifier'].apply(lambda x: avg_visibility.loc[x])
    
    return df

### 8. Scaling Features

In [28]:
def scale_data(df,scaler):
    df_cols = df.columns
    for f in df.columns if f in ['Item_MRP','Item_Outlet_Sales','Item_Visibility','Item_Weight']
    numeric_features = [f for f in df.columns if f in ['Item_MRP','Item_Outlet_Sales','Item_Visibility','Item_Weight']]
    print(numeric_features)
    numeric_df = df.iloc[:,numeric_features]
    #print(numeric_df.head(2))
    column_transform = numeric_df.columns
    df.loc[:, column_transform] = scaler.fit_transform(df.loc[:, column_transform])
    
    return df

In [14]:
df = prepare_item_fat_content(df)

In [15]:
df = fill_item_weights(df)
df.head()

Unnamed: 0,Item_Fat_Content,Item_Identifier,Item_MRP,Item_Outlet_Sales,Item_Type,Item_Visibility,Item_Weight,Outlet_Establishment_Year,Outlet_Identifier,Outlet_Location_Type,Outlet_Size,Outlet_Type,Source
0,Low Fat,FDA15,249.8092,3735.138,Dairy,0.016047,9.3,1999,OUT049,Tier 1,Medium,Supermarket Type1,train
1,Regular,DRC01,48.2692,443.4228,Soft Drinks,0.019278,5.92,2009,OUT018,Tier 3,Medium,Supermarket Type2,train
2,Low Fat,FDN15,141.618,2097.27,Meat,0.01676,17.5,1999,OUT049,Tier 1,Medium,Supermarket Type1,train
3,Regular,FDX07,182.095,732.38,Fruits and Vegetables,0.0,19.2,1998,OUT010,Tier 3,,Grocery Store,train
4,Low Fat,NCD19,53.8614,994.7052,Household,0.0,8.93,1987,OUT013,Tier 3,High,Supermarket Type1,train


In [16]:
df = find_outlet_age(df)

In [17]:
df = prepare_outlet_size(df)

In [18]:
#df = handle_skewness(df)
df = handle_visibilty(df)

In [20]:
df = encode_variables(df,['Item_Fat_Content','Item_Type','Outlet_Size','Outlet_Location_Type','Outlet_Type'])
df.head()

Unnamed: 0,Item_Identifier,Item_MRP,Item_Outlet_Sales,Item_Visibility,Item_Weight,Outlet_Identifier,Source,Outlet_Age,Item_Fat_Content_Regular,Item_Type_Breads,...,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,FDA15,249.8092,3735.138,0.016047,9.3,OUT049,train,14,0,0,...,0,0,0,1,0,0,0,1,0,0
1,DRC01,48.2692,443.4228,0.019278,5.92,OUT018,train,4,1,0,...,0,1,0,1,0,0,1,0,1,0
2,FDN15,141.618,2097.27,0.01676,17.5,OUT049,train,14,0,0,...,0,0,0,1,0,0,0,1,0,0
3,FDX07,182.095,732.38,0.017834,19.2,OUT010,train,15,1,0,...,0,0,0,0,1,0,1,0,0,0
4,NCD19,53.8614,994.7052,0.00978,8.93,OUT013,train,26,0,0,...,0,0,0,0,0,0,1,1,0,0


In [29]:
df = scale_data(df,StandardScaler())

['Item_MRP', 'Item_Outlet_Sales', 'Item_Visibility', 'Item_Weight']


TypeError: cannot perform reduce with flexible type

In [24]:
df = drop_ids(df,['Item_Identifier','Outlet_Identifier'])

In [25]:
df.head()

Unnamed: 0,Item_MRP,Item_Outlet_Sales,Item_Visibility,Item_Weight,Source,Outlet_Age,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,249.8092,3735.138,0.016047,9.3,train,14,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,48.2692,443.4228,0.019278,5.92,train,4,1,0,0,0,...,0,1,0,1,0,0,1,0,1,0
2,141.618,2097.27,0.01676,17.5,train,14,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,182.095,732.38,0.017834,19.2,train,15,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0
4,53.8614,994.7052,0.00978,8.93,train,26,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [None]:
df.to_csv('out/train_final.csv')

## Part 2:  Model Selection

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [None]:
y = df['Item_Outlet_Sales']
X = df.drop('Item_Outlet_Sales',axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

### 1. Decision Tree with Train Test Split

In [None]:
dtreg = DecisionTreeRegressor()
dtreg.fit(X_train, y_train)
y_pred = dtreg.predict(X_test)

print np.sqrt(mean_squared_error(y_pred ,y_test))

### 2. Decision Tree with 5 fold CV

In [None]:
scores = cross_val_score(dtreg, X, y, cv=5,scoring='neg_mean_squared_error')
scores = scores * -1
for i, score in enumerate(scores):
    print "CV No: ", i
    print np.sqrt(score)

### 3. Random Forest with Train Test Split

In [None]:
from sklearn.ensemble import RandomForestRegressor
rfreg = RandomForestRegressor()
rfreg.fit(X_train, y_train)
y_pred = rfreg.predict(X_test)
print np.sqrt(mean_squared_error(y_pred,y_test))

### 4. Random Forest with 5 fold CV

In [None]:
scores = cross_val_score(rfreg, X, y, cv=5,scoring='neg_mean_squared_error')
scores = scores * -1
for i, score in enumerate(scores):
    print "CV No: ", i
    print np.sqrt(score)