In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

##  Part 1: Data Cleansing and Preparation

### 1. Handling casing errors in Fat Content feature

In [3]:
def prepare_item_fat_content(df):
    df['Item_Fat_Content'] = df['Item_Fat_Content'].astype(str)
    df['Item_Fat_Content'] = np.where(df.Item_Fat_Content == 'low fat','Low Fat',df.Item_Fat_Content)
    df['Item_Fat_Content'] = np.where(df.Item_Fat_Content == 'LF','Low Fat',df.Item_Fat_Content)
    df['Item_Fat_Content'] = np.where(df.Item_Fat_Content == 'reg','Regular',df.Item_Fat_Content)
    return df

### 2. Handling missing values in Item Weight feature

In [4]:
def prepare_item_weight(df):
    missing_weigths = df[df.Item_Weight.isnull()]
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Dairy',13.4,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Soft Drinks',11.8,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Meat',12.8,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Fruits and Vegetables',13.10,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Household',10,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Baking Goods',12.2,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Snack Foods',13,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Breakfast',9,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Health and Hygiene',13.14,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Hard Drinks',10.5,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Canned',12.3,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Starchy Foods',13.1,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Others',13.85,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Seafood',11.55,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Frozen Foods',12.86,missing_weigths['Item_Weight'])
    missing_weigths.loc[:,'Item_Weight'] = np.where(missing_weigths['Item_Type'] == 'Breads',11.34,missing_weigths['Item_Weight'])
    df.loc[missing_weigths.index,'Item_Weight'] = missing_weigths['Item_Weight']
    return df

### 3. Finding Outlet Age

In [5]:
def find_outlet_age(df):
    df['Outlet_Age'] = 2015 - df['Outlet_Establishment_Year']
    df.drop('Outlet_Establishment_Year',axis=1,inplace=True)
    return df

### 4. Handling Missing Values in Outlet Size

In [6]:
def prepare_outlet_size(df):
    df['Outlet_Size'] = np.where(df.Outlet_Type == 'Grocery Store','Small',df.Outlet_Size)
    df['Outlet_Size'] = np.where((df.Outlet_Type == 'Supermarket Type1') & (df.Outlet_Location_Type == 'Tier 2'),'Small',df.Outlet_Size)
    return df

### 5. Removing Unique Identifiers

In [7]:
def drop_ids(df,ids):
    df = df.drop(ids,axis=1)
    return df

### 6. Converting Dummy Variables

In [8]:
def encode_variables(df,columns):
    df_dummies = pd.get_dummies(df[columns])
    df = df.drop(columns,axis=1)
    return pd.concat([df,df_dummies],axis=1)

In [9]:
df = prepare_item_fat_content(df_train)

In [10]:
df = prepare_item_weight(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [11]:
df = find_outlet_age(df)

In [12]:
df = prepare_outlet_size(df)

In [13]:
df = drop_ids(df,['Item_Identifier','Outlet_Identifier'])

In [14]:
df = encode_variables(df,['Item_Fat_Content','Item_Type','Outlet_Size','Outlet_Location_Type','Outlet_Type'])

In [15]:
df.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales,Outlet_Age,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.3,0.016047,249.8092,3735.138,16,1,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
1,5.92,0.019278,48.2692,443.4228,6,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
2,17.5,0.01676,141.618,2097.27,16,1,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
3,19.2,0.0,182.095,732.38,17,0,1,0,0,0,...,0,0,1,0,0,1,1,0,0,0
4,8.93,0.0,53.8614,994.7052,28,1,0,0,0,0,...,1,0,0,0,0,1,0,1,0,0


## Part 2:  Model Selection

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [17]:
y = df['Item_Outlet_Sales']
X = df.drop('Item_Outlet_Sales',axis=1)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

### 1. Decision Tree with Train Test Split

In [19]:
dtreg = DecisionTreeRegressor()
dtreg.fit(X_train, y_train)
y_pred = dtreg.predict(X_test)

print np.sqrt(mean_squared_error(y_pred,y_test))

1584.73260071


### 2. Decision Tree with 5 fold CV

In [20]:
scores = cross_val_score(dtreg, X, y, cv=5,scoring='neg_mean_squared_error')
scores = scores * -1
for i, score in enumerate(scores):
    print "CV No: ", i
    print np.sqrt(score)

CV No:  0
1563.47297145
CV No:  1
1564.89780906
CV No:  2
1598.8706133
CV No:  3
1537.1783412
CV No:  4
1555.49267678


### 3. Random Forest with Train Test Split

In [21]:
from sklearn.ensemble import RandomForestRegressor
rfreg = RandomForestRegressor()
rfreg.fit(X_train, y_train)
y_pred = rfreg.predict(X_test)
print np.sqrt(mean_squared_error(y_pred,y_test))

1200.56286476


### 4. Random Forest with 5 fold CV

In [22]:
scores = cross_val_score(rfreg, X, y, cv=5,scoring='neg_mean_squared_error')
scores = scores * -1
for i, score in enumerate(scores):
    print "CV No: ", i
    print np.sqrt(score)

CV No:  0
1185.42859667
CV No:  1
1180.89872
CV No:  2
1195.22240938
CV No:  3
1181.84847575
CV No:  4
1207.48209377
