Problem Statement
The data scientists at BigMart have collected 2013 sales data for 1559 products across 10 stores in different cities. Also, certain attributes of each product and store have been defined. The aim is to build a predictive model and find out the sales of each product at a particular store.
Using this model, BigMart will try to understand the properties of products and stores which play a key role in increasing sales.

In [313]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df= pd.read_csv('/Data/Train.csv')

In [314]:
df['Item_Fat_Content'].unique()
#Low Fat=low fat= LF .Similarly,Regular=reg

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [315]:
def fat_content(df):
    df['Item_Fat_Content']=df['Item_Fat_Content'].replace(['low fat','LF'],'Low Fat')
    df['Item_Fat_Content']=df['Item_Fat_Content'].replace(['reg'],'Regular')
    df['Item_Fat_Content'].unique()
    return df

In [316]:
#Missing values
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [317]:
#Treating Item_Weight missing value with the mean of the greoup Item_Identifier value
df["Item_Weight"].fillna(df.groupby("Item_Identifier")["Item_Weight"].transform("mean"), inplace=True)

In [318]:
# Due to linear relation between Item_Outlet_Sales and Item_MRP, below will give the 
df["Item_Outlet_Sales"]=round(df.Item_Outlet_Sales/df.Item_MRP)

In [319]:
df[df.Item_Identifier=="DRG01"]

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
544,DRG01,14.8,Low Fat,0.04487,Soft Drinks,76.467,OUT035,2004,Small,Tier 2,Supermarket Type1,20.0
1218,DRG01,14.8,Low Fat,0.044841,Soft Drinks,78.367,OUT013,1987,High,Tier 3,Supermarket Type1,12.0
3005,DRG01,14.8,Low Fat,0.044661,Soft Drinks,74.767,OUT027,1985,Medium,Tier 3,Supermarket Type3,36.0
5974,DRG01,14.8,Low Fat,0.078576,Soft Drinks,78.467,OUT019,1985,Small,Tier 1,Grocery Store,3.0
8522,DRG01,14.8,Low Fat,0.044878,Soft Drinks,75.467,OUT046,1997,Small,Tier 1,Supermarket Type1,10.0


In [320]:
#Convert Item_Identifier into 3 different features

def Item_identifier_transform(df):
    a=df[0]+df[1]
    b=df[2]
    c=df[3]+df[4]
    return a,b,c

z=df.Item_Identifier.apply(Item_identifier_transform)

from itertools import chain
k=list(chain.from_iterable(z))

k=np.array(k)
k=k.reshape(-1,3)
df["Item_id_cat1"]=k[:,0]
df["Item_id_cat2"]=k[:,1]
df["Item_id_cat3"]=k[:,2]

print(df.Item_id_cat1[:5],"\n",df.Item_id_cat2[:5],"\n",df.Item_id_cat3[:5])

0    FD
1    DR
2    FD
3    FD
4    NC
Name: Item_id_cat1, dtype: object 
 0    A
1    C
2    N
3    X
4    D
Name: Item_id_cat2, dtype: object 
 0    15
1    01
2    15
3    07
4    19
Name: Item_id_cat3, dtype: object


In [321]:
#Predicting outlet type
def pred_outlet_type(data):
    X_train=data[data.Outlet_Size.notnull()]
    X_test=data[data.Outlet_Size.isnull()]
    Y_train=X_train.pop("Outlet_Size")
    X_test.drop("Outlet_Size",inplace=True,axis=1)

    columns=["Outlet_Identifier","Outlet_Establishment_Year","Outlet_Location_Type","Outlet_Type"]
    X_train=X_train[columns]
    X_test=X_test[columns]

    from sklearn.preprocessing import LabelEncoder
    le=LabelEncoder()
    columns1=["Outlet_Identifier","Outlet_Establishment_Year","Outlet_Location_Type","Outlet_Type"]
    for i in columns1: 
        le.fit(data[i])
        X_train[i]=le.transform(X_train[i])
        X_test[i]=le.transform(X_test[i])
        
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.grid_search import GridSearchCV
    
    final_model=RandomForestClassifier(random_state=0,n_estimators=50,max_depth=3,min_samples_leaf=2,min_samples_split=2,max_features="log2")
    final_model.fit(X_train,Y_train)
    final_model.score(X_train,Y_train)
    predictions=final_model.predict(X_test)

    a=data[data.Outlet_Size.isnull()]
    a.Outlet_Size=predictions
    new_data=data[data.Outlet_Size.notnull()].append(a)
    new_data.reset_index(drop=False,inplace=True)
    return new_data

In [322]:
df=pred_outlet_type(df)
 
df.drop(labels="index",inplace=True,axis=1)
df.drop(labels="Item_Identifier",inplace=True,axis=1)
df.drop(labels="Item_Weight",inplace=True,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [323]:
pd.crosstab([df.Outlet_Size],df.Outlet_Location_Type)

Outlet_Location_Type,Tier 1,Tier 2,Tier 3
Outlet_Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High,0,0,1487
Medium,930,0,1863
Small,1458,2785,0


In [324]:
#Converting categorical data into numerical

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
columns1=["Outlet_Identifier","Outlet_Location_Type","Outlet_Type","Item_Fat_Content","Item_Type","Outlet_Size","Item_MRP","Item_id_cat1","Item_id_cat2","Item_id_cat3"]
for i in columns1:
    df[i]=le.fit_transform(df[i])    
y=df.pop("Item_Outlet_Sales")
X=df


In [325]:
df.head()

Unnamed: 0,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_id_cat1,Item_id_cat2,Item_id_cat3
0,1,0.016047,4,5592,9,1999,1,0,1,1,0,14
1,2,0.019278,14,473,3,2009,1,2,2,0,2,0
2,1,0.01676,10,2901,9,1999,1,0,1,1,13,14
3,1,0.0,9,627,1,1987,0,2,1,2,3,18
4,2,0.0,0,563,3,2009,1,2,2,1,15,35


In [326]:
# Splitting train and test data
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0)

In [327]:
#Transform features by scaling each feature to a given range.
from sklearn.preprocessing import MinMaxScaler
scale=MinMaxScaler().fit(X_train)
X_train=scale.transform(X_train)
X_test=scale.transform(X_test)

Linear regression Model

In [328]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

poly=PolynomialFeatures(degree=2)
X_train_poly=poly.fit_transform(X_train)
X_test_poly=poly.transform(X_test)

lin=LinearRegression()
lin.fit(X_train_poly,y_train)
predict=lin.predict(X_test_poly)
np.sqrt(mean_squared_error(y_test,predict))

7.199822064777381

Ridge Regression Model

In [329]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline

poly=PolynomialFeatures(degree=3)
X_train_poly=poly.fit_transform(X_train)
X_test_poly=poly.transform(X_test)
ridge=Ridge(random_state=0,alpha=1)
ridge.fit(X_train_poly,y_train)
predict=ridge.predict(X_test_poly)
np.sqrt(mean_squared_error(y_test,predict))

7.2155592997404927

K-Neighbours

In [330]:
from sklearn.neighbors import KNeighborsRegressor

neigh=KNeighborsRegressor(n_neighbors=7)
neigh.fit(X_train,y_train)
predict=neigh.predict(X_test)
np.sqrt(mean_squared_error(y_test,predict))

7.6314109847657123

In [331]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV

gbm=GradientBoostingRegressor(max_features='sqrt', random_state=0)
gbm.fit(X_train,y_train)
predict=gbm.predict(X_test)
np.sqrt(mean_squared_error(y_test,predict))


7.1533957273376005

In [333]:
from sklearn.pipeline import Pipeline
pipeline1=Pipeline([("clf",GradientBoostingRegressor(learning_rate=0.1, min_samples_split=10,min_samples_leaf=2,max_depth=8,max_features='sqrt',subsample=0.8,random_state=10))])

parameters1={"clf__n_estimators":(20,30,40,50,60,70,80,100,120,150,200,300)}

gsearch1=GridSearchCV(pipeline1,parameters1,scoring='neg_mean_squared_error',n_jobs=-1,cv=10)

gsearch1.fit(X_train,y_train)

print(gsearch1.best_params_)

print(gsearch1.best_score_)

{'clf__n_estimators': 30}
-48.06110913043902


In [334]:
pipeline2=Pipeline([("clf",GradientBoostingRegressor(learning_rate=0.1,n_estimators=30,
                                                     max_features='sqrt',subsample=0.8,random_state=10))])

parameters2={'clf__max_depth':(2,3,4,5,6,7,8,9),"clf__min_samples_split":(5,10,15,20,30,40,50),"clf__min_samples_leaf":(2,4,6,8,10,12)}

gsearch2=GridSearchCV(pipeline2,parameters2,scoring='neg_mean_squared_error',n_jobs=-1,cv=10)

gsearch2.fit(X_train,y_train)


GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('clf', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None...ors=30, presort='auto',
             random_state=10, subsample=0.8, verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'clf__max_depth': (2, 3, 4, 5, 6, 7, 8, 9), 'clf__min_samples_split': (5, 10, 15, 20, 30, 40, 50), 'clf__min_samples_leaf': (2, 4, 6, 8, 10, 12)},
       pre_dispatch='2*n_jobs', refit=True,
       scoring='neg_mean_squared_error', verbose=0)

In [335]:
gsearch2.best_params_

{'clf__max_depth': 5, 'clf__min_samples_leaf': 8, 'clf__min_samples_split': 50}

In [336]:
gsearch2.best_score_

-47.233209191700986

In [337]:

pipeline3=Pipeline([("clf",GradientBoostingRegressor(learning_rate=0.1,n_estimators=30,max_depth=6,min_samples_leaf=8,min_samples_split=5,subsample=.8,random_state=10))])

parameters3={"clf__max_features":("sqrt","log2","auto")}

gsearch3=GridSearchCV(pipeline3,parameters3,scoring='neg_mean_squared_error',n_jobs=-1,cv=10)

gsearch3.fit(X_train,y_train)

print(gsearch3.best_params_)

print(gsearch3.best_score_)

{'clf__max_features': 'sqrt'}
-47.51559928997419


In [338]:

gbm1=GradientBoostingRegressor(n_estimators=30,learning_rate=0.1,max_depth=6,max_features='sqrt',min_samples_leaf=10,min_samples_split=5,random_state=10,subsample=.8)
gbm1.fit(X_train,y_train)
predict=gbm1.predict(X_test)
np.sqrt(mean_squared_error(y_test,predict))

7.1664081350085196

In [339]:
gbm2=GradientBoostingRegressor(n_estimators=60,learning_rate=0.05,max_depth=6,max_features='sqrt',min_samples_leaf=10,min_samples_split=5,random_state=10,subsample=.8)
gbm2.fit(X_train,y_train)
predict=gbm2.predict(X_test)
np.sqrt(mean_squared_error(y_test,predict))

7.1698439760172015

In [340]:
gbm3=GradientBoostingRegressor(n_estimators=120,learning_rate=0.01,max_depth=6,max_features='sqrt',min_samples_leaf=10,min_samples_split=5,random_state=10,subsample=.8)
gbm3.fit(X_train,y_train)
predict=gbm3.predict(X_test)
np.sqrt(mean_squared_error(y_test,predict))

7.3951359977109545

In [341]:
gbm4=GradientBoostingRegressor(n_estimators=240,learning_rate=0.01,max_depth=6,max_features='sqrt',min_samples_leaf=10,min_samples_split=5,random_state=10,subsample=.8)
gbm4.fit(X_train,y_train)
predict=gbm4.predict(X_test)
np.sqrt(mean_squared_error(y_test,predict))

7.1751524212702824

In [342]:

gbm5=GradientBoostingRegressor(n_estimators=480,learning_rate=0.005,max_depth=6,max_features='sqrt',min_samples_leaf=10,min_samples_split=5,random_state=10,subsample=.8)
gbm5.fit(X_train,y_train)
predict=gbm5.predict(X_test)
np.sqrt(mean_squared_error(y_test,predict))

7.1764561469758785

In [470]:

test=pd.read_csv("/Data/Test.csv")

In [471]:
sub=test[['Item_Identifier','Outlet_Identifier']]
#sub.drop(sub.index[0], inplace=True)
sub.describe()

Unnamed: 0,Item_Identifier,Outlet_Identifier
count,5681,5681
unique,1543,10
top,FDQ60,OUT027
freq,8,624


In [472]:
test_MRP=test.Item_MRP

In [473]:
test.shape

(5681, 11)

In [474]:
test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [475]:
z=test.Item_Identifier.apply(Item_identifier_transform)

from itertools import chain
k=list(chain.from_iterable(z))

k=np.array(k)
k=k.reshape(-1,3)
test["Item_id_cat1"]=k[:,0]
test["Item_id_cat2"]=k[:,1]
test["Item_id_cat3"]=k[:,2]

test= fat_content(test)
#test.Item_MRP=test.Item_MRP.apply(mrp)

test=pred_outlet_type(test)

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
columns1=["Outlet_Identifier","Outlet_Location_Type","Outlet_Type","Item_Fat_Content","Item_Type","Outlet_Size","Item_id_cat1","Item_id_cat2","Item_id_cat3"]
for i in columns1:
    test[i]=le.fit_transform(test[i])

test=test.sort_values(by="index")
test.drop(labels="index",inplace=True,axis=1)
test.drop(labels="Item_Identifier",inplace=True,axis=1)
test.drop(labels="Item_Weight",inplace=True,axis=1)

test1=test
from sklearn.preprocessing import MinMaxScaler
scale1=MinMaxScaler().fit(test1)
test1=scale1.transform(test1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [476]:
# selecting the model with less RMSE
predict=gbm.predict(test1) 

final_predict=predict*test_MRP  #get Final Outlet_Sales

submission=pd.read_csv("/BigMartSales_Forecasting_III/Submission.csv")
submission[['Item_Identifier','Outlet_Identifier']]=sub
submission['Item_Outlet_Sales']=final_predict



In [477]:
submission.to_csv("/BigMartSales_Forecasting_III/Submission1.csv", sep=",",index=None)