In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
test=pd.read_csv("test_AbJTz2l.csv" )
train=pd.read_csv("train_v9rqX0R.csv")

In [3]:
train['source']='train'
test['source']='test'

In [4]:
#Combining test and train set for now so that we need not make the change twice.
data=pd.concat([train,test],ignore_index=True)

In [5]:
data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train
2,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,train
3,FDX07,19.20,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800,train
4,NCD19,8.93,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1,,test
14200,FDD47,7.60,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2,,test
14201,NCO17,10.00,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,,Tier 2,Supermarket Type1,,test
14202,FDJ26,15.30,Regular,0.000000,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1,,test


In [6]:
print(train.shape,test.shape,data.shape)

(8523, 13) (5681, 12) (14204, 13)


In [7]:
#Finding number of null values
data.apply(lambda x: sum(x.isnull()))

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
source                          0
dtype: int64

In [8]:
data.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,11765.0,14204.0,14204.0,14204.0,8523.0
mean,12.792854,0.065953,141.004977,1997.830681,2181.288914
std,4.652502,0.051459,62.086938,8.371664,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.71,0.027036,94.012,1987.0,834.2474
50%,12.6,0.054021,142.247,1999.0,1794.331
75%,16.75,0.094037,185.8556,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [9]:
#Minimum value of item visibility is zero which doesn't make sense because if the item is on display, it has to have some  non-zero value
data.apply(lambda x: len(x.unique()))

Item_Identifier               1559
Item_Weight                    416
Item_Fat_Content                 5
Item_Visibility              13006
Item_Type                       16
Item_MRP                      8052
Outlet_Identifier               10
Outlet_Establishment_Year        9
Outlet_Size                      4
Outlet_Location_Type             3
Outlet_Type                      4
Item_Outlet_Sales             3494
source                           2
dtype: int64

In [10]:
categorical_columns=[x for x in data.dtypes.index if data[x].dtypes=='object']

In [11]:
categorical_columns=[x for x in categorical_columns if x not in ['Item_Identifier','Outlet_Identifier','source']]

In [12]:
for col in categorical_columns:
    print(col)
    print(data[col].value_counts())
    print('\n')

Item_Fat_Content
Low Fat    8485
Regular    4824
LF          522
reg         195
low fat     178
Name: Item_Fat_Content, dtype: int64


Item_Type
Fruits and Vegetables    2013
Snack Foods              1989
Household                1548
Frozen Foods             1426
Dairy                    1136
Baking Goods             1086
Canned                   1084
Health and Hygiene        858
Meat                      736
Soft Drinks               726
Breads                    416
Hard Drinks               362
Others                    280
Starchy Foods             269
Breakfast                 186
Seafood                    89
Name: Item_Type, dtype: int64


Outlet_Size
Medium    4655
Small     3980
High      1553
Name: Outlet_Size, dtype: int64


Outlet_Location_Type
Tier 3    5583
Tier 2    4641
Tier 1    3980
Name: Outlet_Location_Type, dtype: int64


Outlet_Type
Supermarket Type1    9294
Grocery Store        1805
Supermarket Type3    1559
Supermarket Type2    1546
Name: Outlet_Type, dtype: 

In [13]:
#For imputing item weight, we will take avg values for each item
item_avg_weight= data.pivot_table(values='Item_Weight',index='Item_Identifier')

In [14]:
item_avg_weight.loc['DRA12']

Item_Weight    11.6
Name: DRA12, dtype: float64

In [15]:
size=len(data)

In [16]:
miss_bool=data['Item_Weight'].isnull()

In [17]:
data['Item_Identifier'].loc[3]

'FDX07'

In [18]:
print('Initial missing=%d'%sum(miss_bool))
data.loc[miss_bool,'Item_Weight'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: item_avg_weight.loc[x])

Initial missing=2439


In [19]:
print('Final_missing_weights=%d'%sum(data['Item_Weight'].isnull()))

Final_missing_weights=0


In [20]:
#Imputing using mode for outlet size
from scipy.stats import mode
mode_outlet_size=data.pivot_table(values='Outlet_Size',columns='Outlet_Type',aggfunc=(lambda x: mode(x).mode[0]))

In [21]:
mode_outlet_size

Outlet_Type,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
Outlet_Size,Small,Small,Medium,Medium


In [22]:
miss_bool=data['Outlet_Size'].isnull()
data.loc[miss_bool,'Outlet_Size']=data.loc[miss_bool,'Outlet_Type'].apply(lambda x: mode_outlet_size[x])
print('Missing values at the end=%d'%sum(data['Outlet_Size'].isnull()))

Missing values at the end=0


In [23]:
#Feature engineering
#To determine whether or not to combine Supermarket types
data.pivot_table(values='Item_Outlet_Sales',index='Outlet_Type')

Unnamed: 0_level_0,Item_Outlet_Sales
Outlet_Type,Unnamed: 1_level_1
Grocery Store,339.8285
Supermarket Type1,2316.181148
Supermarket Type2,1995.498739
Supermarket Type3,3694.038558


In [24]:
#There is considerable difference in sales, so we decide against merging them

#Now, we take care of the zero visibility problem. We replace entries with zero by mean value for the corresponding product
visibility_mean=data.pivot_table(values='Item_Visibility',index='Item_Identifier')
zero_bool=data['Item_Visibility']==0
print(zero_bool)

0        False
1        False
2        False
3         True
4         True
         ...  
14199    False
14200    False
14201    False
14202     True
14203    False
Name: Item_Visibility, Length: 14204, dtype: bool


In [25]:
data.loc[zero_bool,'Item_Visibility']=data.loc[zero_bool,'Item_Identifier'].apply(lambda x:visibility_mean.loc[x])
print('No. of zero visibilities=%d'%sum(data['Item_Visibility']==0))

No. of zero visibilities=0


In [26]:
#Item_visibility is the fraction of area available for that product. To understand the importance and demand for that product in that shop
# we take the fraction of area allocated for the product in its product type.
# More item_visibilty implies the product sells better in that shop
data.pivot_table(values='Item_Visibility',index='Item_Type')

Unnamed: 0_level_0,Item_Visibility
Item_Type,Unnamed: 1_level_1
Baking Goods,0.072622
Breads,0.073276
Breakfast,0.085193
Canned,0.071394
Dairy,0.075183
Frozen Foods,0.071556
Fruits and Vegetables,0.072722
Hard Drinks,0.070334
Health and Hygiene,0.059712
Household,0.062365


In [27]:
data['Item_Visibility_MeanRatio'] = data.apply(lambda x: x['Item_Visibility']/visibility_mean.loc[x['Item_Identifier']], axis=1)
print (data['Item_Visibility_MeanRatio'].describe())

count    14204.000000
mean         1.061884
std          0.235907
min          0.844563
25%          0.925131
50%          0.999070
75%          1.042007
max          3.010094
Name: Item_Visibility_MeanRatio, dtype: float64


In [28]:
max_year=max(data['Outlet_Establishment_Year'])
print(max_year)

2009


In [29]:

data['Outlet_Establishment_Year']=max_year-data['Outlet_Establishment_Year']

In [30]:
data['Outlet_Establishment_Year'].describe()

count    14204.000000
mean        11.169319
std          8.371664
min          0.000000
25%          5.000000
50%         10.000000
75%         22.000000
max         24.000000
Name: Outlet_Establishment_Year, dtype: float64

In [31]:
#Change values in item fat content
data['Item_Fat_Content']=data['Item_Fat_Content'].replace({
    'LF':'Low Fat', 'reg':'Regular','low fat':'Low Fat'
})
data['Item_Fat_Content'].value_counts()

Low Fat    9185
Regular    5019
Name: Item_Fat_Content, dtype: int64

In [32]:
#FD, DR, NC in item identifier stand for Food, Drinks,Non-consumables. So we create a new column for that.
data['Item_Category']=data['Item_Identifier'].apply(lambda x: x[0:2])
data['Item_Category']=data['Item_Category'].map({'FD':'Food','DR':'Drinks','NC':'Non_Consumable' })

In [33]:
#Now, having fat_content for non_consumable is absurd. So, we will change it now
data.loc[data['Item_Category']=='Non_Consumable','Item_Fat_Content']='Non_Consumable'
data['Item_Fat_Content'].value_counts()

Low Fat           6499
Regular           5019
Non_Consumable    2686
Name: Item_Fat_Content, dtype: int64

In [34]:
item_labels=data[['Item_Identifier','source']]
outlet_labels=data[['Outlet_Identifier','source']]

In [35]:
data.head(5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source,Item_Visibility_MeanRatio,Item_Category
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,10,Medium,Tier 1,Supermarket Type1,3735.138,train,0.931078,Food
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,0,Medium,Tier 3,Supermarket Type2,443.4228,train,0.93342,Drinks
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,10,Medium,Tier 1,Supermarket Type1,2097.27,train,0.960069,Food
3,FDX07,19.2,Regular,0.017834,Fruits and Vegetables,182.095,OUT010,11,Small,Tier 3,Grocery Store,732.38,train,1.0,Food
4,NCD19,8.93,Non_Consumable,0.00978,Household,53.8614,OUT013,22,High,Tier 3,Supermarket Type1,994.7052,train,1.0,Non_Consumable


In [36]:
le=LabelEncoder()
encode_columns=['Item_Identifier','Item_Fat_Content','Item_Type','Outlet_Identifier','Outlet_Size','Outlet_Location_Type','Outlet_Type','Item_Category']
for i in encode_columns:
    data[i]=le.fit_transform(data[i])
print(data.head(10))

   Item_Identifier  Item_Weight  Item_Fat_Content  Item_Visibility  Item_Type  \
0              156        9.300                 0         0.016047          4   
1                8        5.920                 2         0.019278         14   
2              662       17.500                 0         0.016760         10   
3             1121       19.200                 2         0.017834          6   
4             1297        8.930                 1         0.009780          9   
5              758       10.395                 2         0.057059          0   
6              696       13.650                 2         0.012741         13   
7              738       19.000                 0         0.127470         13   
8              440       16.200                 2         0.016687          5   
9              990       19.200                 2         0.094450          5   

   Item_MRP  Outlet_Identifier  Outlet_Establishment_Year  Outlet_Size  \
0  249.8092                  9    

In [37]:
data.head(10)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source,Item_Visibility_MeanRatio,Item_Category
0,156,9.3,0,0.016047,4,249.8092,9,10,1,0,1,3735.138,train,0.931078,1
1,8,5.92,2,0.019278,14,48.2692,3,0,1,2,2,443.4228,train,0.93342,0
2,662,17.5,0,0.01676,10,141.618,9,10,1,0,1,2097.27,train,0.960069,1
3,1121,19.2,2,0.017834,6,182.095,0,11,2,2,0,732.38,train,1.0,1
4,1297,8.93,1,0.00978,9,53.8614,1,22,0,2,1,994.7052,train,1.0,2
5,758,10.395,2,0.057059,0,51.4008,3,0,1,2,2,556.6088,train,1.0,1
6,696,13.65,2,0.012741,13,57.6588,1,22,0,2,1,343.5528,train,1.497197,1
7,738,19.0,0,0.12747,13,107.7622,5,24,1,2,3,4022.7636,train,0.870493,1
8,440,16.2,2,0.016687,5,96.9726,7,7,2,1,1,1076.5986,train,0.92416,1
9,990,19.2,2,0.09445,5,187.8214,2,2,2,1,1,4710.535,train,0.963983,1


In [38]:
#Now lets do one hot encoding
data=pd.get_dummies(data,columns=['Item_Fat_Content','Item_Type','Outlet_Identifier','Outlet_Size','Outlet_Location_Type','Outlet_Type','Item_Category'])
data.dtypes

Item_Identifier                int32
Item_Weight                  float64
Item_Visibility              float64
Item_MRP                     float64
Outlet_Establishment_Year      int64
Item_Outlet_Sales            float64
source                        object
Item_Visibility_MeanRatio    float64
Item_Fat_Content_0             uint8
Item_Fat_Content_1             uint8
Item_Fat_Content_2             uint8
Item_Type_0                    uint8
Item_Type_1                    uint8
Item_Type_2                    uint8
Item_Type_3                    uint8
Item_Type_4                    uint8
Item_Type_5                    uint8
Item_Type_6                    uint8
Item_Type_7                    uint8
Item_Type_8                    uint8
Item_Type_9                    uint8
Item_Type_10                   uint8
Item_Type_11                   uint8
Item_Type_12                   uint8
Item_Type_13                   uint8
Item_Type_14                   uint8
Item_Type_15                   uint8
O

In [39]:
train=data[data['source']=='train']
test=data[data['source']=='test']

In [40]:
test.drop('Item_Outlet_Sales',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [41]:
Y=train['Item_Outlet_Sales']
train.drop('Item_Outlet_Sales',axis=1,inplace=True)

In [42]:
test.drop('source',axis=1,inplace=True)
train.drop('source',axis=1,inplace=True)

In [43]:
# Defining metric
def rmse(y_t,y_p):
    return np.sqrt((np.square(np.subtract(y_t,y_p))).mean())

In [44]:
#Baseline model to use as a benchmark
y_base=Y.mean()
rmse(Y,y_base)

1706.3995013565946

In [45]:
train.shape,test.shape

((8523, 48), (5681, 48))

In [46]:
X_train,X_test,Y_train,Y_test=train_test_split(train,Y,test_size=0.25,random_state=42)

In [47]:
def master_fn(alg):
    alg.fit(X_train,Y_train)
    y_p=alg.predict(X_test)
    cv_score=cross_val_score(alg,X_train,Y_train,cv=15)
    cv_score=cv_score.mean()
    RMSE=rmse(Y_test,y_p)
    print('RMSE=%.3f'%RMSE)
    print("CV_Score=%.3f"%cv_score)
    y_target=alg.predict(test)
    return y_target

In [48]:
model1=LinearRegression()
target=master_fn(model1)

RMSE=1092.911
CV_Score=0.556


In [49]:
model2=DecisionTreeRegressor()

In [50]:
max_depth=[int(x) for x in np.linspace(start=10,stop=300,num=10)]
min_samples_split=[ int(x) for x in np.linspace(start=2,stop=100,num=2)]
min_samples_leaf=[int(x) for x in np.linspace(start=1,stop=100,num=2)]
random_grid={'max_depth': max_depth,'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf}
rf_search=RandomizedSearchCV(model2,param_distributions=random_grid,n_iter=100,cv=3,verbose=True,random_state=42,n_jobs=-1)
rf_search.fit(X_train,Y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    7.1s finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features=None,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   presort='deprecated',
                                                   random_state=None,
                                                   splitter='best'),
                   iid='d

In [51]:
rf_search.best_params_

{'min_samples_split': 2, 'min_samples_leaf': 100, 'max_depth': 10}

In [52]:
grid_search={'min_samples_split':[2,3,4,5,6,7,8],
             'min_samples_leaf':[70,80,90,100,110,120,130],
             'max_depth':[1,2,3,4,5,10,20,30,40,50]
            }
grid_sr=GridSearchCV(model2,param_grid=grid_search,cv=3,n_jobs=-1,verbose=2)
grid_sr.fit(X_train,Y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 490 candidates, totalling 1470 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 1316 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 1470 out of 1470 | elapsed:   17.5s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                             max_depth=None, max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort='deprecated',
                                             random_state=None,
                                             splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 10, 20, 30, 40, 50],
                         'min_samples_leaf': [70, 80, 90, 100, 110, 120, 130],
   

In [53]:
grid_sr.best_params_

{'max_depth': 5, 'min_samples_leaf': 70, 'min_samples_split': 2}

In [54]:
model3=DecisionTreeRegressor(max_depth=5,min_samples_leaf=70,min_samples_split=2)
target=master_fn(model3)

RMSE=1055.593
CV_Score=0.592


In [56]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num =15)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [57]:
model4=RandomForestRegressor()
rd_search=RandomizedSearchCV(model4,param_distributions=random_grid,cv=3,n_jobs=-1,n_iter=100,verbose=2)
rd_search.fit(X_train,Y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 23.5min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [58]:
rd_search.best_params_

{'n_estimators': 585,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

In [59]:
param_grid={'n_estimators':[450,500,550,600,650,700],
            'min_samples_split':[1,2,3,4],
           'min_samples_leaf':[1,2,3,4],
           'max_features':['auto'],
           'max_depth':[5,10,20,30],
           'bootstrap':[True]}

In [60]:
grid_search=GridSearchCV(model4,param_grid=param_grid,cv=3,n_jobs=-1,verbose=2)
grid_search.fit(X_train,Y_train)

Fitting 3 folds for each of 384 candidates, totalling 1152 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 34.5min
[Parallel(n_jobs=-1)]: Done 1152 out of 1152 | elapsed: 40.6min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [61]:
grid_search.best_params_


{'bootstrap': True,
 'max_depth': 5,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 4,
 'n_estimators': 450}

In [63]:
new_model=RandomForestRegressor(n_estimators=450,max_depth=5,max_features='auto',min_samples_leaf=4,min_samples_split=4,bootstrap=True)
target2=master_fn(new_model)

RMSE=1044.813
CV_Score=0.594


In [65]:
item_label_test=item_labels['Item_Identifier'][item_labels['source']=='test']
outlet_label_test=outlet_labels['Outlet_Identifier'][outlet_labels['source']=='test']

In [67]:
outlet_label_test


8523     OUT049
8524     OUT017
8525     OUT010
8526     OUT017
8527     OUT027
          ...  
14199    OUT046
14200    OUT018
14201    OUT045
14202    OUT017
14203    OUT045
Name: Outlet_Identifier, Length: 5681, dtype: object

In [68]:
submit1=pd.DataFrame({
    'Item_Identifier': item_label_test,
    'Outlet_Identifier': outlet_label_test,
    'Item_Outlet_Sales':target2
})
submit1.to_csv('Bigmartapp2sub1.csv',index=False)

In [69]:
submit2=pd.DataFrame({
    'Item_Identifier': item_label_test,
    'Outlet_Identifier': outlet_label_test,
    'Item_Outlet_Sales':target
})
submit2.to_csv('Bigmartapp2sub2.csv',index=False)