In [114]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# Data Importing

In [115]:
# Read the train and test CSVs and report their shapes.
train_path = r'../input/bigmart-sales-data-2013/train_v9rqX0R.csv'
test_path = r'../input/bigmart-sales-data-2013/test_AbJTz2l.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
print(df_train.shape,' ',df_test.shape)

In [116]:
# Show the first two rows of the training frame.
df_train.head(2)

In [117]:
# Show the first two rows of the test frame.
df_test.head(2)

# Data Preprocessing

In [118]:
# checking for null values
# Per-column count of missing values in the training frame.

df_train.isnull().sum()

In [119]:
# Per-column count of missing values in the test frame.
df_test.isnull().sum()

In [120]:
# concatenating train and test data
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the
# test rows reuse the train row labels, and any later label-aligned operation on
# the combined frame would be ambiguous. Train rows stay first, so the positional
# split further down is unaffected.

concat_data=pd.concat([df_train,df_test],ignore_index=True)
print(concat_data.shape)
concat_data.head(2)

In [121]:
# again checking for null values
# Per-column count of missing values in the combined train+test frame.

concat_data.isnull().sum()

In [122]:
# filling Item_Weight column
# Each missing weight is replaced by the mean weight of the row's Item_Type group.

type_mean_weight = concat_data.groupby(['Item_Type'])['Item_Weight'].transform('mean')
concat_data['Item_Weight'] = concat_data['Item_Weight'].fillna(type_mean_weight)

In [123]:
# Tabulate how often each Outlet_Size occurs within every Outlet_Type.
concat_data.groupby(['Outlet_Type'])['Outlet_Size'].value_counts().to_frame()

In [124]:
# filling Outlet_Size column
# Missing sizes are imputed from the outlet type: 'Small' for grocery stores and
# Type1 supermarkets, 'Medium' for every other type. Existing sizes are kept.
# (Presumably the mapping was read off the Outlet_Type/Outlet_Size counts - confirm.)
# Missingness is detected with isna() rather than an `is np.nan` identity test,
# which only matches the np.nan singleton and would miss other NaN/None objects.

small_outlet_types = ['Grocery Store', 'Supermarket Type1']
fallback_size = np.where(concat_data['Outlet_Type'].isin(small_outlet_types), 'Small', 'Medium')
size_missing = concat_data['Outlet_Size'].isna()
concat_data['Outlet_Size'] = concat_data['Outlet_Size'].mask(size_missing, fallback_size)

In [125]:
# After imputation the only column expected to still contain nulls is
# Item_Outlet_Sales, which is the value to be predicted for the test rows.

concat_data.isnull().sum()

# Encoding Categorical Data

In [126]:
# Object-typed columns to one-hot encode; the identifier columns and Outlet_Size
# are excluded from encoding.
excluded_cols = ['Item_Identifier','Outlet_Identifier','Outlet_Size']
object_frame = concat_data.select_dtypes(include='object')
cat_col = object_frame.drop(columns=excluded_cols).columns
cat_col

In [127]:
# Collapse the spelling variants of Item_Fat_Content into two labels:
# the three low-fat spellings become 'Low Fat', every other value becomes 'Regular'.

low_fat_spellings = ['Low Fat', 'low fat', 'LF']
is_low_fat = concat_data['Item_Fat_Content'].isin(low_fat_spellings)
concat_data['Item_Fat_Content'] = np.where(is_low_fat, 'Low Fat', 'Regular')

In [128]:
# One-hot encode the selected categorical columns (first level of each dropped),
# prefixing every dummy with its source column name.
dummy_cols = list(cat_col)
concat_data = pd.get_dummies(
    concat_data,
    columns=dummy_cols,
    prefix=dummy_cols,
    drop_first=True,
)
concat_data.columns

In [129]:
# Split the combined frame back into train and test by row position
# (the first len(df_train) rows are the training rows) and drop unused columns.

n_train_rows = df_train.shape[0]
unused_cols = ['Item_Identifier','Outlet_Identifier','Outlet_Establishment_Year','Outlet_Size']
train = concat_data.iloc[:n_train_rows].drop(columns=unused_cols)
# The test part additionally loses the (empty) target column.
test = concat_data.iloc[n_train_rows:].drop(columns=unused_cols + ['Item_Outlet_Sales'])

In [130]:
# Show the first two rows of the encoded training frame.
train.head(2)

In [131]:
# Show the first two rows of the encoded test frame.
test.head(2)

In [132]:
# checking for outliers
# The column is passed as `x=` explicitly: a positional argument is interpreted
# differently across seaborn versions, which changes the plot orientation.

sns.boxplot(x=train.Item_Outlet_Sales)

In [133]:
def outlinefree(dataCol, factor=1.5):
    """Cap outliers in a numeric column using the interquartile-range rule.

    Values above ``Q3 + factor * IQR`` are replaced by that upper bound and
    values below ``Q1 - factor * IQR`` by that lower bound. All other values
    are returned unchanged, in their original order.

    Parameters
    ----------
    dataCol : array-like of numbers
        Column to treat (pandas Series, NumPy array or plain sequence).
    factor : float, default 1.5
        Multiple of the IQR used to place the lower and upper bounds.

    Returns
    -------
    list of float
        Capped values, same length and order as ``dataCol``. NaN entries are
        ignored when the quartiles are computed and are returned as NaN.
        An empty input yields an empty list.
    """
    values = np.asarray(dataCol, dtype=float)
    if values.size == 0:
        # Percentiles of an empty array are undefined; nothing to cap.
        return []

    # nanpercentile keeps a single missing value from turning both bounds into
    # NaN (which would silently disable the capping).
    Q1, Q3 = np.nanpercentile(values, [25, 75])
    IQR = Q3 - Q1
    LowerRange = Q1 - (factor * IQR)
    UpperRange = Q3 + (factor * IQR)

    # np.clip performs the "replace with nearest bound" step for every element.
    return np.clip(values, LowerRange, UpperRange).tolist()

In [134]:
# treating outliers
# Every column listed here is replaced by its capped version from outlinefree();
# only the target column is treated.

capped_columns = ['Item_Outlet_Sales']
for column_name in capped_columns:
    capped_values = outlinefree(train.loc[:, column_name])
    train.loc[:, column_name] = capped_values

In [135]:
# Re-plot the target after capping. The column is passed as `x=` explicitly
# because a positional argument is interpreted differently across seaborn versions.
sns.boxplot(x=train.Item_Outlet_Sales)

In [136]:
# checking correlation
# Annotated correlation heatmap of the int64/float64 columns of the training frame.

sns.heatmap(train.select_dtypes(include=['int64','float64']).corr(),square=True,annot=True)
plt.tight_layout()   # must run before show(); afterwards it no longer affects the rendered figure
plt.show()

In [137]:
# Build the model inputs: Item_Visibility and Item_Weight are left out of the
# feature set for both train and test.
excluded_features = ['Item_Visibility','Item_Weight']
x=train.drop(['Item_Outlet_Sales'] + excluded_features,axis=1)
# Target as a 1-D Series: scikit-learn regressors expect y of shape (n_samples,),
# and a one-column DataFrame triggers a column-vector conversion warning on fit.
y=train['Item_Outlet_Sales']
test_data=test.drop(excluded_features,axis=1)
print(x.shape,' ',test_data.shape)

# Data Modeling

In [138]:
from sklearn.model_selection import cross_val_score,KFold
from sklearn.linear_model import LinearRegression,ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn import metrics

In [139]:
# Linear Regression
# 5-fold shuffled cross-validation; the scorer returns negative RMSE per fold.

lr = LinearRegression()
folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=lr,
    X=x,
    y=y,
    cv=folds,
    scoring='neg_root_mean_squared_error',
)
print(scores,'\n',scores.mean())

In [140]:
# ElasticNet
# The `normalize=` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
# so feature scaling is done explicitly with a StandardScaler inside a pipeline
# (re-fitted on the training part of every fold by cross_val_score).
# NOTE: StandardScaler scales by standard deviation, whereas the removed
# normalize=True scaled by the l2 norm, so scores are not numerically identical
# to the old call and alpha may need re-tuning.

en=make_pipeline(StandardScaler(), ElasticNet(alpha=0.001))
folds = KFold(n_splits = 5, shuffle = True, random_state = 42)
scores = cross_val_score(en,x, y, scoring='neg_root_mean_squared_error', cv=folds)
print(scores,'\n',np.mean(scores))

In [141]:
# Random Forest
# The estimator is seeded so the cross-validation scores are reproducible on
# re-run; the seed matches the one used for the final Random Forest model.

rf=RandomForestRegressor(random_state=42)
folds = KFold(n_splits = 5, shuffle = True, random_state = 42)
scores = cross_val_score(rf,x, y, scoring='neg_root_mean_squared_error', cv=folds)
print(scores,'\n',np.mean(scores))

In [142]:
# Gradient Boosting
# The estimator is seeded so the cross-validation scores are reproducible on re-run.

gb= GradientBoostingRegressor(random_state=42)
folds = KFold(n_splits = 5, shuffle = True, random_state = 42)
scores = cross_val_score(gb,x, y, scoring='neg_root_mean_squared_error', cv=folds)
print(scores,'\n',np.mean(scores))

In [143]:
# xgboost
# 5-fold shuffled cross-validation; the scorer returns negative RMSE per fold.

xgb = XGBRegressor()
folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=xgb,
    X=x,
    y=y,
    cv=folds,
    scoring='neg_root_mean_squared_error',
)
print(scores,'\n',scores.mean())

So, the Gradient Boosting model gives the best cross-validated RMSE.

# Final Submission

Here, the Random Forest model performs well on both the train and the test data.

In [144]:
# Random Forest Model for final prediction
# Fitted on the full training features/target (x, y) with a fixed seed.
# NOTE(review): the model-comparison note earlier in the notebook says Gradient
# Boosting gave the best RMSE, yet a Random Forest is fitted here - confirm which
# model is intended for the submission.

rf_model=RandomForestRegressor(random_state=42)
rf_model.fit(x,y)

In [145]:
# R^2 computed on the same rows the model was fitted on, so it measures training
# fit rather than accuracy on unseen data (original note: "giving 93% accuracy").
metrics.r2_score(y,rf_model.predict(x))

In [146]:
# Predict sales for the test feature frame with the fitted Random Forest.
final_predict=rf_model.predict(test_data) ## prediction for test data

In [147]:
# Assemble the submission: identifiers from the raw test frame plus the predictions.
submission_columns = {
    'Item_Identifier': df_test['Item_Identifier'].values,
    'Outlet_Identifier': df_test['Outlet_Identifier'].values,
    'Item_Outlet_Sales': final_predict,
}
sub = pd.DataFrame(submission_columns)
sub

In [148]:
# List rows whose predicted sales are zero or negative (the filter is <= 0).
sub[sub.Item_Outlet_Sales<=0]

In [149]:
# Write the submission frame to Submission.csv without the index column.
sub.to_csv('Submission.csv',index=False)