In [175]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Data Importing

In [176]:
# All four CSV files are read from the same input directory; keeping that path
# in a single constant means the notebook can be re-pointed with one edit.
DATA_DIR = "../input/food-demand-forecasting-dataset"

train_data = pd.read_csv(f"{DATA_DIR}/train.csv")
meal_info = pd.read_csv(f"{DATA_DIR}/meal_info.csv")
fullfill_center_info = pd.read_csv(f"{DATA_DIR}/fulfilment_center_info.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test_QoiMO9B.csv")
train_data.head()

In [177]:
test_data.head()

In [178]:
meal_info.head()

In [179]:
fullfill_center_info.head()

In [180]:
# Attach the meal attributes (joined on meal_id) and then the fulfilment-centre
# attributes (joined on center_id) to every training row.
train_data = (
    train_data
    .merge(meal_info, on='meal_id', how='inner')
    .merge(fullfill_center_info, on='center_id', how='inner')
)
train_data.head()

In [181]:
# Attach the meal and fulfilment-centre attributes to the test rows.
# how='left' keeps every test row: an inner join would silently drop any row
# whose meal_id / center_id is missing from the lookup tables, and the final
# submission would then lack those ids.
# validate='many_to_one' raises if a lookup table holds duplicate keys, which
# would otherwise duplicate test rows without any warning.
test_data = pd.merge(test_data, meal_info, how='left', on='meal_id',
                     validate='many_to_one')
test_data = pd.merge(test_data, fullfill_center_info, how='left', on='center_id',
                     validate='many_to_one')
test_data.head()

In [182]:
train_data.shape,test_data.shape

# Data Preprocessing

In [183]:
# variance check

# Restrict the computation to numeric columns: variance is undefined for
# non-numeric columns, and pandas 2.x raises a TypeError instead of silently
# skipping them when numeric_only is left unset.
train_data.var(numeric_only=True)

No feature in the training data has zero variance, so there is no need to remove any feature at this stage.

In [184]:
# checking for null columns

train_data.isnull().sum()

In [185]:
test_data.isnull().sum()

In [186]:
# data distribution check

# Plot one distribution per int64/float64 column. The grid is sized from the
# number of columns found, so none is silently left out if the frame has more
# than twelve numeric columns.
numeric_cols = train_data.select_dtypes(['int64', 'float64']).columns
n_grid_cols = 4
n_grid_rows = int(np.ceil(len(numeric_cols) / n_grid_cols))

plt.figure(figsize=(15, 10))
for position, col in enumerate(numeric_cols, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    # NOTE(review): distplot is deprecated in recent seaborn releases; kept so
    # the figure looks the same on the seaborn version this notebook targets.
    sns.distplot(train_data[col])
# Lay the figure out once, after all panels exist, instead of on every iteration.
plt.tight_layout()

As the plots above show, none of the features follows a normal distribution.

In [187]:
# Statistical Summary

train_data.describe(include='all')

In [188]:
# feature information

train_data.info()

In [189]:
test_data.info()

# Light Data Exploration
1) For Numeric Data
- Made histograms to understand distributions
- Corrplot

2) For Categorical Data
- Made bar charts to understand balance of classes

In [190]:
# Split the columns explored below into two views of train_data:
# train_cat holds the id / flag columns drawn as bar charts,
# train_num holds the columns drawn as histograms and box plots.
train_cat = train_data.loc[:, ['center_id', 'meal_id', 'emailer_for_promotion', 'homepage_featured']]
train_num = train_data.loc[:, ['week', 'checkout_price', 'op_area']]

In [191]:
# One histogram per column of train_num, each drawn in its own figure.
for column_name in train_num:
    plt.hist(train_num[column_name])
    plt.title(column_name)
    plt.show()

In [192]:
sns.heatmap(train_num.corr(),annot=True)

In [193]:
# One bar chart per column of train_cat showing how often each value occurs.
for col in train_cat.columns:
    counts = train_cat[col].value_counts()   # computed once and reused for both axes
    # seaborn >= 0.12 only accepts x / y as keyword arguments.
    ax = sns.barplot(x=counts.index.values, y=counts.values)
    ax.set_title(col)
    ax.set_ylabel('count')
    # Rotate after drawing so the setting applies to the ticks seaborn created.
    ax.tick_params(axis='x', rotation=90)
    plt.show()

# Data Normalization
1. First for-loop: box plots check whether each numeric column contains outliers. The `checkout_price` column does.
2. `outlinefree()`: a custom function that detects outliers in a column using the IQR rule and caps them at the lower/upper limit, so the dataset no longer contains outlier values.
3. Second for-loop: runs `outlinefree()` on each numeric column of the training data.
4. The `center_id` and `meal_id` columns have many distinct categorical values.
5. To manage these columns, we use functions that group their values into a few range-based sub-categories.

In [194]:
# One box plot per column of train_num to spot outliers visually.
for col in train_num.columns:
    # Pass the column as x= explicitly: seaborn >= 0.12 reads a first positional
    # argument as `data`, which changes how the plot is oriented.
    sns.boxplot(x=train_num[col])
    plt.title(col)
    plt.show()

In [195]:
def outlinefree(dataCol):
    """Cap the outliers of a numeric column at the 1.5 * IQR fences.

    The lower fence is Q1 - 1.5 * IQR and the upper fence is Q3 + 1.5 * IQR,
    where Q1 / Q3 are the 25th / 75th percentiles of the column. Values below
    the lower fence are replaced by it, values above the upper fence are
    replaced by it, and all other values are kept. No element is removed.

    Parameters
    ----------
    dataCol : array-like of numbers
        Column to process (for example a pandas Series). It is not modified.

    Returns
    -------
    list of float
        The capped values, in the same order and of the same length as the
        input. Missing values (NaN) are ignored when computing the percentiles
        and are returned unchanged. An empty input yields an empty list.
    """
    values = np.asarray(dataCol, dtype=float)
    if values.size == 0:
        # Percentiles of an empty column are undefined; nothing to cap.
        return []

    # nanpercentile so that a single missing value does not turn both fences
    # into NaN and thereby disable the capping for the whole column.
    Q1, Q3 = np.nanpercentile(values, [25, 75])
    IQR = Q3 - Q1
    LowerRange = Q1 - (1.5 * IQR)
    UpperRange = Q3 + (1.5 * IQR)

    # Vectorised equivalent of checking every element against both fences.
    return np.clip(values, LowerRange, UpperRange).tolist()

In [196]:
# Cap the outliers of every column listed in train_num, writing the result
# back into train_data.
# NOTE(review): only train_data is capped here; test_data is not passed
# through outlinefree() in this cell.
for column_name in train_num.columns:
    new_list = outlinefree(train_data[column_name])   # capped values as a list
    train_data.loc[:, column_name] = new_list

In [197]:
def center_id(datacol):
    """Map each numeric centre id to the label of the id range it falls in.

    Parameters
    ----------
    datacol : iterable of numbers
        Centre ids to bucket.

    Returns
    -------
    list of str
        One label per input value, in input order. A value that lies in none
        of the explicit ranges (10-150, inclusive integer bounds) receives the
        catch-all label "151-190"; that includes values below 10 and
        non-integer values that fall between two ranges.
    """
    # (low, high, label) with inclusive bounds, checked in this order.
    buckets = (
        (10, 30, "10-30"),
        (31, 50, "31-50"),
        (51, 70, "51-70"),
        (71, 90, "71-90"),
        (91, 110, "91-110"),
        (111, 130, "111-130"),
        (131, 150, "131-150"),
    )
    labels = []
    for value in datacol:
        for low, high, label in buckets:
            if low <= value <= high:
                labels.append(label)
                break
        else:
            # No explicit range matched.
            labels.append("151-190")
    return labels
# Replace the numeric centre ids of the training frame with their range labels.
center_id_val_index_n = center_id(train_data['center_id'])
train_data['center_id'] = center_id_val_index_n

In [198]:
def meal_id(datacol):
    """Map each numeric meal id to the label of the id range it falls in.

    Parameters
    ----------
    datacol : iterable of numbers
        Meal ids to bucket.

    Returns
    -------
    list of str
        One label per input value, in input order. A value that lies in none
        of the explicit ranges (1000-2800, inclusive integer bounds) receives
        the catch-all label "2801-3000"; that includes values below 1000 and
        non-integer values that fall between two ranges.
    """
    # (low, high, label) with inclusive bounds, checked in this order.
    buckets = (
        (1000, 1300, "1000-1300"),
        (1301, 1600, "1301-1600"),
        (1601, 1900, "1601-1900"),
        (1901, 2200, "1901-2200"),
        (2201, 2500, "2201-2500"),
        (2501, 2800, "2501-2800"),
    )
    labels = []
    for value in datacol:
        for low, high, label in buckets:
            if low <= value <= high:
                labels.append(label)
                break
        else:
            # No explicit range matched.
            labels.append("2801-3000")
    return labels
# Replace the numeric meal ids of the training frame with their range labels.
meal_id_val_index_n = meal_id(train_data['meal_id'])
train_data['meal_id'] = meal_id_val_index_n

In [199]:
# Bucket the ids of the test frame with the same two helpers that were applied
# to the training frame, so both frames carry the same range labels.
center_id_val_index_n = center_id(test_data['center_id'])
test_data['center_id'] = center_id_val_index_n

meal_id_val_index_n = meal_id(test_data['meal_id'])
test_data['meal_id'] = meal_id_val_index_n

In [200]:
train_data.head()

In [201]:
test_data.head()

In [202]:
train_data.shape,test_data.shape

In [203]:
# dropping unnecessary features

# The same three columns are removed from both frames; the originals
# (train_data / test_data) are left untouched.
train_data_new = train_data.drop(['id', 'city_code', 'region_code'], axis='columns')
test_data_new = test_data.drop(['id', 'city_code', 'region_code'], axis='columns')

In [204]:
train_data_new.head(2)

In [205]:
test_data_new.head(2)

# Encoding Categorical Columns

In [206]:
cols=['center_id','meal_id','category','cuisine','center_type']

In [207]:
# One-hot encode the columns listed in `cols`; every generated column is
# prefixed with the name of the column it came from.
train_data_encoded = pd.get_dummies(
    data=train_data_new,
    prefix=cols,
    columns=cols,
)
train_data_encoded.head(2)

In [208]:
# One-hot encode the test frame with the same column list as the training frame.
test_data_encoded = pd.get_dummies(test_data_new, columns=cols, prefix=cols)

# The two frames are encoded independently, so a category level present in only
# one of them would give the test frame a different set (or order) of dummy
# columns than the model is trained on. Align the test frame to the training
# feature columns (every training column except the target): levels missing
# from the test data become all-zero columns, and levels never seen in
# training are dropped.
feature_columns = train_data_encoded.columns.drop('num_orders')
test_data_encoded = test_data_encoded.reindex(columns=feature_columns, fill_value=0)
test_data_encoded.head(2)

In [209]:
train_data_encoded.shape,test_data_encoded.shape

# Train Test Split

In [210]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the encoded training rows for evaluation; num_orders is the
# target and every other column is a feature. The seed is fixed so the split
# is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    train_data_encoded.drop(columns='num_orders'),
    train_data_encoded['num_orders'],
    random_state=42,
    test_size=0.30,
)

In [211]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# Data Modeling

In [212]:
from sklearn.metrics import r2_score,mean_squared_error,mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression,ElasticNet
from xgboost import XGBRegressor

In [213]:
# creating 5 different models
# The stochastic estimators are seeded (same seed as the train/test split) so
# that the metrics below and the final submission are reproducible from run to run.
RF = RandomForestRegressor(random_state=42).fit(x_train,y_train)
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions this line needs an explicit scaling step
# instead. Left unchanged to keep the reported results comparable.
EN= ElasticNet(alpha=0.001, normalize=True).fit(x_train,y_train)
GBR = GradientBoostingRegressor(random_state=42).fit(x_train,y_train)
LR = LinearRegression().fit(x_train,y_train)
XGB = XGBRegressor(random_state=42).fit(x_train,y_train)

In [214]:
# the evaluation metrics
models = [LR, EN, RF, GBR, XGB]

# Predict once per model and reuse the result for all three metrics instead of
# calling predict() on the hold-out set three times per model.
predictions = [mod.predict(x_test) for mod in models]

RMSE = [mean_squared_error(y_test.values, pred)**0.5 for pred in predictions]
# mean_squared_log_error rejects negative inputs, so the absolute value of the
# predictions is used for this metric only.
RMSLE = [mean_squared_log_error(y_test.values, np.absolute(pred))**0.5 for pred in predictions]
R2_Score = [r2_score(y_test.values, pred) for pred in predictions]

In [215]:
# comparing 5 models
# Names are listed in the same order as the `models` list used for the metrics.
Models = ['Linear Regression', 'ElasticNet', 'Random Forest', 'Gradient Boosting', 'XgBoost']
evaluation = pd.DataFrame(
    {
        'Models': Models,
        'RMSE': RMSE,
        'RMSLE': RMSLE,
        'R2_Score': R2_Score,
    }
)
evaluation

In conclusion, the Random Forest model gives the best result, with the lowest RMSLE.

In [216]:
# Plotting Real vs Predict

# Actual and Random-Forest-predicted order counts of the hold-out rows, both
# plotted against the checkout price of the row.
plt.scatter(x_test.checkout_price.values,y_test.values,color='blue',label='Real',alpha=0.5)
plt.scatter(x_test.checkout_price.values,RF.predict(x_test),color='red',label='Predict',alpha=0.5)
plt.title("Real vs Predict")
plt.xlabel('checkout_price')   # the x values are checkout_price, not op_area
plt.ylabel('num_orders')
plt.legend(loc='best')
plt.show()

# Final Submission

In [220]:
# Pair every test id with the Random Forest prediction for that row. The ids
# come from test_data because the `id` column was dropped before encoding.
submission = pd.DataFrame(
    {
        'id': test_data['id'].values,
        'num_orders': RF.predict(test_data_encoded),
    }
)
submission

In [221]:
submission.to_csv('submission_fdf.csv',index=False)