# predicting Rossman sale


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

In [2]:
def metric(preds, actuals):
    preds = preds.reshape(-1)
    actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [3]:
train = pd.read_csv("data/train.csv")
store = pd.read_csv("data/store.csv") 

  interactivity=interactivity, compiler=compiler, result=result)


### Data exploration

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 637774 entries, 0 to 637773
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           637774 non-null  object 
 1   Store          618473 non-null  float64
 2   DayOfWeek      618757 non-null  float64
 3   Sales          618747 non-null  float64
 4   Customers      618683 non-null  float64
 5   Open           618588 non-null  float64
 6   Promo          618580 non-null  float64
 7   StateHoliday   618520 non-null  object 
 8   SchoolHoliday  618437 non-null  float64
dtypes: float64(7), object(2)
memory usage: 43.8+ MB


In [5]:
store.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   PromoInterval              571 non-null    object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB


### Missing value in Store.csv

In [6]:
train['Date'] = pd.to_datetime(train['Date'])

In [7]:
def fillna_mean(df,columns):
    for col in columns:
        mean_value = int(df[col].mean())
        df.loc[:,col].fillna(value=mean_value,inplace=True)
    return df

def fillna_most(df,columns):
    for col in columns:
        most_value = df[col].value_counts().idxmax()
        df.loc[:,col].fillna(value=most_value,inplace=True)
    return df
columns_mean = ['CompetitionOpenSinceMonth',
                'CompetitionOpenSinceYear',
                'CompetitionDistance',
                'Promo2SinceWeek',
                'Promo2SinceYear'
                 ] 

store = fillna_mean(store,columns_mean)

columns_most = ['PromoInterval']
stor = fillna_most(store,columns_most)


store.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1115 non-null   float64
 4   CompetitionOpenSinceMonth  1115 non-null   float64
 5   CompetitionOpenSinceYear   1115 non-null   float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            1115 non-null   float64
 8   Promo2SinceYear            1115 non-null   float64
 9   PromoInterval              1115 non-null   object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB


Drop rows that has no sales (zero or null value)

In [8]:
train.dropna(subset=['Sales'],inplace=True)
train.drop(columns=['Customers'],inplace=True)
train.dropna(subset=['Store'],inplace=True) 

train= train[train['Sales']>0]

### Missing value in train.csv


In [9]:
columns_mean = ['DayOfWeek']
train = fillna_mean(train,columns_mean)

columns_most = ['Promo','SchoolHoliday','StateHoliday']
train = fillna_most(train,columns_most)

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497376 entries, 27 to 637773
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Date           497376 non-null  datetime64[ns]
 1   Store          497376 non-null  float64       
 2   DayOfWeek      497376 non-null  float64       
 3   Sales          497376 non-null  float64       
 4   Open           482366 non-null  float64       
 5   Promo          497376 non-null  float64       
 6   StateHoliday   497376 non-null  object        
 7   SchoolHoliday  497376 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 34.2+ MB


Merge 'train' and 'store' together

In [10]:
train_full = pd.merge(train,store, on='Store', how='inner')

In [11]:
train_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497376 entries, 0 to 497375
Data columns (total 17 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   Date                       497376 non-null  datetime64[ns]
 1   Store                      497376 non-null  float64       
 2   DayOfWeek                  497376 non-null  float64       
 3   Sales                      497376 non-null  float64       
 4   Open                       482366 non-null  float64       
 5   Promo                      497376 non-null  float64       
 6   StateHoliday               497376 non-null  object        
 7   SchoolHoliday              497376 non-null  float64       
 8   StoreType                  497376 non-null  object        
 9   Assortment                 497376 non-null  object        
 10  CompetitionDistance        497376 non-null  float64       
 11  CompetitionOpenSinceMonth  497376 non-null  float64 

### Data Cleaning


In [12]:
train_full['Open'].value_counts()

1.0    482366
Name: Open, dtype: int64

In [13]:
train_full.drop(columns=['Open'],inplace=True)

In [14]:
train_full['StateHoliday'].value_counts()

0      445761
0.0     51088
a         401
b          91
c          35
Name: StateHoliday, dtype: int64

In [15]:
train_full.loc[train_full['StateHoliday']==0,'StateHoliday']=0
train_full.loc[train_full['StateHoliday']=='0','StateHoliday']=0

train_full['StateHoliday'].value_counts()

0    496849
a       401
b        91
c        35
Name: StateHoliday, dtype: int64

### Define new feutures

In [16]:
train_full['CompetitionOpen'] = 12*(train_full.loc[:,'Date'].dt.year.max()-train_full.loc[:,'CompetitionOpenSinceYear'])-train_full.loc[:,'CompetitionOpenSinceMonth']
    
train_full['Promo2Open'] = 52*(train_full.loc[:,'Date'].dt.year.max()-train_full.loc[:,'Promo2SinceYear'])-train_full.loc[:,'Promo2SinceWeek']

In [17]:
freq = train_full.groupby('Store').size()/len(train_full)
train_full.loc[:,'Store_freq'] = train_full.loc[:,'Store'].map(freq)

### Dealing with categorical data

In [18]:
def get_dummies(df,columns):
    for col in columns:
        if col in df.columns:
            df = pd.get_dummies(df, columns = [col])
    return df

train_full = get_dummies(train_full,['StateHoliday','StoreType','Assortment','PromoInterval'])

In [19]:
train_full

Unnamed: 0,Date,Store,DayOfWeek,Sales,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,...,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec"
0,2013-01-01,353.0,2.0,3139.0,0.0,1.0,900.0,7.0,2008.0,1,...,0,1,0,0,0,1,0,1,0,0
1,2013-01-02,353.0,3.0,2686.0,0.0,1.0,900.0,7.0,2008.0,1,...,0,1,0,0,0,1,0,1,0,0
2,2013-01-03,353.0,4.0,2628.0,0.0,1.0,900.0,7.0,2008.0,1,...,0,1,0,0,0,1,0,1,0,0
3,2013-01-04,353.0,5.0,2677.0,0.0,1.0,900.0,7.0,2008.0,1,...,0,1,0,0,0,1,0,1,0,0
4,2013-01-05,353.0,6.0,2224.0,0.0,0.0,900.0,7.0,2008.0,1,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497371,2014-07-27,1081.0,7.0,4974.0,0.0,0.0,400.0,3.0,2006.0,0,...,0,1,0,0,1,0,0,0,1,0
497372,2014-07-28,1081.0,1.0,7900.0,1.0,1.0,400.0,3.0,2006.0,0,...,0,1,0,0,1,0,0,0,1,0
497373,2014-07-29,1081.0,2.0,7563.0,1.0,1.0,400.0,3.0,2006.0,0,...,0,1,0,0,1,0,0,0,1,0
497374,2014-07-30,1081.0,3.0,6514.0,1.0,1.0,400.0,3.0,2006.0,0,...,0,1,0,0,1,0,0,0,1,0


In [20]:
train_full.head()

Unnamed: 0,Date,Store,DayOfWeek,Sales,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,...,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec"
0,2013-01-01,353.0,2.0,3139.0,0.0,1.0,900.0,7.0,2008.0,1,...,0,1,0,0,0,1,0,1,0,0
1,2013-01-02,353.0,3.0,2686.0,0.0,1.0,900.0,7.0,2008.0,1,...,0,1,0,0,0,1,0,1,0,0
2,2013-01-03,353.0,4.0,2628.0,0.0,1.0,900.0,7.0,2008.0,1,...,0,1,0,0,0,1,0,1,0,0
3,2013-01-04,353.0,5.0,2677.0,0.0,1.0,900.0,7.0,2008.0,1,...,0,1,0,0,0,1,0,1,0,0
4,2013-01-05,353.0,6.0,2224.0,0.0,0.0,900.0,7.0,2008.0,1,...,0,1,0,0,0,1,0,1,0,0


### Split Data: Train and Test

In [21]:
X = train_full.drop(columns=['Sales','Date','Store','CompetitionOpenSinceYear','CompetitionOpenSinceMonth','Promo2SinceYear','Promo2SinceWeek'])
y = train_full['Sales']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
                                                    X, y, test_size=0.33)

### Models:

### Random Forest

In [24]:
from sklearn.ensemble import RandomForestRegressor 
rf = RandomForestRegressor(max_depth=30)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)


In [25]:
rf_rmspe = metric(predictions,y_test.to_numpy())

In [26]:
rf_rmspe

19.190889248625627

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [30, 50, 80],
    'max_features': [3, 5,7],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 500]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)


In [None]:
grid_search.best_estimator_

In [None]:

best_grid = grid_search.best_estimator_
predictions = best_grid.predict(X_test)

rf_rmspe = metric(predictions,y_test.to_numpy())