# predicting Rossman sale


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
train = pd.read_csv("data/train.csv")
store = pd.read_csv("data/store.csv") 

  interactivity=interactivity, compiler=compiler, result=result)


### Data exploration

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 637774 entries, 0 to 637773
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           637774 non-null  object 
 1   Store          618473 non-null  float64
 2   DayOfWeek      618757 non-null  float64
 3   Sales          618747 non-null  float64
 4   Customers      618683 non-null  float64
 5   Open           618588 non-null  float64
 6   Promo          618580 non-null  float64
 7   StateHoliday   618520 non-null  object 
 8   SchoolHoliday  618437 non-null  float64
dtypes: float64(7), object(2)
memory usage: 43.8+ MB


In [4]:
store.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   PromoInterval              571 non-null    object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB


### Missing value in Store.csv

In [5]:
CompMonth_mean = int(store['CompetitionOpenSinceMonth'].mean())
CompYear_mean = int(store['CompetitionOpenSinceYear'].mean())
CompDist_mean = int(store['CompetitionDistance'].mean())
Prom2Week_mean = int(store['Promo2SinceWeek'].mean())
Prom2Year_mean = int(store['Promo2SinceYear'].mean())
PromInterval_most = store['PromoInterval'].value_counts()[0]

store.loc[:,'CompetitionOpenSinceMonth'].fillna(value=CompMonth_mean,inplace=True)
store.loc[:,'CompetitionOpenSinceYear'].fillna(value=CompYear_mean,inplace=True)
store.loc[:,'CompetitionDistance'].fillna(value=CompYear_mean,inplace=True)
store.loc[:,'Promo2SinceWeek'].fillna(value=Prom2Week_mean,inplace=True)
store.loc[:,'Promo2SinceYear'].fillna(value=Prom2Year_mean,inplace=True)
store.loc[:,'PromoInterval'].fillna(value=PromInterval_most,inplace=True)

store.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1115 non-null   float64
 4   CompetitionOpenSinceMonth  1115 non-null   float64
 5   CompetitionOpenSinceYear   1115 non-null   float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            1115 non-null   float64
 8   Promo2SinceYear            1115 non-null   float64
 9   PromoInterval              1115 non-null   object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB


Drop rows that has no sales (zero or null value)

In [6]:
train.dropna(subset=['Sales'],inplace=True)
train.drop(columns=['Customers'],inplace=True)
train.dropna(subset=['Store'],inplace=True) 

train= train[train['Sales']>0]

### Missing value in train.csv


In [7]:
Day_mean = int(train['DayOfWeek'].mean())
Promo = train['Promo'].value_counts()[0]
SchHoli_mode= train['SchoolHoliday'].value_counts()[0]
StatHoli_mode = train['StateHoliday'].value_counts()[0]


train.loc[:,'DayOfWeek'].fillna(value=Day_mean,inplace=True)
train.loc[:,'Promo'].fillna(value=Promo,inplace=True)
train.loc[:,'SchoolHoliday'].fillna(value=SchHoli_mode,inplace=True)
train.loc[:,'StateHoliday'].fillna(value=StatHoli_mode,inplace=True)

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497376 entries, 27 to 637773
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           497376 non-null  object 
 1   Store          497376 non-null  float64
 2   DayOfWeek      497376 non-null  float64
 3   Sales          497376 non-null  float64
 4   Open           482366 non-null  float64
 5   Promo          497376 non-null  float64
 6   StateHoliday   497376 non-null  object 
 7   SchoolHoliday  497376 non-null  float64
dtypes: float64(6), object(2)
memory usage: 34.2+ MB


Merge 'train' and 'store' together

In [8]:
train_full = pd.merge(train,store, on='Store', how='inner')

In [9]:
train_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497376 entries, 0 to 497375
Data columns (total 17 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Date                       497376 non-null  object 
 1   Store                      497376 non-null  float64
 2   DayOfWeek                  497376 non-null  float64
 3   Sales                      497376 non-null  float64
 4   Open                       482366 non-null  float64
 5   Promo                      497376 non-null  float64
 6   StateHoliday               497376 non-null  object 
 7   SchoolHoliday              497376 non-null  float64
 8   StoreType                  497376 non-null  object 
 9   Assortment                 497376 non-null  object 
 10  CompetitionDistance        497376 non-null  float64
 11  CompetitionOpenSinceMonth  497376 non-null  float64
 12  CompetitionOpenSinceYear   497376 non-null  float64
 13  Promo2                     49

### Data Cleaning


In [10]:
train_full['Open'].value_counts()

1.0    482366
Name: Open, dtype: int64

In [11]:
train_full.drop(columns=['Open'],inplace=True)

In [12]:
train_full['StateHoliday'].value_counts()

0        430730
0.0       51088
51088     15031
a           401
b            91
c            35
Name: StateHoliday, dtype: int64

In [13]:
train_full.loc[train_full['StateHoliday']==0,'StateHoliday']=0
train_full.loc[train_full['StateHoliday']=='0','StateHoliday']=0

train_full['StateHoliday'].value_counts()

0        481818
51088     15031
a           401
b            91
c            35
Name: StateHoliday, dtype: int64

In [14]:
train_full.corr()['Sales']

Store                        0.010161
DayOfWeek                   -0.172114
Sales                        1.000000
Promo                       -0.000459
SchoolHoliday               -0.002114
CompetitionDistance         -0.039900
CompetitionOpenSinceMonth   -0.035089
CompetitionOpenSinceYear     0.015061
Promo2                      -0.129436
Promo2SinceWeek              0.057762
Promo2SinceYear             -0.060837
Name: Sales, dtype: float64

### Dealing with categorical data

In [15]:
train_full = pd.get_dummies(train_full, columns = ['StateHoliday'])
train_full = pd.get_dummies(train_full, columns = ['StoreType'])
train_full = pd.get_dummies(train_full, columns = ['Assortment'])

In [16]:
train_full.head()

Unnamed: 0,Date,Store,DayOfWeek,Sales,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,...,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c
0,2013-01-01,353.0,2.0,3139.0,0.0,1.0,900.0,7.0,2008.0,1,...,1,0,0,0,1,0,0,0,1,0
1,2013-01-02,353.0,3.0,2686.0,0.0,1.0,900.0,7.0,2008.0,1,...,0,0,0,0,1,0,0,0,1,0
2,2013-01-03,353.0,4.0,2628.0,0.0,1.0,900.0,7.0,2008.0,1,...,0,0,0,0,1,0,0,0,1,0
3,2013-01-04,353.0,5.0,2677.0,0.0,1.0,900.0,7.0,2008.0,1,...,0,0,0,0,1,0,0,0,1,0
4,2013-01-05,353.0,6.0,2224.0,0.0,0.0,900.0,7.0,2008.0,1,...,0,0,0,0,1,0,0,0,1,0


### Split Data: Train and Test

In [17]:
X = train_full.drop(columns=['Sales','Date'])
y = train_full['Sales']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
                                                    X, y, test_size=0.33)

#### Missing Values

In [19]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333241 entries, 146244 to 216809
Data columns (total 23 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Store                      333241 non-null  float64
 1   DayOfWeek                  333241 non-null  float64
 2   Promo                      333241 non-null  float64
 3   SchoolHoliday              333241 non-null  float64
 4   CompetitionDistance        333241 non-null  float64
 5   CompetitionOpenSinceMonth  333241 non-null  float64
 6   CompetitionOpenSinceYear   333241 non-null  float64
 7   Promo2                     333241 non-null  int64  
 8   Promo2SinceWeek            333241 non-null  float64
 9   Promo2SinceYear            333241 non-null  float64
 10  PromoInterval              333241 non-null  object 
 11  StateHoliday_0             333241 non-null  uint8  
 12  StateHoliday_51088         333241 non-null  uint8  
 13  StateHoliday_a          