# Forecast sales of Rossmann Pharmaceuticals Store

### Project  
* Rossmann operates over 3,000 drug stores in 7 European countries. This project tasked with predicting their daily sales for up to six weeks in advance. Store sales are influenced by many factors, including promotions, competition, school and state holidays, seasonality, and locality.

### Data
* Data Taken from: Rossmann Store Sales[https://www.kaggle.com/competitions/rossmann-store-sales/data]


In [1]:
# import Basic libraries 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')

In [2]:
#get working directory 
import os, sys
sys.path.append(os.path.abspath('..'))

In [3]:
from scripts.data_summary import summary_statistics
from scripts.data_generator import load_store_data, load_train_data, sort_train_data, merge_data

### Load Data

In [4]:
train_data = load_train_data('../data/train.csv', 'Date')
store_data = load_store_data('../data/store.csv')

Data Preview

In [5]:
# Preview of Train data
train_data.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [6]:
# preview of store Data
store_data.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


Summary statistics of the Data

In [7]:
# Train Data summary Statistics
summary_statistics(train_data) 

Shape of the Data
(1017209, 9)


data Summary
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype         
---  ------         --------------    -----         
 0   Store          1017209 non-null  int64         
 1   DayOfWeek      1017209 non-null  int64         
 2   Date           1017209 non-null  datetime64[ns]
 3   Sales          1017209 non-null  int64         
 4   Customers      1017209 non-null  int64         
 5   Open           1017209 non-null  int64         
 6   Promo          1017209 non-null  int64         
 7   StateHoliday   1017209 non-null  object        
 8   SchoolHoliday  1017209 non-null  int64         
dtypes: datetime64[ns](1), int64(7), object(1)
memory usage: 69.8+ MB
None


Descriptive analysis for numerical Column
              Store     DayOfWeek                           Date  \
count  1.017209e+06  1.017209e+06                        1017209   
mean

In [8]:
# store data summary
summary_statistics(store_data)

Shape of the Data
(1115, 10)


data Summary
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   PromoInterval              571 non-null    object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB
None


Descriptive analysis for numerical Column
            Store  CompetitionDistance  CompetitionOpenSinceMonth

### Clean Data

#### Train Data

Check for missing Data

In [9]:
train_data.isnull().sum()

Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

In [10]:
# Check Data duplication
train_data.duplicated().value_counts()

False    1017209
Name: count, dtype: int64

In [11]:
# Check Numerical And Categorical
cat_col = [col for col in train_data.columns if train_data[col].dtype == 'object']
print('Categorical columns: ', cat_col)
num_col = [col for col in train_data.columns if train_data[col].dtype != 'object']
print("Numerical colums: ", num_col)

Categorical columns:  ['StateHoliday']
Numerical colums:  ['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'SchoolHoliday']


**Sort Train Data first by Store and Then by Data**

In [12]:
train_data.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [13]:
# Sort data first by store then by date
sort_train = sort_train_data(train_data)

In [26]:
sort_train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
1016095,1,2,2013-01-01,0,0,0,0,a,1
1014980,1,3,2013-01-02,5530,668,1,0,0,1
1013865,1,4,2013-01-03,4327,578,1,0,0,1
1012750,1,5,2013-01-04,4486,619,1,0,0,1
1011635,1,6,2013-01-05,4997,635,1,0,0,1


#### Store Data

In [15]:
store_data.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


Check missing Value

In [16]:
store_data.isnull().sum().sort_values(ascending=False)

PromoInterval                544
Promo2SinceYear              544
Promo2SinceWeek              544
CompetitionOpenSinceYear     354
CompetitionOpenSinceMonth    354
CompetitionDistance            3
StoreType                      0
Store                          0
Assortment                     0
Promo2                         0
dtype: int64

Drop Column non-interest columns

In [17]:
non_interest_col = ['PromoInterval', 'Promo2SinceYear', 'Promo2SinceWeek', 'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth']
store_data.drop(columns = [col for col in non_interest_col if col in store_data.columns], inplace = True)

In [18]:
store_data.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,Promo2
0,1,c,a,1270.0,0
1,2,a,a,570.0,1
2,3,a,a,14130.0,1
3,4,c,c,620.0,0
4,5,a,a,29910.0,0


In [19]:
# check missing Value in cleared Store data
store_data.isnull().sum()

Store                  0
StoreType              0
Assortment             0
CompetitionDistance    3
Promo2                 0
dtype: int64

In [20]:
# Fill Missing value in CompetitionDistance by mean Value
store_data['CompetitionDistance'].fillna(store_data['CompetitionDistance'].mean(), inplace=True)
store_data['CompetitionDistance'].isnull().sum()


np.int64(0)

### Merge Store Data and Train Data
* Merge the two data to get more information at once 

In [21]:
# Merge Store and Train Data on store
processed_data = merge_data(train_data, store_data)

In [22]:
processed_data.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2
0,1,2,2013-01-01,0,0,0,0,a,1,c,a,1270.0,0
1,1,3,2013-01-02,5530,668,1,0,0,1,c,a,1270.0,0
2,1,4,2013-01-03,4327,578,1,0,0,1,c,a,1270.0,0
3,1,5,2013-01-04,4486,619,1,0,0,1,c,a,1270.0,0
4,1,6,2013-01-05,4997,635,1,0,0,1,c,a,1270.0,0


In [23]:
# Check missing Value in Processed Data
processed_data.isnull().sum()

Store                  0
DayOfWeek              0
Date                   0
Sales                  0
Customers              0
Open                   0
Promo                  0
StateHoliday           0
SchoolHoliday          0
StoreType              0
Assortment             0
CompetitionDistance    0
Promo2                 0
dtype: int64

In [24]:
# Save Processed data 
processed_data.to_csv('../data/processed_data.csv', index=False)