In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the four competition files in one pass; the unpacking order matches
# the order of the paths below.
data, store, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/rossmann-store-sales/train.csv',
        '/kaggle/input/rossmann-store-sales/store.csv',
        '/kaggle/input/rossmann-store-sales/test.csv',
        '/kaggle/input/rossmann-store-sales/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the frame loaded from train.csv.
data.head()

In [None]:
#store.head()

In [None]:
# Preview the first rows of the frame loaded from test.csv.
test.head()

In [None]:
## Customers column cannot be used directly

### Exploratory Data Analysis

In [None]:
# Look at the training rows again as the starting point for the EDA section.
data.head()

In [None]:
## Data types
## Categorical: StateHoliday, SchoolHoliday, Promo, Open, Date??, Store, DayOfWeek
## Numerical: Sales, Customers
## Dates: Date
## Geographical: No

In [None]:
## Numerical (univariate) - histogram (outliers, which bin has max val, density/skewness), boxplot (median, IQR, outliers)
## Categorical (univariate) - Frequency (pareto analysis), outliers
## Categorical vs Numerical (Bivariate) - Barchart, boxplot
## Categorical vs Categorical (Bivariate) - Heatmap (crosstab)
## Numerical vs Numerical (Bivariate): Correlation, pairplot, scatterplot
## Categorical, Categorical, Numerical (Multivariate) - Boxplot
## Date vs Numerical (Bivariate) - Line chart, Calendarmap
## Geographical vs Numerical (Bivariate) Map, Cartograms, bar charts

In [None]:
### Reasons to do EDA
# Understand the data
# Understand the process
# Missing value treatment, Outlier treatment
# Feature engineering

In [None]:
## Categorical columns (Frequency Analysis)
col = 'StateHoliday'
levels = data[col]
print(levels.nunique())
print(levels.unique()[:10])
# Percentage of all rows taken by each level; plot at most 50 bars.
pct_by_level = levels.value_counts() / len(data) * 100
pct_by_level.head(50).plot.bar()

In [None]:
## Categorical columns (Frequency Analysis)
col = 'SchoolHoliday'
levels = data[col]
print(levels.nunique())
print(levels.unique()[:10])
# Percentage of all rows taken by each level; plot at most 50 bars.
pct_by_level = levels.value_counts() / len(data) * 100
pct_by_level.head(50).plot.bar()

In [None]:
## Categorical columns (Frequency Analysis)
col = 'Promo'
levels = data[col]
print(levels.nunique())
print(levels.unique()[:10])
# Percentage of all rows taken by each level; plot at most 50 bars.
pct_by_level = levels.value_counts() / len(data) * 100
pct_by_level.head(50).plot.bar()

In [None]:
## Numerical columns
# `col` names the column that is actually plotted, so the label and the
# histogram cannot drift apart.
col = 'Customers'
data[col].plot.hist()

In [None]:
# Skewness of log(Customers + 1); the +1 keeps zero-customer rows finite.
log_customers = np.log(data['Customers'].add(1))
log_customers.skew()

In [None]:
# Which Sales values occur on rows where the store was closed (Open == 0)?
is_closed = data['Open'].eq(0)
closed_days = data.loc[is_closed]
closed_days['Sales'].unique()

In [None]:
# Earliest and latest Date value in the training frame.
train_dates = data['Date']
train_dates.min(), train_dates.max()

In [None]:
# Earliest and latest Date value in the test frame.
test_dates = test['Date']
test_dates.min(), test_dates.max()

## Bivariate Analysis

In [None]:
# Number of training rows recorded for each Store value.
data['Store'].value_counts()

In [None]:
# Monthly sales trend for one hand-picked store.
storeid = 1023
print(storeid)
# Parse the dates once and derive calendar parts; the pivot-table and boxplot
# cells further down read the 'month' and 'day' columns created here.
data['Date']  = pd.to_datetime(data['Date'], format='%Y-%m-%d')
data['month'] = data['Date'].dt.month
data['day'] = data['Date'].dt.day
store_data = data[data['Store'] == storeid]
# Alternative view (daily totals as bars):
#store_data.groupby(['Date'])['Sales'].sum().plot.bar(figsize=(20,4))
monthly_sales = store_data.resample(on='Date', rule='1M')['Sales'].sum()
monthly_sales.plot.line(figsize=(20,4))

In [None]:
## Correlation Analysis
# Correlate the numeric columns only. The frame also holds object/datetime
# columns (StateHoliday, Date); pandas >= 2.0 raises on those instead of
# silently dropping them, so select the numeric ones explicitly.
numeric_data = data.select_dtypes(include='number')
numeric_data.corr().abs().style.background_gradient(cmap='Greens')

In [None]:
## Categorical vs Categorical vs Numerical
import matplotlib.pyplot as plt

# Mean Sales for every (month, day-of-month) combination.
summary = data.pivot_table(index='month',
                           columns='day',
                           values='Sales',
                           aggfunc='mean')

plt.figure(figsize=(20,4))
sns.heatmap(summary, cmap='Greens')

In [None]:
# Numerical vs numerical: one point per training row, Customers against Sales.
sns.scatterplot(data=data, x='Customers', y='Sales')

In [None]:
# Total Sales per store for each day of the month.
# Reads the training frame `data` and its 'Sales' column (capitalised as in
# every other cell); 'day' is the day-of-month column derived from 'Date'.
data.groupby(['Store', 'day'])['Sales'].sum()

In [None]:
## Box plot analysis
# Distribution of Sales for each value of the derived 'month' column.
sns.boxplot(data=data, y='Sales', x='month')

In [None]:
# Total Sales across all stores for each calendar day.
# resample() needs a datetime-like key, so bucket on the parsed 'Date' column;
# the integer day-of-month column 'day' cannot be resampled.
data.resample(on='Date', rule='1D')['Sales'].sum()

In [None]:
## Outliers
## Duplicate
## Summary

In [None]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()
# No duplicates

In [None]:
# Box plot of Customers to eyeball outliers.
data['Customers'].plot.box()

In [None]:
# Summary statistics restricted to the object-dtype columns.
data.describe(include='object')

In [None]:
# Cardinality, distinct values and per-value counts of StateHoliday.
state_holiday = data['StateHoliday']
for report in (state_holiday.nunique(),
               state_holiday.unique(),
               state_holiday.value_counts()):
    print(report)

## Data types
1. Categories
2. Numerical
3. Dates
    - day, month, year, hour, minute, dayofweek, quarter
    - afternoon, morning, evening, night
4. Geographical columns (City)
    - State, Country, Province, latitude & longitude
    - avg temperature, population, spending power, gdp
5. Text columns (ex: customer reviews, tweets)
    - sentiment analysis, Document Term Matrix/ Word embedding (word2vec)
6. Misc -
    - ids, phone numbers, email ids


In [None]:
def features_dates(data, date_col):
    """Parse a date column and add 'day', 'month' and 'year' columns.

    The frame is modified in place: ``date_col`` is replaced by its
    ``%Y-%m-%d``-parsed datetime version and the three calendar parts are
    written as new columns (or overwrite existing ones of the same name).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``date_col``.
    date_col : str
        Name of the column holding the dates.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    data[date_col] = pd.to_datetime(data[date_col], format='%Y-%m-%d')
    # Same column order as before: day, then month, then year.
    for part in ('day', 'month', 'year'):
        data[part] = getattr(data[date_col].dt, part)
    return data
def conv_stateholiday(data):
    """Encode the 'StateHoliday' column as integers 0-3, in place.

    Raw values ``'0'``/``0`` become 0 and ``'a'``, ``'b'``, ``'c'`` become
    1, 2, 3. Already-encoded values 1, 2, 3 map to themselves, so running
    the function (or the notebook cell that calls it) a second time leaves
    the column unchanged instead of turning the encoded values into NaN.
    Any value outside the mapping becomes NaN, as with ``Series.map``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'StateHoliday' column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    codes = {
        # raw values as read from the CSV (the zero shows up as str and int)
        '0': 0,
        0: 0,
        'a': 1,
        'b': 2,
        'c': 3,
        # identity entries keep a repeated call idempotent
        1: 1,
        2: 2,
        3: 3,
    }
    data['StateHoliday'] = data['StateHoliday'].map(codes)
    return data

In [None]:
# Inspect the StateHoliday column as it currently stands in `data`.
data['StateHoliday']

In [None]:
data_new = features_dates(data, 'Date')
data_new = conv_stateholiday(data_new)
target_col = 'Sales'
# Columns kept out of the model inputs:
#   - the target itself;
#   - the raw 'Date' (already expanded into day / month / year);
#   - 'Customers', which this notebook's own earlier note says cannot be used
#     directly. NOTE(review): presumably because it is not known at
#     prediction time - confirm against test.csv.
excluded_cols = [target_col, 'Date', 'Customers']
# Read the column list from the engineered frame rather than from `data`, so
# the selection does not depend on the helpers mutating `data` in place.
input_cols = data_new.columns.drop(excluded_cols)

In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split
# is the same on every run.
features = data_new[input_cols]
target = data_new[target_col]
train_x, test_x, train_y, test_y = train_test_split(
    features, target, test_size=0.2, random_state=1
)

# Fit a depth-limited regression tree and predict the held-out rows.
model = DecisionTreeRegressor(max_depth=6, random_state=1)
model.fit(train_x, train_y)
test_y_pred = model.predict(test_x)

# Hold-out error (RMSE) and explained variance (R^2).
mse = mean_squared_error(test_y, test_y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(test_y, test_y_pred)
print('rmse', rmse)
print('r2 score', r2)

In [None]:
# Mean of the training target, as a scale reference for the RMSE above.
train_y.mean()

In [None]:
# The fitted tree is rendered further down, after `draw_tree` has been defined;
# calling it at this point raises NameError on a fresh kernel (Restart & Run All).

In [None]:
## Add new columns: feature engineering
## Use store data, missing value treatment
## Outlier treatment
## Hyper parameter tuning with cross validation result

In [None]:
# %pip installs into the environment of the running kernel, whereas a bare
# `!pip` may resolve to a different interpreter's pip.
%pip install pydotplus

In [None]:
def draw_tree(model, columns,
              graphviz_path=r'C:\Program Files (x86)\Graphviz2.38/bin/'):
    """Render a fitted decision tree as a PNG image for inline display.

    Parameters
    ----------
    model : fitted tree estimator
        Passed straight to ``sklearn.tree.export_graphviz``.
    columns : iterable of str
        Feature names used to label the split nodes.
    graphviz_path : str, optional
        Directory containing the Graphviz executables. It is appended to
        ``PATH`` only if the directory exists and is not already listed, so
        repeated calls do not keep growing ``PATH`` and the Windows default
        is skipped on machines where it does not exist.

    Returns
    -------
    IPython.display.Image
        PNG rendering of the tree.
    """
    import os
    import pydotplus
    from IPython.display import Image
    from sklearn import tree

    path_entries = [p for p in os.environ.get("PATH", "").split(os.pathsep) if p]
    if (graphviz_path
            and os.path.isdir(graphviz_path)
            and graphviz_path not in path_entries):
        os.environ["PATH"] = os.pathsep.join(path_entries + [graphviz_path])

    # out_file=None makes export_graphviz return the DOT source as a string,
    # which removes the need for a StringIO buffer (the old
    # sklearn.externals.six import no longer exists in current scikit-learn).
    dot_source = tree.export_graphviz(model,
                                      out_file=None,
                                      feature_names=list(columns))
    graph = pydotplus.graph_from_dot_data(dot_source)
    return Image(graph.create_png())

In [None]:
# Render the fitted tree, labelling splits with the training column names.
draw_tree(model, train_x.columns)

In [None]:
# Worked Gini-impurity example for one split: impurity = 1 - sum(p_k ** 2).
gini_left = 1 - np.square(3/4) - np.square(1/4)     # child with class shares 3/4 and 1/4
gini_right = 1 - np.square(2/6) - np.square(4/6)    # child with class shares 2/6 and 4/6

# Size-weighted impurity of the split (4 of 10 rows go left, 6 of 10 go right).
# Uses the computed impurities instead of hand-copied, rounded constants.
weighted_gini = 4/10 * gini_left + 6/10 * gini_right
weighted_gini