## Imports

In [1]:
import sys
sys.path.append('../')
import gc
import pandas as pd
import numpy as np

from src.fetch_data import DataLoader
from src.exploration import Analysis
from src.cleaning import CleanDataFrame
from src.visualization import Plotters
from src.processing import PreProcess


import matplotlib.pyplot as plt



cleaner = CleanDataFrame()
analyzer = Analysis()
preprocessor = PreProcess()
plotters = Plotters(w=6, h=4)

# pd.options.plotting.backend = 'matplotlib'
plt.rcParams.update({'font.size': 24})
# plt.rcParams.update({'xtick.labelsize': 'large'})
# plt.rcParams.update({'ytick.labelsize': 'large'})
plt.rcParams.update({'legend.fontsize': 24})
%matplotlib inline
# plt.rcParams.keys()



## Fetch the dataframes

In [2]:
def _load_split(path, rev, source):
    """Read one CSV through DataLoader.dvc_get_data and parse its `Date` column.

    `path`, `rev` and `source` are forwarded unchanged as the three positional
    arguments of `DataLoader.dvc_get_data`.
    """
    frame = DataLoader.dvc_get_data(path, rev, source)
    frame['Date'] = pd.to_datetime(frame['Date'])
    return frame


# Both splits are read at the same version from the same repository.
version = 'merged_v2'
repo = '../'

# Training data
data_path = 'data/merged/train.csv'
train_df = _load_split(data_path, version, repo)

# Test data
data_path = 'data/merged/test.csv'
test_df = _load_split(data_path, version, repo)

  df = pd.read_csv(io.StringIO(content), sep=",")
DataLoaderLogger - INFO - DVC: CSV file read with path: data/merged/train.csv | version: merged_v2 | from: ../
DataLoaderLogger - INFO - DVC: CSV file read with path: data/merged/test.csv | version: merged_v2 | from: ../


# Re-save the data without the stray `Unnamed: 0` column

In [5]:
# Write both frames back to the merged CSVs without the 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv call -- TODO confirm).
# errors='ignore' keeps the cell re-runnable once that column is already gone;
# index=False stops the same stray column from being written again.
# NOTE(review): this overwrites the files under ../data/merged/ in place.
train_df.drop(columns=['Unnamed: 0'], errors='ignore').to_csv('../data/merged/train.csv', index=False)
test_df.drop(columns=['Unnamed: 0'], errors='ignore').to_csv('../data/merged/test.csv', index=False)

# Features correlation to target

In [3]:
# Correlation of every numeric column with the `Sales` target, strongest first.
# numeric_only=True keeps non-numeric columns (e.g. the datetime `Date`) out of
# the computation; pandas >= 2.0 no longer skips them by default and raises.
train_df.corr(numeric_only=True)["Sales"].sort_values(ascending=False)


Sales                       1.000000
Customers                   0.894711
Open                        0.678472
Promo                       0.452345
SchoolHoliday               0.085124
Store                       0.005126
Unnamed: 0                  0.005103
CompetitionDistance        -0.018869
CompetitionOpenSinceYear   -0.038540
Promo2SinceWeek            -0.044143
Promo2                     -0.091040
Promo2SinceYear            -0.091056
DayOfWeek                  -0.462125
Name: Sales, dtype: float64

Let me first check the test data to see what my models will be inferring on.

In [4]:
# Summarise the test frame: column names, non-null counts and dtypes.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41088 entries, 0 to 41087
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Unnamed: 0                 41088 non-null  int64         
 1   Id                         41088 non-null  int64         
 2   Store                      41088 non-null  int64         
 3   DayOfWeek                  41088 non-null  int64         
 4   Date                       41088 non-null  datetime64[ns]
 5   Open                       41077 non-null  float64       
 6   Promo                      41088 non-null  int64         
 7   StateHoliday               41088 non-null  object        
 8   SchoolHoliday              41088 non-null  int64         
 9   StoreType                  41088 non-null  object        
 10  Assortment                 41088 non-null  object        
 11  CompetitionDistance        41088 non-null  float64       
 12  Comp

The number of customers (`Customers`) has the highest correlation with `Sales`, but we don't have that column in our test data. So, first I will keep only the columns that are present in the test data. Some columns exist in the training data but not in the test data, so I will drop those.

All of this is implemented in the `PreProcess` class.

In [5]:
# Build a set of column names for each split so they can be compared.
train_columns = set(train_df.columns)
test_colunns = set(test_df.columns)

# Sizes of the two sets: test first, then train.
print(len(test_colunns), len(train_columns))

# Columns present in both splits, and how many there are.
common_columns = test_colunns & train_columns
print(len(common_columns))

17 22
16


In [6]:
# train_df.info()

In [7]:
# common_columns

In [8]:
# train_columns.difference(test_colunns)

These are the columns that are common to both datasets. Other than the `Customers` column, the others will be recreated next. I'm just removing them from the data to help me test the pre-processing pipeline I am building.


In [9]:
# Keep only the columns that also exist in the test data.
# Re-binding the name (instead of inplace=True) together with errors="ignore"
# lets this cell be re-run without a KeyError once the columns are already gone
# (train_columns is a snapshot taken before the drop).
# NOTE(review): the dropped set is every train-only column, which presumably
# includes the `Sales` target -- confirm that a later step restores it.
train_df = train_df.drop(
    columns=list(train_columns.difference(test_colunns)),
    errors="ignore",
)



- Raw data comes in
- Feature engineering
- Dropping columns
- Encoding categoricals
- Scaling


I have created a module for feature engineering, which I will utilize here.

In [10]:
# Run the project's PreProcess.transform on the training frame and keep the
# result under a new name. Per the logged output it drops columns that are not
# in the test data and adds engineered feature columns.
preped_train_df = preprocessor.transform(train_df)

PreProcessorLogger - INFO - Dropped 16 columns since they are not in the test data
PreProcessorLogger - INFO - 9 new columns added to the dataframe
PreProcessorLogger - INFO - Feature enginerring completed


In [11]:
# Inspect the pre-processed frame: column names, non-null counts and dtypes.
preped_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 27 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   Assortment                 1017209 non-null  object        
 1   CompetitionDistance        1017209 non-null  float64       
 2   CompetitionOpenSinceMonth  693861 non-null   float64       
 3   CompetitionOpenSinceYear   693861 non-null   float64       
 4   Date                       1017209 non-null  datetime64[ns]
 5   DayOfWeek                  1017209 non-null  int64         
 6   Open                       1017209 non-null  int64         
 7   Promo                      1017209 non-null  int64         
 8   Promo2                     1017209 non-null  int64         
 9   Promo2SinceWeek            1017209 non-null  float64       
 10  Promo2SinceYear            1017209 non-null  float64       
 11  PromoInterval              1017209 no

In [None]:
# Correlation of every numeric column of the pre-processed frame with `Sales`,
# strongest first. numeric_only=True is needed because the frame still holds
# non-numeric columns (e.g. `Assortment` is object, `Date` is datetime64), which
# pandas >= 2.0 no longer skips by default.
# NOTE(review): this cell has no recorded run -- confirm `Sales` is still a
# column of preped_train_df at this point.
preped_train_df.corr(numeric_only=True)["Sales"].sort_values(ascending=False)
