# Modeling with Regression models

In [1]:
import gc
import sys
sys.path.append('../')

import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestRegressor

from pandas.plotting import autocorrelation_plot
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("darkgrid")
%matplotlib inline


from src.fetch_data import DataLoader
from src.exploration import Analysis
from src.cleaning import CleanDataFrame
from src.visualization import Plotters
from src.processing import FeatureEngineering
from src.modeling_pipeline import (
    run_train_pipeline, 
    get_pipeline,
    dvc_get_data)

import os
import mlflow


# Shared helper instances from the project's `src` package; later cells reuse these names.
cleaner = CleanDataFrame()
analyzer = Analysis()
feature_engineering = FeatureEngineering()
# w and h are passed straight to Plotters; presumably the default figure width/height — confirm in src.visualization.
plotters = Plotters(w=6, h=4)

# Getting the data

In [2]:
# Fetch the merged training CSV through the project's DVC loader, pinned to a
# specific data version so the notebook always reads the same snapshot.
repo = '../'
version = 'merged_v3'
data_path = 'data/merged/train.csv'

train_df = DataLoader.dvc_get_data(data_path, version, repo)
# Parse the Date column into datetimes so later cells can sort and compare dates.
train_df['Date'] = pd.to_datetime(train_df.Date)

# Loading the matching test split is currently disabled; kept here for reference.
# data_path = 'data/merged/test.csv'
# version = 'merged_v3'
# repo = '../'

# test_df = DataLoader.dvc_get_data(data_path, version, repo)
# test_df['Date'] = pd.to_datetime(test_df['Date'])

  df = pd.read_csv(io.StringIO(content), sep=",")
DataLoaderLogger - INFO - DVC: CSV file read with path: data/merged/train.csv | version: merged_v3 | from: ../


Next, add the extra columns produced by the feature-engineering step.

In [3]:
# Replace train_df with the output of the project's FeatureEngineering.transform.
# NOTE(review): train_df is rebound to its own transformed version, so re-running this
# cell feeds already-transformed data back in — confirm transform is safe to apply twice.
train_df = feature_engineering.transform(train_df)


FeatureEngineeringLogger - INFO - 9 new columns added to the dataframe
FeatureEngineeringLogger - INFO - Feature enginerring completed


To split the data properly we first need to know the time span it covers, so collect all the unique dates in the `Date` column and sort them.

In [4]:
# Collect the distinct calendar dates in ascending order.
# np.sort returns a sorted ndarray copy whether Series.unique() hands back a NumPy
# array (older pandas) or a DatetimeArray (pandas >= 2.0), which has no in-place
# .sort() method and made the original `timeframe.sort()` raise AttributeError.
timeframe = np.sort(train_df['Date'].unique())
print(f"Starts at {timeframe[0]} --- ends at: {timeframe[-1]}")
print(f"It spans for {len(timeframe)} days")

Starts at 2013-01-01T00:00:00.000000000 --- ends at: 2015-07-31T00:00:00.000000000
It spans for 942 days


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from src.modeling_pipeline import label_encoder

# Columns the project's cleaner reports as categorical for this frame.
cat_cols = cleaner.get_categorical_columns(train_df)

# Wrap the project's label_encoder so the pipeline calls it with cat_columns=cat_cols.
encode_step = FunctionTransformer(label_encoder, kw_args={"cat_columns": cat_cols})

# Single-step pipeline; the step name string is kept exactly as before.
pipe = Pipeline(steps=[("labele_categoricals", encode_step)])

# NOTE(review): whether label_encoder modifies train_df in place or returns a copy
# is not visible here — confirm in src.modeling_pipeline.
labled = pipe.fit_transform(train_df)

In [7]:
# Print the column list, non-null counts and dtypes of the frame returned by the pipeline.
labled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 29 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   Store                      1017209 non-null  int64         
 1   DayOfWeek                  1017209 non-null  int64         
 2   Date                       1017209 non-null  datetime64[ns]
 3   Sales                      1017209 non-null  int64         
 4   Customers                  1017209 non-null  int64         
 5   Open                       1017209 non-null  int64         
 6   Promo                      1017209 non-null  int64         
 7   StateHoliday               1017209 non-null  int64         
 8   SchoolHoliday              1017209 non-null  int64         
 9   StoreType                  1017209 non-null  int64         
 10  Assortment                 1017209 non-null  int64         
 11  CompetitionDistance        1017209 no

# Modeling

In [None]:
# Target: the Sales column as a plain NumPy array.
y = train_df['Sales'].to_numpy()

# Features: every other column of train_df (only Sales is removed).
# NOTE(review): this uses train_df rather than the `labled` frame built earlier;
# categorical handling is presumably left to run_train_pipeline — confirm.
# NOTE(review): Customers stays in the features; presumably it is not known at
# prediction time, which would leak information — verify this is intended.
x = train_df.drop(columns='Sales')

In [None]:
# Close any MLflow run left active by an earlier (possibly failed) cell.
mlflow.end_run()

# Small baseline forest; the fixed random_state makes the run repeatable
# under Restart & Run All.
model = RandomForestRegressor(n_estimators=5, random_state=42)

try:
    run_train_pipeline(model, x, y,
                       experiment_name='RF-REG',
                       run_name='all_cols2')
finally:
    # Always close the run, even when training raises, so no run stays active.
    mlflow.end_run()

In [None]:
# Close any MLflow run left active by an earlier (possibly failed) cell.
mlflow.end_run()

# Larger, depth-limited forest; the fixed random_state makes the run repeatable
# under Restart & Run All.
model = RandomForestRegressor(n_estimators=15, max_depth=10, random_state=42)

try:
    run_train_pipeline(model, x, y,
                       experiment_name='RF-REG',
                       run_name='all_cols_3')
finally:
    # Always close the run, even when training raises, so no run stays active.
    mlflow.end_run()