In [2]:
# libraries
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import SGDClassifier

# load dataframe

In [3]:
#Read data
import pandas as pd
merged_df = pd.read_csv('../data/processed/merged_df.csv',low_memory=False)


In [4]:
train_df=merged_df

# Reduce size

In [5]:
import pandas as pd

class MemoryReducer:
    def __init__(self, df):
        self.df = df.copy()

    def reduce_memory_usage(self):
        initial_memory = self.df.memory_usage(deep=True).sum() / (1024 ** 2)  # in megabytes
        print(f"Initial Memory Usage: {initial_memory:.2f} MB")

        for col in self.df.columns:
            col_type = self.df[col].dtype

            if col_type != object:
                if "int" in str(col_type):
                    self.df[col] = pd.to_numeric(self.df[col], downcast="integer")
                elif "float" in str(col_type):
                    self.df[col] = pd.to_numeric(self.df[col], downcast="float")
            else:
                num_unique_values = len(self.df[col].unique())
                num_total_values = len(self.df[col])
                if num_unique_values / num_total_values < 0.5:
                    self.df[col] = self.df[col].astype("category")

        reduced_memory = self.df.memory_usage(deep=True).sum() / (1024 ** 2)  # in megabytes
        print(f"Reduced Memory Usage: {reduced_memory:.2f} MB")
        reduction_percentage = ((initial_memory - reduced_memory) / initial_memory) * 100
        print(f"Memory Reduced by: {reduction_percentage:.2f}%")

        return self.df



reducer = MemoryReducer(train_df)
reduced_df = reducer.reduce_memory_usage()


Initial Memory Usage: 28997.45 MB
Reduced Memory Usage: 1261.80 MB
Memory Reduced by: 95.65%


In [6]:
df_train=reduced_df

# Pipeline

In [19]:
df_train.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'day',
       'count', 'date', 'wm_yr_wk', 'event_name', 'event_type', 'sell_price',
       'revenue', 'year', 'month'],
      dtype='object')

In [17]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt

# Step 1: Define a custom transformer for converting the 'date' column to datetime
class DateConverter(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['date'] = pd.to_datetime(X['date'])
        return X

# Step 1: Group by 'date' and calculate the sum of 'revenue' for each date
class RevenueGrouper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        grouped_df = X.groupby('date')['revenue'].sum().reset_index()
        return grouped_df

# Step 2: Define SARIMA model parameters and fit the model
class SARIMAModelFitter(BaseEstimator, TransformerMixin):
    def __init__(self, p, d, q, P, D, Q, s):
        self.p = p
        self.d = d
        self.q = q
        self.P = P
        self.D = D
        self.Q = Q
        self.s = s

    def fit(self, X, y=None):
        self.model = SARIMAX(X['revenue'], order=(self.p, self.d, self.q), seasonal_order=(self.P, self.D, self.Q, self.s))
        self.results = self.model.fit()
        return self

    def transform(self, X):
        return X

# Create the pipeline
SARIMA_pipeline = Pipeline([
    ('date_converter', DateConverter()),
    ('revenue_grouper', RevenueGrouper()),
    ('sarima_model', SARIMAModelFitter(p=1, d=1, q=4, P=4, D=1, Q=3, s=12))
])




In [9]:
#fit pipeline
SARIMA_pipeline.fit(df_train)

 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           13     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.11616D+01    |proj g|=  1.06382D-01

At iterate    5    f=  1.11343D+01    |proj g|=  1.46847D-02

At iterate   10    f=  1.11237D+01    |proj g|=  1.19555D-01

At iterate   15    f=  1.11169D+01    |proj g|=  9.80479D-02

At iterate   20    f=  1.11124D+01    |proj g|=  1.63403D-01

At iterate   25    f=  1.11112D+01    |proj g|=  8.16720D-02

At iterate   30    f=  1.11064D+01    |proj g|=  2.14217D-02

At iterate   35    f=  1.11057D+01    |proj g|=  6.08468D-03

At iterate   40    f=  1.11054D+01    |proj g|=  4.33062D-02

At iterate   45    f=  1.11052D+01    |proj g|=  3.54300D-03

At iterate   50    f=  1.11051D+01    |proj g|=  4.90044D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cau



In [11]:
# save the model
from joblib import dump

dump(SARIMA_pipeline,  '../models/SARIMA_pipe.joblib')

['../models/SARIMA_pipe.joblib']