# Read the dataset

In [1]:
# Load the pre-processed sales table from disk.
import pandas as pd

merged_df = pd.read_csv(
    '../data/processed/reduced_cs.csv',
    low_memory=False,  # infer each column's dtype from the whole file, not chunk by chunk
)


In [2]:
# Bind a second name to the loaded frame. This is an alias, not a copy:
# train_df and merged_df refer to the same DataFrame object.
train_df = merged_df

# Reduce memory usage

In [3]:
import pandas as pd

class MemoryReducer:
    """Shrink a DataFrame's memory footprint.

    Integer and float columns are downcast with ``pd.to_numeric``; object
    columns whose share of distinct values is below ``category_threshold``
    are converted to the ``category`` dtype. Columns of any other dtype are
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is copied on construction, so the caller's
        object is never modified.
    category_threshold : float, default 0.5
        An object column becomes categorical when
        ``unique values / total values`` is strictly below this ratio.
    """

    def __init__(self, df, category_threshold=0.5):
        self.df = df.copy()
        self.category_threshold = category_threshold

    def _memory_mb(self):
        """Return the deep memory usage of ``self.df`` in megabytes."""
        return self.df.memory_usage(deep=True).sum() / (1024 ** 2)

    def reduce_memory_usage(self):
        """Downcast / categorise the columns and report the saving.

        Prints the memory usage before and after, plus the percentage saved.

        Returns
        -------
        pandas.DataFrame
            The optimised frame (the same object that is stored in ``self.df``).
        """
        initial_memory = self._memory_mb()
        print(f"Initial Memory Usage: {initial_memory:.2f} MB")

        for col in self.df.columns:
            column = self.df[col]
            col_type = column.dtype

            # Use pandas' dtype predicates instead of substring matching on the
            # dtype name: "int" also occurs in e.g. "interval[int64, right]",
            # which pd.to_numeric cannot process.
            if pd.api.types.is_integer_dtype(col_type):
                self.df[col] = pd.to_numeric(column, downcast="integer")
            elif pd.api.types.is_float_dtype(col_type):
                self.df[col] = pd.to_numeric(column, downcast="float")
            elif col_type == object and len(column) > 0:
                # The length guard avoids a 0/0 division on a frame with no rows.
                unique_ratio = len(column.unique()) / len(column)
                if unique_ratio < self.category_threshold:
                    self.df[col] = column.astype("category")

        reduced_memory = self._memory_mb()
        print(f"Reduced Memory Usage: {reduced_memory:.2f} MB")

        # Guard the percentage as well in case the frame reports zero bytes.
        if initial_memory > 0:
            reduction_percentage = ((initial_memory - reduced_memory) / initial_memory) * 100
        else:
            reduction_percentage = 0.0
        print(f"Memory Reduced by: {reduction_percentage:.2f}%")

        return self.df



# Run the memory optimisation on the training frame; the reducer works on its
# own copy and prints a before/after report.
reducer = MemoryReducer(df=train_df)
reduced_df = reducer.reduce_memory_usage()


Initial Memory Usage: 28997.45 MB
Reduced Memory Usage: 1261.80 MB
Memory Reduced by: 95.65%


In [132]:
# From here on the memory-reduced frame is used under the name df_train
# (an alias of reduced_df, not a copy).
df_train = reduced_df

In [198]:
import category_encoders as ce
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer 
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

# Column groups and the name of the target column
cat_cols = [
    'item_id',
    'dept_id',
    'cat_id',
    'store_id',
]
num_cols = [
    'day',
    'count',
    'date',
    'wm_yr_wk',
    'event_name',
    'event_type',
    'sell_price',
    'revenue',
    'year',
    'month',
]
target = ['revenue']

# Binary encoder for the categorical features, created with the library's
# default settings. The ColumnTransformer defined further down applies it to
# `categorical_columns`.
encoder = ce.BinaryEncoder()

# Define a custom transformer to calculate lag features
def calculate_lags(X, lags=(1, 5, 7, 14)):
    """Build lagged-revenue features for every series identified by ``id``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain an ``id`` column (grouping key) and a ``revenue`` column.
        The lags are row shifts within each ``id`` group, so rows are assumed
        to already be in chronological order per ``id`` -- TODO confirm
        against the data preparation step.
    lags : iterable of int, default (1, 5, 7, 14)
        Number of rows to shift by; one ``lag_<n>`` column is produced per entry.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of ``X`` except ``id`` and ``revenue``,
        plus the ``lag_<n>`` columns (cast to float16, missing values replaced
        by 0). The frame passed in is left unmodified.
    """
    # observed=True only matters when `id` is categorical: it restricts the
    # grouping to ids that actually occur. The shifted values are the same.
    revenue_by_id = X.groupby("id", observed=True)["revenue"]

    # drop() returns a new frame, so the lag columns are added to the result
    # rather than to the caller's DataFrame.
    features = X.drop(columns=['id', 'revenue'])

    for lag in lags:
        # NOTE(review): float16 cannot represent values above 65504 and keeps
        # roughly three significant digits -- confirm revenue fits that range.
        features["lag_" + str(lag)] = revenue_by_id.shift(lag).astype(np.float16)

    # Fill NaN values with 0 in the lag columns (the first rows of each id
    # have no history to shift from).
    lag_columns = [col for col in features.columns if col.startswith("lag_")]
    features[lag_columns] = features[lag_columns].fillna(0)

    return features

# Transformer that discards a fixed set of columns
class ColumnRemover(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that drops the named columns from a DataFrame."""

    def __init__(self, columns_to_remove):
        self.columns_to_remove = columns_to_remove

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the columns listed in ``columns_to_remove``."""
        trimmed = X.drop(columns=self.columns_to_remove)
        return trimmed
    
# Columns that should be removed
columns_to_remove = ['date', 'wm_yr_wk', 'id']

# Columns that get encoded, the column the lags are taken from,
# and the lag distances to use
categorical_columns = [
    'item_id',
    'event_type',
    'event_name',
    'dept_id',
    'cat_id',
    'store_id',
    'state_id',
]
lagged_column = 'revenue'
lags = [1, 5, 7, 14]


# Helper that swaps every NaN for 0
def replace_nan_with_zero(X):
    """Return the values of ``X`` with each NaN entry replaced by 0.

    The result comes from ``np.where``, i.e. it is a NumPy array even when a
    DataFrame is passed in.
    """
    is_missing = np.isnan(X)
    return np.where(is_missing, 0, X)

# Create a column transformer for encoding, NaN filling and lag calculation.
# Three feature groups are produced:
#   * binary-encoded categorical columns,
#   * sell_price with NaN replaced by 0,
#   * lagged revenue per id (past values only).
preprocessor = ColumnTransformer(
    transformers=[
        ('binary_encoder', encoder, categorical_columns),
        # Only sell_price is passed through here. 'revenue' is the value the
        # pipeline is fitted to predict, so handing the current revenue to the
        # model as a feature would leak the target; revenue enters the feature
        # matrix solely through the lag columns below.
        ('replace_nan_with_zero', FunctionTransformer(replace_nan_with_zero), ['sell_price']),
        ('calculate_lags', FunctionTransformer(calculate_lags, validate=False), ['revenue', 'id']),
    ],
    remainder='drop'  # Columns not listed in any transformer above are discarded
)

In [199]:
from xgboost import XGBRegressor
# Assemble the end-to-end model: feature preparation, scaling, then the regressor
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('xgboost', XGBRegressor(gamma=0.05, n_estimators=650)),
])



In [202]:
# Train the whole pipeline; the label is the revenue column of the training frame
target = df_train['revenue']
xgb_pipeline.fit(X=df_train, y=target)

In [203]:
# Persist the fitted pipeline to disk
from joblib import dump

dump(xgb_pipeline, filename='../models/XGBOOST_pipe.joblib')

['../models/XGBOOST_pipe.joblib']