In [1]:
# libraries
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import SGDClassifier

In [2]:
#Read data
import pandas as pd
merged_df = pd.read_csv('../data/processed/merged_df.csv',low_memory=False)


In [3]:
train_df=merged_df

In [4]:
import pandas as pd

class MemoryReducer:
    def __init__(self, df):
        self.df = df.copy()

    def reduce_memory_usage(self):
        initial_memory = self.df.memory_usage(deep=True).sum() / (1024 ** 2)  # in megabytes
        print(f"Initial Memory Usage: {initial_memory:.2f} MB")

        for col in self.df.columns:
            col_type = self.df[col].dtype

            if col_type != object:
                if "int" in str(col_type):
                    self.df[col] = pd.to_numeric(self.df[col], downcast="integer")
                elif "float" in str(col_type):
                    self.df[col] = pd.to_numeric(self.df[col], downcast="float")
            else:
                num_unique_values = len(self.df[col].unique())
                num_total_values = len(self.df[col])
                if num_unique_values / num_total_values < 0.5:
                    self.df[col] = self.df[col].astype("category")

        reduced_memory = self.df.memory_usage(deep=True).sum() / (1024 ** 2)  # in megabytes
        print(f"Reduced Memory Usage: {reduced_memory:.2f} MB")
        reduction_percentage = ((initial_memory - reduced_memory) / initial_memory) * 100
        print(f"Memory Reduced by: {reduction_percentage:.2f}%")

        return self.df



reducer = MemoryReducer(train_df)
reduced_df = reducer.reduce_memory_usage()


Initial Memory Usage: 28997.45 MB
Reduced Memory Usage: 1261.80 MB
Memory Reduced by: 95.65%


In [37]:
df_train=reduced_df

In [48]:
import numpy as np
import pandas as pd  # Added missing import
import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# Create a BinaryEncoder instance
encoder = ce.BinaryEncoder(cols=['event_type', 'event_name'])

# Define a custom transformer to fill missing values
def fill_missing_values(X):
    X['sell_price'].fillna(0, inplace=True)
    X['revenue'].fillna(0, inplace=True)
    return X

# Define a custom transformer to calculate lags features
def calculate_lags(X):
    lags = [1, 2, 3, 5, 7, 14, 30]
    for lag in lags:
        X["lag_" + str(lag)] = X.groupby("id")["revenue"].shift(lag).astype(np.float16)
    return X

# Define a custom transformer to calculate rolling mean features
def calculate_rolling_means(X):
    X['rolling_mean_10'] = X.groupby("id")['revenue'].transform(lambda x: x.rolling(10).mean())
    X['rolling_mean_20'] = X.groupby("id")['revenue'].transform(lambda x: x.rolling(20).mean())
    X['rolling_mean_30'] = X.groupby("id")['revenue'].transform(lambda x: x.rolling(30).mean())
    return X

# Define a custom transformer to remove columns
def remove_columns(X):
    return X.drop(columns=['id', 'wm_yr_wk', 'date'])

preprocessor = ColumnTransformer(
    transformers=[
        ('binary_encoder', encoder, ['event_type', 'event_name']),
        ('fill_missing', FunctionTransformer(fill_missing_values, validate=False), ['sell_price', 'revenue']),
        ('calculate_lags', FunctionTransformer(calculate_lags, validate=False), ['revenue', 'id']),
        ('calculate_rolling_means', FunctionTransformer(calculate_rolling_means, validate=False), ['revenue', 'id', 'date']),
        ('remove_columns', FunctionTransformer(remove_columns, validate=False), ['id', 'wm_yr_wk', 'date']),  # Added missing closing parenthesis
    ],
    remainder='passthrough'  # Pass through the columns not mentioned above
)

# Create a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # ('scaler', StandardScaler())  # Optionally, you can add a scaler if needed
])

# Custom transformer to preserve column names
class PreserveColumnNames:
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X.columns = self.cols  # Restore the original column names for DataFrames
        return X


In [49]:

# Separate the features and the target variable
y = df_train['revenue']
X = df_train


In [50]:
from sklearn.pipeline import Pipeline
import lightgbm as lgb



# List of column names to be treated as categorical features
categorical_columns = ['id','item_id', 'dept_id', 'cat_id', 'store_id']

# Create a LightGBM regressor with your desired parameters
lgb_regressor = lgb.LGBMRegressor(
    n_estimators=450,
    random_state=42,
    categorical_feature=categorical_columns  # Specify categorical features here
)

# Create the pipeline
lgb_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),  # Assuming you have a preprocessor step
        #('lgb_regressor', lgb_regressor)  # Add LightGBM regressor as a step
    ]
)

In [51]:
random_sample = df_train.sample(n=3000, random_state=42)

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    random_sample.drop(['revenue'], axis=1), # predictive variables
    random_sample['revenue'], # target
    test_size=0.1, # portion of dataset to allocate to test set
    random_state=0, # we are setting the seed here
)

In [53]:
import pandas as pd
from lightgbm import LGBMRegressor

# Assuming you have a DataFrame called random_sample and 'Target' is your regression target

target = random_sample['revenue']  # Regression target

# Create and fit the pip
transform=lgb_pipe.fit(random_sample, target)


In [54]:
# Fit and transform the pipeline on your input data
transformed_data = lgb_pipe.transform(random_sample)

# Convert the transformed data into a DataFrame (if it's not already)
# This assumes you're using pandas for data manipulation
import pandas as pd
transformed_df = pd.DataFrame(transformed_data) 

# View the head of the transformed DataFrame
transformed_df.head()

# Convert the transformed data into a DataFrame


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,0,0,1,0,0,0,0,1,3.68,0.0,...,,HOBBIES_1_293,HOBBIES_1,HOBBIES,WI_1,WI,12,0,2013,12
1,0,0,1,0,0,0,0,1,0.98,12.74,...,,HOUSEHOLD_1_351,HOUSEHOLD_1,HOUSEHOLD,WI_3,WI,3,13,2014,8
2,0,0,1,0,0,0,0,1,0.0,0.0,...,,FOODS_3_047,FOODS_3,FOODS,WI_1,WI,3,0,2014,10
3,0,0,1,0,0,0,0,1,2.88,11.52,...,,FOODS_3_689,FOODS_3,FOODS,WI_3,WI,8,4,2014,6
4,0,0,1,0,0,0,0,1,2.88,0.0,...,,HOUSEHOLD_2_206,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,19,0,2011,3
