In [156]:
# libraries
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import SGDClassifier

# load dataframe

In [157]:
#Read data
import pandas as pd
merged_df = pd.read_csv('../data/processed/merged_df.csv',low_memory=False)


In [510]:
train_df=merged_df

# reduce size

In [None]:
import pandas as pd

class MemoryReducer:
    def __init__(self, df):
        self.df = df.copy()

    def reduce_memory_usage(self):
        initial_memory = self.df.memory_usage(deep=True).sum() / (1024 ** 2)  # in megabytes
        print(f"Initial Memory Usage: {initial_memory:.2f} MB")

        for col in self.df.columns:
            col_type = self.df[col].dtype

            if col_type != object:
                if "int" in str(col_type):
                    self.df[col] = pd.to_numeric(self.df[col], downcast="integer")
                elif "float" in str(col_type):
                    self.df[col] = pd.to_numeric(self.df[col], downcast="float")
            else:
                num_unique_values = len(self.df[col].unique())
                num_total_values = len(self.df[col])
                if num_unique_values / num_total_values < 0.5:
                    self.df[col] = self.df[col].astype("category")

        reduced_memory = self.df.memory_usage(deep=True).sum() / (1024 ** 2)  # in megabytes
        print(f"Reduced Memory Usage: {reduced_memory:.2f} MB")
        reduction_percentage = ((initial_memory - reduced_memory) / initial_memory) * 100
        print(f"Memory Reduced by: {reduction_percentage:.2f}%")

        return self.df



reducer = MemoryReducer(train_df)
reduced_df = reducer.reduce_memory_usage()


In [None]:
df_train=reduced_df

# PipeLine

In [None]:
import category_encoders as ce
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# Create a BinaryEncoder instance
encoder = ce.BinaryEncoder(cols=['event_type', 'event_name'])

# Define a custom transformer to fill missing values
def fill_missing_values(X):
    X['sell_price'].fillna(0, inplace=True)
    X['revenue'].fillna(0, inplace=True)
    return X


# Define a custom transformer to calculate lags features
def calculate_lags(X):
    lags = [1, 2, 3, 5, 7, 14, 30]
    for lag in lags:
        X["lag_" + str(lag)] = X.groupby("id")["revenue"].shift(lag).astype(np.float16)
    
    # Fill NaN values with 0 in the lag columns
    lag_columns = [col for col in X.columns if col.startswith("lag_")]
    X[lag_columns] = X[lag_columns].fillna(0)

    # Remove the 'id' column
    X = X.drop(columns=['id'])

    return X


# Define a custom transformer to remove columns
def remove_columns(X):
    return X.drop(columns=['date','wm_yr_wk'])


# Create a ColumnTransformer to apply transformers to specific columns
preprocessor = ColumnTransformer(
    transformers=[
        ('binary_encoder', encoder, ['event_type', 'event_name']),
        ('fill_missing', FunctionTransformer(fill_missing_values, validate=False), ['sell_price', 'revenue']),
        ('calculate_lags', FunctionTransformer(calculate_lags, validate=False), ['revenue', 'id']),
        ('remove_columns', FunctionTransformer(remove_columns, validate=False), ['date', 'wm_yr_wk'])
    ],
    remainder='passthrough'  # Pass through the columns not mentioned above
)



# Create a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    #('scaler', StandardScaler())  # Optionally, you can add a scaler if needed
])



In [None]:

# Separate the features and the target variable
y = df_train['revenue']
X = df_train


In [None]:
from sklearn.pipeline import Pipeline

# List of column names to be treated as categorical features
categorical_columns = ['id','item_id', 'dept_id', 'cat_id', 'store_id']

# Create a LightGBM regressor with your desired parameters
lgb_regressor = lgb.LGBMRegressor(
    n_estimators=450,
    random_state=42,
    categorical_feature=categorical_columns  # Specify categorical features here
)

# Create the pipeline
lgb_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),  # Assuming you have a preprocessor step
        ('lgb_regressor', lgb_regressor)  # Add LightGBM regressor as a step
    ]
)


In [None]:
random_sample = df_train.sample(n=3000, random_state=42)




In [None]:
import pandas as pd
from lightgbm import LGBMRegressor

# Assuming you have a DataFrame called random_sample and 'Target' is your regression target

target = random_sample['revenue']  # Regression target

# Create and fit the pip
transform=lgb_pipe.fit(random_sample, target)


In [None]:
# Fit and transform the pipeline on your input data
transformed_data = pipeline.transform(random_sample)

# Convert the transformed data into a DataFrame (if it's not already)
# This assumes you're using pandas for data manipulation
import pandas as pd
transformed_df = pd.DataFrame(transformed_data)

# View the head of the transformed DataFrame
transformed_df.sample(20)


In [509]:
# Assuming df is your DataFrame
selected_columns = transformed_df.iloc[:, 10:21]  # Select columns 10 to 20 (inclusive) - Python uses 0-based indexing
selected_columns.head()

Unnamed: 0,10,11,12,13,14,15,16,17,18,19,20
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HOBBIES_1_293,HOBBIES_1,HOBBIES
1,12.74,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HOUSEHOLD_1_351,HOUSEHOLD_1,HOUSEHOLD
2,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FOODS_3_047,FOODS_3,FOODS
3,11.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FOODS_3_689,FOODS_3,FOODS
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HOUSEHOLD_2_206,HOUSEHOLD_2,HOUSEHOLD


Note:

filling missing values ✅

lags produce Nan 🚫

mean roll produce Nan (get rid of it)

remove ✅