# DSAA5020 Group Project:
## Corporación Favorita Grocery Sales Forecasting – Task 3: Feature Selection

## Part1: Preparation

### Import

In [None]:
# Standard library imports
import os
import warnings

# Data processing libraries
import pandas as pd
import numpy as np

# Machine learning libraries
from sklearn.metrics import mean_squared_error
import lightgbm as lightgbm
import xgboost as xgboost  # was misspelled `xgboostoost`, which fails on a fresh kernel
import xgboost as xgb  # short alias used by the feature-selection code

# Visualization libraries
from matplotlib import pyplot as plt
import seaborn as sns

# Utility library for progress bars
from tqdm.notebook import tqdm

# Configurations
warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output


In [None]:
# Reference: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

def reduce_mem_usage(data_frame, use_float16=True):
    """Downcast the columns of a DataFrame to shrink its memory footprint.

    The frame is modified IN PLACE and also returned. Each column is handled
    according to its dtype:

    * NumPy signed / unsigned integers -> the narrowest integer type of the
      same signedness whose range contains the column min and max (bounds
      inclusive, so a column spanning exactly [-128, 127] becomes int8).
    * NumPy floats -> the first of float16 (only when ``use_float16``),
      float32 whose finite range strictly contains the column min and max;
      otherwise float64 (this is also where all-NaN or infinite columns end up).
    * object / string columns -> ``category``.
    * Anything else (bool, datetime, timedelta, category, other extension
      dtypes) is left untouched.

    Args:
        data_frame: pandas DataFrame to optimise; its columns are reassigned.
        use_float16: When True (default, the historical behaviour) floats may
            be stored as float16, which keeps only about 3 significant decimal
            digits. Pass False to stop at float32 when precision matters.

    Returns:
        The same ``data_frame`` object after downcasting.
    """
    start_mem = data_frame.memory_usage().sum() / 1024**2
    print('Memory usage of Dataframe is {:.3f} MB'.format(start_mem))

    # Candidate types, narrowest first.
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in tqdm(data_frame.columns):
        col_type = data_frame[col].dtype
        # Only plain NumPy dtypes are downcast numerically; dispatching on
        # dtype.kind keeps unsigned ints and bools out of the float branch.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind in ('i', 'u'):
            c_min = data_frame[col].min()
            c_max = data_frame[col].max()
            for candidate in (signed_ints if kind == 'i' else unsigned_ints):
                limits = np.iinfo(candidate)
                # An empty column yields NaN min/max, every comparison is
                # False and the dtype is simply kept.
                if c_min >= limits.min and c_max <= limits.max:
                    data_frame[col] = data_frame[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = data_frame[col].min()
            c_max = data_frame[col].max()
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            data_frame[col] = data_frame[col].astype(target)
        elif col_type == object or pd.api.types.is_string_dtype(col_type):
            data_frame[col] = data_frame[col].astype('category')

    end_mem = data_frame.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.3f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return data_frame

In [None]:
# Colab only: mount Google Drive, then make the project folder the working
# directory so the relative CSV paths used by later cells resolve.
from google.colab import drive

drive.mount('/content/drive')
project_dir = '/content/drive/MyDrive/Colab_Notebooks/5020/Corporaci-n-Favorita-Grocery-Sales-Forecasting-master'
os.chdir(project_dir)

### Reading Data

In [None]:
# Training features: read the CSV and downcast its columns in one step
X_train = reduce_mem_usage(pd.read_csv("X_train.csv"))

# Training targets: keep them as a plain NumPy array
y_train = np.array(
    pd.read_csv('y_train.csv')
)

Memory usage of Dataframe is 1617.996 MB


HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))


Memory usage after optimization is: 351.461 MB
Decreased by 78.3%


In [None]:
# Validation features: read the CSV and downcast its columns in one step
X_val = reduce_mem_usage(pd.read_csv("X_val.csv"))

# Validation targets: keep them as a plain NumPy array
y_val = np.array(
    pd.read_csv('y_val.csv')
)

Memory usage of Dataframe is 808.998 MB


HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))


Memory usage after optimization is: 175.091 MB
Decreased by 78.4%


In [None]:
# Test features: read the CSV and downcast its columns in one step
X_test = reduce_mem_usage(pd.read_csv("X_test.csv"))


Memory usage of Dataframe is 808.998 MB


HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))


Memory usage after optimization is: 175.730 MB
Decreased by 78.3%


In [None]:
# (store_nbr, item_nbr) pairs, used here only for the order of their item level
stores_items = pd.read_csv('stores_items.csv', index_col=['store_nbr','item_nbr'])

# Item metadata keyed by item_nbr, re-ordered to follow the item level (level 1)
# of the stores_items index, then downcast to save memory.
items = reduce_mem_usage(
    pd.read_csv( 'items.csv' )
    .set_index("item_nbr")
    .reindex( stores_items.index.get_level_values(1) )
)

Memory usage of Dataframe is 5.112 MB


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


Memory usage after optimization is: 1.919 MB
Decreased by 62.5%


## Part2: Use random forest to select features

### Defining Random Forest

In [None]:
def random_forest_feature_selection(X_train, y_train, params, n_days, items, n_steps=16):
    """Train one XGBoost random-forest model per target column and rank features.

    For each of the first ``n_steps`` columns of ``y_train`` a single boosting
    round of ``num_parallel_tree`` trees is trained on ``X_train``, and the
    features the model split on are returned ordered by gain.

    Args:
        X_train: Training features, anything accepted by ``xgb.DMatrix``.
        y_train: 2-D array-like of targets; column ``i`` is the label vector
            for model ``i``.
        params: Caller-supplied XGBoost parameters (e.g. ``max_depth``,
            ``num_parallel_tree``). The dict is NOT modified. ``objective``,
            ``eval_metric``, ``tree_method`` and ``eta`` are always overridden.
            ``subsample`` / ``colsample_bynode`` default to 0.8 when absent;
            pass 1.0 for both to reproduce the earlier un-sampled behaviour.
        n_days: Number of times ``items["perishable"]`` is repeated to build
            the per-row sample weights (so ``n_days * len(items)`` has to match
            the number of rows in ``X_train``).
        items: DataFrame with a ``perishable`` column; rows get weight
            ``1 + 0.25 * perishable``.
        n_steps: Number of target columns / models to train (default 16).

    Returns:
        A list with one entry per model; each entry is a list of
        ``(feature_name, gain)`` tuples sorted by descending gain.

    Raises:
        ValueError: If ``y_train`` is not 2-D or has fewer than ``n_steps``
            columns. Checked up front so a long run cannot fail part-way.
    """
    targets = np.asarray(y_train)
    if targets.ndim != 2 or targets.shape[1] < n_steps:
        raise ValueError(
            f"y_train must be 2-D with at least {n_steps} columns, got shape {targets.shape}"
        )

    # Row/column subsampling is what makes the parallel trees differ from one
    # another; without it every tree sees identical data. 0.8 mirrors the
    # defaults of XGBoost's own random-forest wrappers. Caller values win.
    rf_params = {'subsample': 0.8, 'colsample_bynode': 0.8}
    rf_params.update(params)  # work on a copy: never mutate the caller's dict
    # Settings required for random-forest-style training (always enforced).
    # NOTE(review): XGBoost 2.0 deprecates 'gpu_hist' in favour of
    # tree_method='hist' with device='cuda' - confirm against the installed
    # version before changing it.
    rf_params.update({
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'gpu_hist',
        'eta': 1  # Learning rate; must be 1 for a single-round forest
    })

    num_boost_rounds = 1  # One round of num_parallel_tree trees == one forest
    feature_importances_all = []  # Store feature importances from all models

    # The weights do not depend on the target column, so build them once.
    weights = pd.concat([items["perishable"]] * n_days) * 0.25 + 1

    for day in range(n_steps):
        print("=" * 50)
        print(f"Training step: {day + 1}")
        print("=" * 50)

        # Preparing the training data
        dtrain = xgb.DMatrix(X_train, label=targets[:, day], weight=weights)
        watchlist = [(dtrain, 'train')]

        # Training the model
        model = xgb.train(rf_params, dtrain, num_boost_rounds, watchlist, verbose_eval=1)

        # get_score only reports features that were used in at least one split
        feature_importances = model.get_score(importance_type='gain')
        sorted_feature_importances = sorted(
            feature_importances.items(), key=lambda x: x[1], reverse=True
        )
        feature_importances_all.append(sorted_feature_importances)

        # Freeing up memory before the next model is built
        del model, dtrain, feature_importances

    return feature_importances_all


### Feature Selection Using Random Forest

In [None]:
%%time

# Initialize parameters for the Random Forest model
params = {
    'max_depth': 15,  # Maximum depth of a tree
    'num_parallel_tree': 100  # Number of trees to grow per round
}

# Define the number of days for the prediction
n_days = 2

# Call the RandomForest_FeatureSelection function
# to get sorted feature importances for all models
feature_imp = RandomForest_FeatureSelection(X_train, y_train, params, n_days, items)


Step 1
[0]	train-rmse:0.502756
Step 2
[0]	train-rmse:0.517825
Step 3
[0]	train-rmse:0.51848
Step 4
[0]	train-rmse:0.535173
Step 5
[0]	train-rmse:0.542377
Step 6
[0]	train-rmse:0.548207
Step 7
[0]	train-rmse:0.541883
Step 8
[0]	train-rmse:0.522936
Step 9
[0]	train-rmse:0.537185
Step 10
[0]	train-rmse:0.530408
Step 11
[0]	train-rmse:0.551563
Step 12
[0]	train-rmse:0.555245
Step 13
[0]	train-rmse:0.554349
Step 14
[0]	train-rmse:0.543721
Step 15
[0]	train-rmse:0.530702
Step 16
[0]	train-rmse:0.540752
CPU times: user 33min 35s, sys: 15min 46s, total: 49min 22s
Wall time: 49min 24s


In [None]:
# Maximum number of features kept per model
top = 300

# Keep only the names of the `top` highest-ranked entries of every model.
# Iterating over feature_imp itself (instead of a hard-coded range(16)) keeps
# this cell in step with however many models were actually trained.
filtered_features = [
    [entry[0] for entry in ranking[:top]] for ranking in feature_imp
]

# Reports the size of the first model's list; a list can be shorter than `top`
# when fewer features were ranked for that model.
print(f"Filtered top {len(filtered_features[0])} features")


Filtered top 300 features


In [None]:
import pickle

# Persist the per-model lists of selected feature names for later steps
with open('300_filtered_features.pkl', 'wb') as pkl_out:
    pickle.dump(filtered_features, pkl_out)