In [1]:
import numpy as np
import pandas as pd

In [2]:
import mlflow
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder, RobustScaler
from sklearn.feature_selection import RFECV
from xgboost import XGBRegressor, XGBRFRegressor
from feature_engine.timeseries.forecasting import LagFeatures, WindowFeatures, ExpandingWindowFeatures
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SmartCorrelatedSelection
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.base import BaseEstimator

In [3]:
import logging
from typing import Annotated, Tuple, Dict, List, Union

In [4]:
from lightgbm import LGBMRegressor

<center><b>Process Data</b></center>

In [5]:
def ingest_data(data_source: Annotated[str, 'data_source']) -> Annotated[pd.DataFrame, 'data']:
    """
    Ingests data from a given path.

    Args:
        data_source: The path to the data.

    Returns:
        The data as a string.
    """
    try:
        logging.info(f"Reading data from {data_source}")
        data = pd.read_parquet(data_source)
        logging.info(f"Data read from {data_source}")
        return data
    except Exception as e:
        logging.error(f"Error reading data from {data_source}: {e}")
        raise e

In [6]:
def clean_data(data: Annotated[pd.DataFrame, 'data']) -> Annotated[pd.DataFrame, 'cleaned_data']:
    """
    Clean the data by removing duplicates, null values, and converting columns to appropriate types.

    Args:
        data (pd.DataFrame): The input data.

    Returns:
        pd.DataFrame: The cleaned data. None if an error occurs.

    """
    try:
        logging.info("Cleaning data...")
        data.drop_duplicates(keep='last', inplace=True)
        data.dropna(inplace=True)
        data.drop(columns=['client_id', 'CID', 'Base Size'], inplace=True)
        
        # format the date time
        data['date'] = pd.to_datetime(data.date).values

        # Sort
        data.sort_values(by='date', inplace=True)

        # renaming cols
        data.columns = [col.lower().strip().replace(' ', '_')
                        for col in data.columns]
        data.rename(
            {'area_(km)^2': 'area_km2', 'population_(approx.)': 'population',
             'literacy_rate_(%)': 'literacy_rate_perc'},
            axis=1, inplace=True)

        # optimizing for memory
        for col in data.select_dtypes('float64').columns:
            data[col] = data[col].astype('float32')

        for col in data.select_dtypes('int64').columns:
            data[col] = data[col].astype('int32')

        # lType conversion
        data['literacy_rate_perc'] = data.literacy_rate_perc.astype('float32')
        data['kpi'] = data.kpi.astype('float16')
        data['tmtm'] = data.tmtm.astype('float32')

        # rename date -> timestamp
        data.rename({'date': 'timestamp'}, axis=1, inplace=True)

        logging.info("Data cleaned.")
        return data
    except Exception as e:
        logging.error(f"Error cleaning data: {e}")
        raise e

In [7]:
def encode_and_aggregate_data(
    data: Annotated[pd.DataFrame, 'cleaned_data']
) -> Tuple[Annotated[pd.DataFrame, 'target'], Annotated[pd.DataFrame, 'static features'], Annotated[pd.DataFrame, 'aggregated application_group table'], Annotated[pd.DataFrame, 'aggregated uses tabe'], Annotated[pd.DataFrame, 'aggregated mkt table']]:
    """
    Encode categorical features of the data.

    Args:
        data: Dataframe containing the categorical features.

    Returns:
        Dataframe containing the encoded categorical features OR None.

    """
    try:
        logging.info('Encoding categorical features...')
        
        # HASH FEATURES category
        data['category'] = data.category.apply(
            lambda cat: {'Domestic': 1, 'Power': 0}[cat])
        data['grade'] = data.grade.apply(
            lambda cat: {'Grade1': 1, 'Grade2': 2,
                         'Grade3': 3, 'Grade4': 4}[cat]
        )
        data['ecoind'] = data.ecoind.apply(
            lambda cat: {'Medium': 2, 'High': 4, 'Low': 2, 'Poor': 1}[cat]
        )

        # OneHot Encoding
        data = pd.get_dummies(data, columns=['division', 'region'])
        
        # renaming cols
        data.columns = [col.lower().strip().replace(' ', '_')
                        for col in data.columns]
        
        for column in data.select_dtypes('bool').columns:
            data[column] = data[column].astype(int)
    

        # optimize for memory
        for col in data.select_dtypes('int64').columns:
            data[col] = data[col].astype('int32')

        # Aggregate targets by outlet_id
        targets = data.pivot_table(index=['timestamp','outlet_id'], aggfunc={
                'net_price': 'mean',
                'qtym': 'mean',
            }
        ).reset_index()
        
        # Aggreate static feature by outlet_id
        static_features = data[['timestamp','outlet_id','wire', 'rm',
           'fy', 'grade', 'noc',
           'dfc', 'area_km2', 'population', 'literacy_rate_perc', 'pcx', 'excnts',
           'exach', 'trc', 'tlcolt', 'tmtm', 'ecoind', 'sf', 'sop', 'pminx',
           'tms_cr', 'mas', 'kpi', ]].groupby(by=['timestamp','outlet_id'],).mean().reset_index()
        
        # aggreated appliatin group by outlet_id
        application_group = pd.DataFrame(columns=['General', 'Moderate', 'Rich', 'Industry'])
        for outlet in data.outlet_id.value_counts().index:
            ratio = data.loc[data.outlet_id==outlet, 'application_group'].value_counts(normalize=True).to_dict()
            application_group.loc[outlet] = ratio
        application_group = application_group.fillna(0).reset_index().rename(columns={'index':'outlet_id'}).astype(np.float32)
        
        # Aggregated uses by outlet_id
        uses = pd.DataFrame(columns=[
            'House Wiring', 'Fan & Lighting Connection',
            'Air Condition & Washing Machine, Heavy Item', 'Lift & Heavy Item',
            'Earthing', 'Industry, Machineries'
            ]
        )
        for outlet in data.outlet_id.value_counts().index:
            ratio = data.loc[data.outlet_id==outlet, 'uses'].value_counts(normalize=True).to_dict()
            uses.loc[outlet] = ratio
        uses = uses.fillna(0).reset_index().rename(columns={'index':'outlet_id'}).astype(np.float32)
        
        # Aggregated mkt ratio by outlet_id
        mkt = pd.DataFrame(columns=
            ['Urban', 'Rural', 'Semi Urban', 'Others']
        )
        for outlet in data.outlet_id.value_counts().index:
            ratio = data.loc[data.outlet_id==outlet, 'mkt'].value_counts(normalize=True).to_dict()
            mkt.loc[outlet] = ratio
        mkt = mkt.fillna(0).reset_index().rename(columns={'index':'outlet_id'}).astype(np.float32)
        logging.info('Encoding categorical features completed.')
        return targets, static_features, application_group, uses, mkt
    except Exception as e:
        logging.error(f'Error encoding categorical features: {e}')
        return None

In [8]:
def AddTemporalFeatures(targets: Annotated[pd.DataFrame, 'encoded data']) -> Annotated[pd.DataFrame, 'temporal features']:
    features_to_extract = [
        "month", "quarter", "semester", "week", "day_of_week", "day_of_month",
        "day_of_year", "weekend", "month_start", "month_end", "quarter_start",
        "quarter_end", "year_start", "year_end"
    ]

    try:
        logging.info(f'==> Processing AddTemporalFeatures()')
        temporal = DatetimeFeatures(
            features_to_extract=features_to_extract).fit_transform(targets[['timestamp']])
        # for col in temporal.columns:
        #     data.loc[:, col] = temporal[col].values
        logging.info(f'==> Successfully processed AddTemporalFeatures()')
        return temporal
    except Exception as e:
        logging.error(f'==> Error in AddTemporalFeatures()')
        raise e

In [9]:
def AddLagFeatures(targets: Annotated[pd.DataFrame, 'after added temporal features']) -> Annotated[pd.DataFrame, 'Lag features']:
    """
    Add lag features to the data.
    """
    logging.info(f"Adding lag features to the data.")
    try:
        # Add Lag  Feature
        lagfeatures = LagFeatures(variables=None, periods=[3, 8, 16, 24], freq=None, sort_index=True,
                                  missing_values='raise', drop_original=False)
        lagfeatures.fit(targets[['timestamp', 'net_price', 'qtym']])
        features = lagfeatures.transform(
            targets[['timestamp', 'net_price', 'qtym']])
        # for col in list(features.columns)[3:]:
        #     data[col] = features[col].values
        logging.info(f'==> Successfully processed add_lag_features()')
        return features.drop(['timestamp', 'net_price', 'qtym'], axis=1)
    except Exception as e:
        logging.error(f'in The add_lag_features(): {e}')
        raise e

In [10]:
def AddWindowFeatures(targets: Annotated[pd.DataFrame, 'After lag features added']) -> Annotated[pd.DataFrame, 'window features']:
    """Add window features to the dataframe

    Args:
        data (Union[dd.DataFrame, pd.DataFrame]): The dataframe to add window features to.

    Returns:
        Union[dd.DataFrame, pd.DataFrame]: The dataframe with window features added.
    """
    logging.info("Adding window features to the dataframe")

    try:
        windowfeatures = WindowFeatures(variables=None, window=24, freq=None, sort_index=True,
                                        missing_values='raise', drop_original=False)
        windowfeatures.fit(
            targets[['timestamp', 'net_price', 'qtym']])
        features = windowfeatures.transform(
            targets[['timestamp', 'net_price', 'qtym']])
        # for col in list(features.columns)[3:]:
        #     data[col] = features[col].values
        logging.info(f'==> Successfully processed add_window_features()')
        return features.drop(['timestamp', 'net_price', 'qtym'], axis=1)
    except Exception as e:
        logging.error(f'in add_window_features(): {e}')
        raise e

In [11]:
def AddExpWindowFeatures(targets: Annotated[pd.DataFrame, 'after added temporal features']) -> Annotated[pd.DataFrame, 'added Expanding Window features']:
    """Add Expanding Window Features to the data.
    Args:
        data (pd.DataFrame): The input data.
    Returns:
        pd.DataFrame: The data with added expanding window features.
    """
    try:

        expwindow = ExpandingWindowFeatures(
            variables=None, min_periods=7, functions='std', 
            periods=7, freq=None, sort_index=True, 
            missing_values='raise', drop_original=False
        )
        features = expwindow.fit_transform(targets[['timestamp', 'net_price', 'qtym']])
        
        # # 
        # for col in list(features.columns)[3:]:
        #     data[col] = features[col].values
        return features.drop(['timestamp', 'net_price', 'qtym'], axis=1)
    except Exception as e:
        logging.error(f'in The add_expw_features(): {e}')
        raise e

In [12]:
def merge_all_features(
    targets: Annotated[pd.DataFrame, 'targets'], 
    static_features: Annotated[pd.DataFrame, 'static_features'],
    application_group: Annotated[pd.DataFrame, 'application_group'],
    uses: Annotated[pd.DataFrame, 'uses'],
    mkt: Annotated[pd.DataFrame, 'mkt'],
    
) -> Tuple[Annotated[pd.DataFrame, 'features'], Annotated[pd.Series, 'target'], Annotated[BaseEstimator, 'imputer']]:
    """Merges All Features into One.
    Args:
        data (pd.DataFrame): The input data.
    Returns:
        pd.DataFrame: The data with added expanding window features.
    """
    try:
        logging.info(f'==> merging features...')
        
        # Generate outlet wise timeseries_features
        timeseries_features_outlet_wise = pd.DataFrame()
        for outlet_id in targets.outlet_id.value_counts().index:
            outlet_wise = targets.loc[targets.outlet_id==outlet_id]
            temporal = AddTemporalFeatures(outlet_wise)
            lag_features = AddLagFeatures(outlet_wise)
            window_features = AddWindowFeatures(outlet_wise)
            exp_window_features = AddExpWindowFeatures(outlet_wise)
            outlet_wise_features = pd.concat([outlet_wise[['timestamp','outlet_id',]], temporal, lag_features, window_features, exp_window_features], axis=1)
            timeseries_features_outlet_wise = pd.concat([timeseries_features_outlet_wise, outlet_wise_features], ignore_index=True)
        
        # Merge outlet wise timeseries_features
        targets.merge(timeseries_features_outlet_wise, on=['timestamp','outlet_id',], how='inner')

        
        # Merge application group, uses, mkt
        data = targets.merge(
            application_group, on='outlet_id', how='inner'
        ).merge(
            uses, on='outlet_id', how='inner'
        ).merge(
            mkt, on='outlet_id', how='inner'
        ).merge(
            static_features, on=['timestamp', 'outlet_id'], how='inner'
        ).merge(
            timeseries_features_outlet_wise, on=['timestamp', 'outlet_id'], how='inner')
        
        # Impute Missing Values
        target = data['qtym']
        features = data.drop(columns=['timestamp', 'net_price', 'qtym'])
        imputer = KNNImputer(n_neighbors=5).fit(features)
        features = pd.DataFrame(imputer.transform(features), columns=features.columns)
        features['timestamp'] = data.timestamp
        del data
        return features, target, imputer
    except Exception as e:
        logging.error(f'==> Error when merging features: {e}')
        raise e

In [13]:
def scale_data(features: Annotated[pd.DataFrame, 'features to scale'],) -> Annotated[pd.DataFrame, 'standardized features']:
    """Scaling step.
    Args:
        data: Input data.
    Returns:
        Normalized data.
    """
    try:
        logging.info(f'==> Processing scale_data()')
        # Assuming the data is a pandas DataFrame
        scaler = RobustScaler(
            with_centering=True,
            with_scaling=True,
            quantile_range=(25.0, 75.0),
            copy=True,
            unit_variance=False,
        )
        scaler.fit(features)
        features = pd.DataFrame(scaler.transform(features), columns=features.columns)
        scaler.fit(features)
        
#         # save Scaler model
#         joblib.dump(scaler, os.path.join(config.ARTIFACTS_DIR, 'scaler.pkl'))
#         logger.info(
#             f'Scaler model saved to {os.path.join(config.ARTIFACTS_DIR, "scaler.pkl")}')
#         features.to_parquet(config.FEATURE_STORE, index=False)
        return features
    except Exception as e:
        logging.error(f"in scale_data(): {e}")
        raise e

In [14]:
def split_data(
    features: Annotated[pd.DataFrame, 'features'],
    target: Annotated[pd.Series, 'target'],
    test_size: float = 0.25,
    random_state: int = 33
) -> Tuple[Annotated[pd.DataFrame, 'X_train'], Annotated[pd.DataFrame, 'X_test'], Annotated[pd.Series, 'y_train'], Annotated[pd.Series, 'y_test']]:
    """
    Split the data into train and test sets.

    Args:
        features (pd.DataFrame): The input data.
        target (pd.Series) : Target colun
        test_size (float): The proportion of the data to include in the test set. Default is 0.2.
        random_state (int): The seed for the random number generator. Default is 42.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: The train and test sets.
    """
    logging.info("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state)
    logging.info("Data split successfully.")
    return X_train, X_test, y_train, y_test

In [15]:
df = ingest_data(data_source='/home/skhapijulhossen/Desktop/demand-forecast-SQGroup/data/sales_bya.parquet')
df = clean_data(data=df)
targets, static_features, application_group, uses, mkt = encode_and_aggregate_data(data=df)
features, target, imputer = merge_all_features(targets, static_features, application_group, uses, mkt)
X_train, X_test, y_train, y_test = split_data(features=features, target=target)

In [16]:
static_features.groupby(by='outlet_id').mean()[['grade']].reset_index()

Unnamed: 0,outlet_id,grade
0,1001,1.237182
1,1002,1.195289
2,1003,1.352732
3,1004,1.359670
4,1005,1.179050
...,...,...
56,3746,1.302701
57,4151,1.216001
58,4385,1.127496
59,7528,1.081250


In [17]:
df.grade.value_counts(normalize=True)

grade
1    0.821150
2    0.152982
3    0.022640
4    0.003227
Name: proportion, dtype: float64

In [34]:
application_group.to_parquet('application_group.parquet', index=False)
uses.to_parquet('uses.parquet', index=False)
mkt.to_parquet('mkt.parquet', index=False)
static_features.groupby(by='outlet_id').mean()[['ecoind', 'noc']].reset_index().to_parquet('ecoind_noc.parquet', index=False)

In [110]:
X_train.shape, y_train.shape

((27681, 64), (27681,))

In [111]:
X_test.shape, y_test.shape

((9227, 64), (9227,))

In [112]:
features['target'] = target

In [47]:
sm = SmartCorrelatedSelection(variables=None, method='pearson', threshold=0.8, missing_values='ignore', selection_method='missing_values', estimator=None, scoring='r2', cv=5, confirm_variables=False)
best = sm.fit_transform(X_train, y_train)
best.columns

Index(['Rich', 'Industry', 'House Wiring', 'Fan & Lighting Connection',
       'Air Condition & Washing Machine, Heavy Item', 'Earthing',
       'Industry, Machineries', 'Urban', 'Rural', 'Semi Urban', 'Others',
       'wire', 'fy', 'grade', 'noc', 'dfc', 'area_km2', 'literacy_rate_perc',
       'pcx', 'excnts', 'exach', 'trc', 'tlcolt', 'tmtm', 'ecoind', 'sf',
       'sop', 'pminx', 'tms_cr', 'mas', 'kpi', 'timestamp_quarter',
       'timestamp_day_of_week', 'timestamp_day_of_month',
       'timestamp_month_start', 'timestamp_month_end',
       'timestamp_quarter_start', 'timestamp_quarter_end',
       'timestamp_year_start', 'timestamp_year_end', 'net_price_lag_3',
       'qtym_lag_8', 'net_price_lag_16', 'qtym_lag_24', 'qtym_window_24_mean',
       'qtym_expanding_std'],
      dtype='object')

In [48]:
selector = LGBMRegressor()
selector.fit(X_train.values, y_train)
selector.feature_importances_

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008076 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5900
[LightGBM] [Info] Number of data points in the train set: 27681, number of used features: 63
[LightGBM] [Info] Start training from score 526.995830


array([ 63,  46,  35, 117,  21,  27,  32,   0,  15,  49,  24,  34,  31,
        28,  45,  21,  69,  17,  12,  11,  29,  38,  20,  36,  52,  49,
        69,  36,  24, 726,   1,   6,  12,  10, 220, 287,  23,  11,   0,
         0,  16,   7,  25,  13,   0,   0,   0,   0,   0,   0,   0,  83,
        53,  34,  27,  92,  59,  56,  24,  50,  33,  43,  39], dtype=int32)

In [49]:
selected = selector.feature_importances_ > 0
np.where(selected)[0]

array([ 0,  1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 40, 41, 42, 43, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62])

In [50]:
best_features = X_train.columns[np.where(selected)[0]]
best_features

Index(['outlet_id', 'General', 'Moderate', 'Rich', 'Industry', 'House Wiring',
       'Fan & Lighting Connection', 'Lift & Heavy Item', 'Earthing',
       'Industry, Machineries', 'Urban', 'Rural', 'Semi Urban', 'Others',
       'wire', 'rm', 'fy', 'grade', 'noc', 'dfc', 'area_km2', 'population',
       'literacy_rate_perc', 'pcx', 'excnts', 'exach', 'trc', 'tlcolt', 'tmtm',
       'ecoind', 'sf', 'sop', 'pminx', 'tms_cr', 'mas', 'kpi',
       'timestamp_month', 'timestamp_week', 'timestamp_day_of_week',
       'timestamp_day_of_month', 'timestamp_day_of_year', 'net_price_lag_3',
       'qtym_lag_3', 'net_price_lag_8', 'qtym_lag_8', 'net_price_lag_16',
       'qtym_lag_16', 'net_price_lag_24', 'qtym_lag_24',
       'net_price_window_24_mean', 'qtym_window_24_mean',
       'net_price_expanding_std', 'qtym_expanding_std'],
      dtype='object')

In [92]:
best_features = list(set(best.columns).intersection(set(best_features)))
features.shape

(36908, 64)

In [86]:
from sklearn.decomposition import KernelPCA, PCA

def decompose_features_kernel_pca(X, n_components=2, kernel='linear', load=False):
    """
    Decomposes features using PCA.

    Args:
    - X (array-like): Input feature matrix.
    - n_components (int): Number of components (default: 2).

    Returns:
    - X_transformed (array-like): Transformed feature matrix.
    """
    if load:
        pass
    
    # Initialize KernelPCA object
    pca = PCA(n_components=n_components)
    
    # Fit and transform the input feature matrix
    X_transformed = pca.fit_transform(X)

    return X_transformed, pca


<center><b> Experiments </b></center>

In [25]:
def evaluate_model(
    model,
    X: pd.DataFrame,
    y: pd.DataFrame,
) -> Annotated[float, 'r2_score']:
    """
    Evaluate the model
    """
    try:
        y_pred = model.predict(X).reshape(y.shape[0], 1)

        # MAPE, MSE, RMSE, R2, AIC, BIC
        mape = mean_absolute_percentage_error(y, y_pred)
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        mlflow.log_metrics(dict(mape=mape, mse=mse, r2=r2))
        return r2
    except Exception as e:
        raise e

In [26]:
# mlflow.create_experiment('Net Price Forecasting')

In [28]:
mlflow.set_experiment('Net Price Forecasting')
with mlflow.start_run() as pr:
    XGB_PARAMS_SPACE = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.3, 0.5],
        'n_estimators': [100, 200, 300],
        'subsample': [0.5, 0.7, 0.9],
        'colsample_bytree': [0.5, 0.7, 0.9],
        'gamma': [0, 0.1, 0.2, 0.01]
    }
#     with mlflow.start_run(run_name="XGBoost Run", nested=True) as xgb_run:
#         # Randomized Grid Search for XGBoost hyperparameters
#         grid = RandomizedSearchCV(
#             XGBRFRegressor(),
#             param_distributions=XGB_PARAMS_SPACE,
#             scoring='r2',
#             cv=5,
#             verbose=0
#         )
#         grid.fit(X_train[best_features].values, y_train)
#         model = grid.best_estimator_
#         mlflow.log_params(grid.best_params_)
#         mlflow.log_param('components', len(best_features))
#         mlflow.log_param('columns', best_features)
#         ### log metrics
#         r2 = evaluate_model(grid.best_estimator_, X_test[best_features].values, y_test)
#         if r2 > 0.7:
#             mlflow.xgboost.log_model(model, f'XGB-QTYM')
    
    LGB_PARAMS_SPACE = {
        'boosting_type': ['gbdt', 'dart', 'goss'],
        'num_leaves': [20, 30, 40, 50, 60],
        'max_depth': [5, 7, 9, 11, 15, 21],  # -1 means no limit
        'learning_rate': [0.01, 0.05, 0.1, 0.3],
        'n_estimators': [50, 100, 200, 300],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'reg_alpha': [0.0, 0.1, 0.5, 1.0, 0.01],
        'reg_lambda': [0.0, 0.1, 0.5, 1.0],
        'min_child_samples': [20, 30, 40, 50, 60],
        'random_state': [42]  # Ensure reproducibility
    }
    with mlflow.start_run(run_name="LGB Run", nested=True) as lgb_run:
        # Randomized Grid Search for XGBoost hyperparameters
        grid = RandomizedSearchCV(
            LGBMRegressor(),
            param_distributions=LGB_PARAMS_SPACE,
            scoring='r2',
            cv=5,
            verbose=0
        )
        grid.fit(X_train[best_features].values, y_train)
        model = grid.best_estimator_
        mlflow.log_params(grid.best_params_)
        mlflow.log_param('components', len(best_features))
        mlflow.log_param('columns', best_features)

        ### log metrics
        r2 = evaluate_model(grid.best_estimator_, X_test[best_features].values, y_test)
        if r2 > 0.7:
            mlflow.lightgbm.log_model(model, f'LGB-QTYM-{r2}')

       


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001808 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3419
[LightGBM] [Info] Number of data points in the train set: 22144, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 526.203339
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001947 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3415
[LightGBM] [Info] Number of data points in the train set: 22145, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 532.217625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001906 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3418
[LightGBM] [Info] Number of data points in the train set: 

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001816 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3419
[LightGBM] [Info] Number of data points in the train set: 22144, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 526.203339
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002248 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3415
[LightGBM] [Info] Number of data points in the train set: 22145, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 532.217625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001925 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3418
[LightGBM] [Info] Number of data points in the train set: 

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002719 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3418
[LightGBM] [Info] Number of data points in the train set: 22145, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 528.202506
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001990 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3409
[LightGBM] [Info] Number of data points in the train set: 22145, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 524.829758
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001937 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3451
[LightGBM] [Info] Number of data points in the train set: 

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000605 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3419
[LightGBM] [Info] Number of data points in the train set: 22144, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 526.203339
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000573 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3415
[LightGBM] [Info] Number of data points in the train set: 22145, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 532.217625
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000963 seconds.
You can set `force_row

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000617 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3409
[LightGBM] [Info] Number of data points in the train set: 22145, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 524.829758
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000596 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3451
[LightGBM] [Info] Number of data points in the train set: 22145, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 523.525887
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000581 seconds.
You can set `force_row

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000532 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3415
[LightGBM] [Info] Number of data points in the train set: 22145, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 532.217625
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3418
[LightGBM] [Info] Number of data points in the train set: 22145, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 528.202506
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000552 seconds.
You can set `force_row

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001993 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3451
[LightGBM] [Info] Number of data points in the train set: 22145, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 523.525887
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002007 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3419
[LightGBM] [Info] Number of data points in the train set: 22144, number of used features: 38
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 526.203339
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001898 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3415
[LightGBM] [Info] Number of data points in the train set: 

In [113]:
import mlflow


In [114]:
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: LGB-QTYM-0.7456832147065215
  flavor: mlflow.lightgbm
  run_id: ae501105c42f4987b526ea740568507a

In [115]:
featuresN = ['net_price_lag_16', 'Earthing', 'Urban', 'Others', 'grade', 'pminx', 'excnts', 'Fan & Lighting Connection', 'kpi', 'sop', 'Rural', 'exach', 'qtym_window_24_mean', 'Rich', 'tmtm', 'Semi Urban', 'tlcolt', 'timestamp_day_of_month', 'sf', 'mas', 'noc', 'timestamp_day_of_week', 'fy', 'ecoind', 'tms_cr', 'literacy_rate_perc', 'area_km2', 'qtym_expanding_std', 'qtym_lag_24', 'pcx', 'Industry', 'dfc', 'House Wiring', 'wire', 'net_price_lag_3', 'Industry, Machineries', 'qtym_lag_8', 'trc']

In [116]:
features[featuresN+['target', 'outlet_id', 'timestamp']].to_parquet('processed.parquet')

In [118]:
df = pd.read_parquet('processed.parquet')
df.loc[df.outlet_id==1001, featuresN].shape

(829, 38)

In [119]:
def load_data_by_outlet(outlet_id, DATA):
    # Assuming your model has a function to load data based on outlet_id
    filtered = DATA.loc[DATA.outlet_id==outlet_id]
    return filtered[best_features], filtered['target']

In [120]:
x, y = load_data_by_outlet(1001, df)

In [121]:
loaded_model.predict(df[best_features])



array([1589.31018984,  200.3760891 ,  879.93367635, ...,   98.30341263,
        577.34312924,  143.15133717])

In [122]:
df.target

0        1596.651245
1         199.931656
2         876.108337
3        2332.785156
4         977.241394
            ...     
36903     646.000000
36904     157.666672
36905      94.000000
36906     593.571411
36907     150.000000
Name: target, Length: 36908, dtype: float32

In [77]:
type(loaded_model)

mlflow.pyfunc.PyFuncModel

In [79]:
loaded_model.feature_importances_

AttributeError: 'PyFuncModel' object has no attribute 'feature_importances_'

In [82]:
LGBMRegressor().fit(x.values, y).feature_importances_

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1919
[LightGBM] [Info] Number of data points in the train set: 829, number of used features: 13
[LightGBM] [Info] Start training from score 755.026792


array([ 124,    0,    0,    0,   66,    0,    0,    0,    0,    0,    0,
          0,  142,    0, 1482,    0,    0,  146,    0,    0,    0,   50,
         28,    0,    0,    0,    0,  241,  105,    8,    0,    0,    0,
         85,  385,    0,  127,    0], dtype=int32)

In [None]:
selected = selector.feature_importances_ > 0
np.where(selected)[0]