In [1]:
# !pip install

In [2]:
import warnings

warnings.filterwarnings("ignore")
import logging
import joblib
# import pysnooper
from typing import List, Tuple
import numpy as np
import pandas as pd
from scipy import stats
import geopandas as gpd
from shapely.geometry import Point

from matplotlib import pyplot as plt
import seaborn as sns

import mlflow
import torch
import lightning as L
from torch import nn
from lightning.pytorch import seed_everything
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
from torchmetrics import ExplainedVariance, MeanAbsoluteError, MeanSquaredError, MeanAbsolutePercentageError, R2Score
# from lightning.pytorch.loggers import MLFlowLogger

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
device = torch.device(
    'mps' if torch.backends.mps.is_available() else 'cuda' if torch.backends else 'cpu'
)

# 1. Data Engineering

In [3]:
df = pd.read_json('../../data/train_data.json')
df.head()

Unnamed: 0,hash,targetAudience,points,value
0,41567f28db47bee7,"{'name': 'All 25-45 BC', 'gender': 'all', 'age...","[{'lat': '55.573691', 'lon': '37.631423', 'azi...",23.51
1,94b6df335598a161,"{'name': 'W 30-60', 'gender': 'female', 'ageFr...","[{'lat': '55.656665886902', 'lon': '37.7408534...",1.2
2,2ef4e73f0d2c51d0,"{'name': 'W 30-60', 'gender': 'female', 'ageFr...","[{'lat': '55.680953807163', 'lon': '37.6644265...",4.65
3,b3ebb77f965de304,"{'name': 'W 30+ BC', 'gender': 'female', 'ageF...","[{'lat': '55.741072317672', 'lon': '37.6526972...",32.09
4,76059b6cbb303166,"{'name': 'W 30+ BC', 'gender': 'female', 'ageF...","[{'lat': '55.656665886902', 'lon': '37.7408534...",26.12


## Clusterizer

In [7]:
class Clusterizer:
    def __init__(self):
        self.config = {'min_xval': 55.55, 'max_xval': 55.95, 'min_yval': 37.3, 'max_yval': 37.9, 'x_ngroups': 5, 'y_ngroups': 5}
        # get x-axis grid
        self.x_intervals = self.split_on_intervals(
            self.config['min_xval'], self.config['max_xval'], self.config['x_ngroups']
        )
        # get y-axis grid
        self.y_intervals = self.split_on_intervals(
            self.config['min_yval'], self.config['max_yval'], self.config['y_ngroups']
        )
        # get 2-d grid
        self.groups = self.create_groups(self.x_intervals, self.y_intervals)
        self.n_groups = len(self.groups)
    
    def split_on_intervals(self, min_val, max_val, n):
        # Делит отрезок на равные интервалы
        step = (max_val - min_val) / n
        intervals = [min_val + (step * x) for x in range(n + 1)]
        return intervals
    
    @classmethod
    def create_groups(cls, x_intervals, y_intervals):
        #Создает регионы для поля
        groups = {}
        x_intervals = np.concatenate([[-np.inf], x_intervals, [np.inf]])
        y_intervals = np.concatenate([[-np.inf], y_intervals, [np.inf]])
    
        for x_i in range(len(x_intervals) - 1):
            for y_i in range(len(y_intervals) - 1):
                groups[
                    f'x : {x_intervals[x_i]} - {x_intervals[x_i + 1]} | y : {y_intervals[y_i]} - {y_intervals[y_i + 1]}'] = 0
    
        return groups
    
    @classmethod
    def sort_on_groups(cls, x_vals, y_vals, x_intervals, y_intervals, groups, only_vals=False):
        # Сортирует точки по регионам
        for x, y in zip(x_vals, y_vals):
            for x_i in range(len(x_intervals) - 1):
                for y_i in range(len(y_intervals) - 1):
                    if ((x_intervals[x_i] <= x < x_intervals[x_i + 1]) and
                            (y_intervals[y_i] <= y < y_intervals[y_i + 1])):
                        groups[
                            f'x : {x_intervals[x_i]} - {x_intervals[x_i + 1]} | y : {y_intervals[y_i]} - {y_intervals[y_i + 1]}'] += 1
        
        if only_vals:
            return list(groups.values())
        return groups
    
    def clusters_centres(self):
        pass
    
    def cluster_statistics(self, df, cluster_centre = None):
        if not cluster_centre:
            cluster_centre = (df['lat'].mean(), df['lon'].mean())
    
        # Calculate statistics
        cluster_stats = {
            'mean_lat': df['lat'].mean(),
            'mean_lon': df['lon'].mean(),
            'count': df.shape[0],
            # features relative to cluster centre
            'c_mean_lat_trimmed': stats.trim_mean(df['lat'] - cluster_centre[0], 0.1),
            'c_mean_lon_trimmed': stats.trim_mean(df['lon'] - cluster_centre[1], 0.1),
            'c_std_lat': np.std(df['lat'] - cluster_centre[0]),
            'c_std_lon': np.std(df['lon'] - cluster_centre[1]),
            #todo: add distance across 1st principal component
        }
        return pd.DataFrame([cluster_stats])

    def apply_cluster_statistics(self, row: List) -> pd.DataFrame:
        """
        Calculates statistics for all clusters, samples of which are present in `row`
        :param row: list of dictionaries of coordinates
        :return: DataFrame with new statistics
        """
        # iterate over every geo-point and assign it to a cluster
        groups = self.groups
        for key in groups:
            groups[key] = []
    
        points = np.array([[float(x['lat']), float(x['lon'])] for x in row])
        for x, y in zip(points[:, 0], points[:, 1]):
            for x_i in range(len(self.x_intervals) - 1):
                for y_i in range(len(self.y_intervals) - 1):
                    if ((self.x_intervals[x_i] <= x < self.x_intervals[x_i + 1]) and
                            (self.y_intervals[y_i] <= y < self.y_intervals[y_i + 1])):
                        group_name = f'x : {self.x_intervals[x_i]} - {self.x_intervals[x_i + 1]} | y : {self.y_intervals[y_i]} - {self.y_intervals[y_i + 1]}'
                        groups[group_name].append((x, y))

        # iterate over every cluster and calculate statistics
        group_stats = pd.DataFrame()
        for idx, group in enumerate(groups):
            points = groups[group]
            if points:  # if group isn't empty
                group_stats = pd.concat(
                    [
                        group_stats,
                        (
                            self.cluster_statistics(pd.DataFrame(points, columns=['lat', 'lon']))
                            .rename(columns=lambda x: f'cluster_{idx}_{x}')
                        )
                    ], axis=1
                )
        if len(group_stats):
            return group_stats.iloc[0]
        else:
            logger.info("No stats gathered")
            return None
    
    def assign_clusters(self, row: List) -> List:
        """
        Returns number of entries per cluster
        :param row: List with points dictionaries
        :return: List with number of geo-points per cluster
        """
        points = np.array([[float(x['lat']), float(x['lon'])] for x in row])
        group_values = self.sort_on_groups(
            points[:, 0], points[:, 1], self.x_intervals, self.y_intervals, self.groups.copy(), only_vals=True
        )
        
        return group_values


In [8]:
clusterizer = Clusterizer()

## Data Preprocessor

In [9]:
class DataPreprocessor:
    def __init__(self, clusterizer):
        self.moscow_centre_coordinates = [55.751244, 37.618423]
        self.clusterizer = clusterizer
    
    @staticmethod
    def calculate_distances(row, centre_coordinates: List[float]) -> List[float]:
        distances = [
            float(gpd.GeoSeries(Point(float(point['lat']), float(point['lon']))) \
                  .distance(Point(centre_coordinates))) for point in row
        ]
        return distances
    
    def _add_distances_function_column(
            self,
            points_col: pd.Series,
            centre_coordinates: List[float],
            func = np.mean,
            **kwargs
    ):
        x = points_col.apply(
            lambda row: func(self.calculate_distances(row, centre_coordinates), **kwargs)
        )
        return x

    def apply_clusters_distribution(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Receives pd.DataFrame with column `points` as input and calculates number of times every cluster is met in the list of clusters prescribed to provided `points`.
        :param df: pd.DataFrame with column `points`
        :return: modified pd.DataFrame with additional columns: number of occurrences of every cluster, and other inter-cluster metrics.
        """
        # cluster frequency in every entry
        cluster_columns = [f'cluster_{i}' for i in range(self.clusterizer.n_groups)]
        df_clusters_freq = df.apply(
            lambda row: self.clusterizer.assign_clusters(row['points']), axis=1
        )
        df_clusters = pd.DataFrame(df_clusters_freq.tolist(), columns=cluster_columns)
        
        # inter-cluster features
        df_clusters_stats = df.apply(
            lambda row: self.clusterizer.apply_cluster_statistics(row['points']), axis=1
        )
        return pd.concat([df, df_clusters, df_clusters_stats], axis=1) #df_clusters_stats
    
    def msc_centre_statistics(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculates within-groups statistics: mean distance from the city centre within cluster, number of points in cluster, dispersion of distance between dotes in a cluster, number of boards oriented north, number of boards oriented west, etc.
        :param df: pd.DataFrame with columns `lan`, `lon`, `cluster_1`, `cluster_2`, etc.
        :return: modified pd.DataFrame with added features
        """
        df_res = (
            df
            .assign(
                distance_msc_centre_mean=lambda df_: self._add_distances_function_column(
                    df_['points'], self.moscow_centre_coordinates, np.mean
                )
            )
            .assign(
                distance_msc_centre_median=lambda df_: self._add_distances_function_column(
                    df_['points'], self.moscow_centre_coordinates, np.median
                )
            )
            .assign(
                distance_msc_centre_std=lambda df_: self._add_distances_function_column(
                    df_['points'], self.moscow_centre_coordinates, np.std
                )
            )
            .assign(
                distance_msc_centre_mean_trim=lambda df_: self._add_distances_function_column(
                    df_['points'], self.moscow_centre_coordinates,
                    stats.trim_mean, proportiontocut=0.1
                )
            )
        )
        return df_res
    
    def preprocess(self, df, normalize: bool = True) -> pd.DataFrame:
        df_transformed = (
            pd.concat([df, pd.json_normalize(df['targetAudience'])], axis=1)
            # distance until Moscow city centre
            .pipe(self.msc_centre_statistics)
            # number of geo-points there are in  every cluster
            .pipe(self.apply_clusters_distribution)
            # number of geo-points
            .assign(num_points=lambda df_: df_['points'].apply(lambda l: len(l)))
            
            # social demographics features
            .assign(salary_a=lambda df_: df_['income'].apply(lambda x: 1 if 'a' in x else 0))
            .assign(salary_b=lambda df_: df_['income'].apply(lambda x: 1 if 'c' in x else 0))
            .assign(salary_c=lambda df_: df_['income'].apply(lambda x: 1 if 'b' in x else 0))
            .assign(male=lambda df_: df_['income'].apply(lambda x: 1 if 'c' in x else 0))
            .assign(female=lambda df_: df_['gender'].apply(lambda x: 1 if x in ['female', 'all'] else 0))
            
            .drop(
                columns=['hash', 'targetAudience', 'points', 'income', 'name', 'gender', 'id'],
                errors='ignore'
            )
            .fillna(0)
        )
        if normalize:
            scaler = StandardScaler()
            df_transformed = pd.DataFrame(
                scaler.fit_transform(df_transformed), columns=df_transformed.columns
            )
        return df_transformed

processor = DataPreprocessor(clusterizer=clusterizer)
X = processor.preprocess(df=df.drop(columns=['value']), normalize=False)
y = df['value']

2024-07-07 00:41:52,458 - __main__ - INFO - No stats gathered
2024-07-07 00:41:52,470 - __main__ - INFO - No stats gathered


In [11]:
X.head()

Unnamed: 0,ageFrom,ageTo,distance_msc_centre_mean,distance_msc_centre_median,distance_msc_centre_std,distance_msc_centre_mean_trim,cluster_0,cluster_1,cluster_2,cluster_3,...,cluster_9_c_std_lon,cluster_9_count,cluster_9_mean_lat,cluster_9_mean_lon,num_points,salary_a,salary_b,salary_c,male,female
0,25,45,0.207808,0.214798,0.029227,0.209562,0,0,0,0,...,0.0,1.0,55.626667,37.472993,25,0,1,1,1,1
1,30,60,0.134834,0.145424,0.0391,0.135624,0,0,0,0,...,0.0,1.0,55.624571,37.510975,24,1,1,1,1,1
2,30,60,0.122133,0.114161,0.044681,0.122618,0,0,0,0,...,0.0,1.0,55.624571,37.510975,34,1,1,1,1,1
3,30,100,0.134386,0.127367,0.061051,0.13163,0,0,0,0,...,0.034889,10.0,55.613936,37.485229,160,0,1,1,1,1
4,30,100,0.122318,0.110999,0.043082,0.122508,0,0,0,0,...,0.005376,2.0,55.619954,37.505598,44,0,1,1,1,1


# 2. Feature Selection

In [66]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [67]:
def get_column_indices(df: pd.DataFrame, column_names: list) -> list:
    return [df.columns.get_loc(c) for c in column_names if c in df.columns]

def get_data_feature_types(data, indices):
    df_feat_types = pd.DataFrame(
        data.iloc[:, indices].dtypes, columns=["dtype"]
    ).reset_index(names=["feature_name"])
    return df_feat_types

In [68]:
def get_features_importance_rand_feat(
        X_train, y_train, X_valid, y_valid, n_iterations=10
):
    # Initialize a dictionary to store accumulated feature importance
    accumulated_importance = {name: 0 for name in X_train.columns}
    accumulated_importance["random"] = 0

    for _ in tqdm(range(n_iterations)):
        # Add a random feature
        X_train["random"] = np.random.random(size=len(X_train))
        X_valid["random"] = np.random.random(size=len(X_valid))

        model = CatBoostRegressor(
            loss_function="RMSE",
            random_seed=42,
            logging_level="Silent",
            max_depth=6,
            iterations=200,
            early_stopping_rounds=20,
        )
        # Train the model
        model.fit(
            X_train,
            y_train,
            eval_set=(X_valid, y_valid),
        )

        # Get feature importance
        importance = model.feature_importances_

        # Accumulate feature importance
        for name, imp in zip(X_train.columns, importance):
            accumulated_importance[name] += imp

    # Average the feature importance
    features_importance = {
        name: imp / n_iterations for name, imp in accumulated_importance.items()
    }

    return features_importance


def get_random_feat_important_features(X_train, y_train, X_valid, y_valid):
    feat_importance = get_features_importance_rand_feat(
        X_train, y_train, X_valid, y_valid
    )
    feat_importance = (
        pd.DataFrame.from_records(
            [feat_importance],
        )
        .transpose()
        .rename(columns={0: "AVG_Importance"})
    )
    threshold = max(float(feat_importance[feat_importance.index == 'random']["AVG_Importance"].values), 0)
    useful_column_indices = get_column_indices(
        X_train,
        feat_importance.query("AVG_Importance > @threshold")["AVG_Importance"].index.to_list(),
    )
    feat_types = get_data_feature_types(X_train, useful_column_indices)
    logger.info(
        "From {orig_n} feature {tr_n} were selected ({left_perc:.2f}%)."
        "Share of 'Object' type features is: {obj_feat:.2f}%."
        "Set Threshold is {threshold}".format(
            orig_n=len(X_train.columns),
            tr_n=len(useful_column_indices),
            left_perc=len(useful_column_indices) / len(X_train.columns),
            obj_feat=len(feat_types.query("dtype.isin(['object'])")) / len(feat_types),
            threshold=threshold,
        ))
    return useful_column_indices

In [90]:
useful_column_indices = get_random_feat_important_features(X_train, y_train, X_test, y_test)
useful_column_names = X.iloc[:, useful_column_indices].columns
print(useful_column_names)

100%|██████████| 10/10 [00:05<00:00,  1.71it/s]
2024-07-07 00:57:41,651 - __main__ - INFO - From 237 feature 33 were selected (0.14%).Share of 'Object' type features is: 0.00%.Set Threshold is 0.9165358987047872


Index(['ageFrom', 'cluster_12', 'cluster_15', 'cluster_22', 'cluster_37',
       'cluster_38', 'cluster_10_c_mean_lon_trimmed', 'cluster_11_count',
       'cluster_16_c_std_lat', 'cluster_17_c_mean_lat_trimmed',
       'cluster_17_mean_lat', 'cluster_19_count', 'cluster_22_c_std_lon',
       'cluster_22_count', 'cluster_22_mean_lon', 'cluster_23_c_std_lat',
       'cluster_23_count', 'cluster_24_c_std_lon', 'cluster_24_mean_lon',
       'cluster_25_c_std_lat', 'cluster_25_mean_lat', 'cluster_25_mean_lon',
       'cluster_26_mean_lat', 'cluster_30_c_mean_lat_trimmed',
       'cluster_31_c_mean_lat_trimmed', 'cluster_31_mean_lat',
       'cluster_36_c_std_lat', 'cluster_37_mean_lat', 'cluster_38_c_std_lat',
       'cluster_39_c_std_lon', 'cluster_39_mean_lon', 'cluster_9_count',
       'num_points'],
      dtype='object')


# 3. Models Definition & Hyperparams Optimization

In [76]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingRegressor
import pytorch_lightning as pl
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import optuna

In [77]:
# Define a small neural network using PyTorch Lightning
import pytorch_lightning as pl
class SmallNN(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SmallNN, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(hidden_dim, output_dim),
        )
        self.criterion = nn.MSELoss()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)

    def predict(self, X):
        self.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X.values, dtype=torch.float32)
            predictions = self(X_tensor).numpy()
        return predictions


def train_nn(X, y, input_dim, hidden_dim, output_dim, epochs=10):
    dataset = TensorDataset(torch.tensor(X.values, dtype=torch.float32), torch.tensor(y, dtype=torch.float32))
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    model = SmallNN(input_dim, hidden_dim, output_dim)
    trainer = pl.Trainer(max_epochs=epochs)
    trainer.fit(model, dataloader)
    return model


In [91]:
X_train, X_test, y_train, y_test = train_test_split(X[useful_column_names], y, test_size=0.2) #[X_selecteds.columns]
# X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.3)
X_train

Unnamed: 0,ageFrom,cluster_12,cluster_15,cluster_22,cluster_37,cluster_38,cluster_10_c_mean_lon_trimmed,cluster_11_count,cluster_16_c_std_lat,cluster_17_c_mean_lat_trimmed,...,cluster_30_c_mean_lat_trimmed,cluster_31_c_mean_lat_trimmed,cluster_31_mean_lat,cluster_36_c_std_lat,cluster_37_mean_lat,cluster_38_c_std_lat,cluster_39_c_std_lon,cluster_39_mean_lon,cluster_9_count,num_points
1497,18,0,0,0,0,0,0.000000e+00,0.0,0.000000,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,15
599,25,0,0,0,0,0,0.000000e+00,0.0,0.000000,-3.552714e-15,...,0.000000e+00,-4.736952e-15,55.802179,0.0,0.000000,0.000000,0.000000,0.000000,0.0,15
493,25,0,1,5,1,3,0.000000e+00,2.0,0.017381,-5.861197e-04,...,-1.839746e-03,0.000000e+00,55.810208,0.0,55.876289,0.006158,0.000000,0.000000,1.0,116
614,25,0,0,0,0,0,0.000000e+00,0.0,0.000000,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,6
1091,20,0,4,1,3,1,0.000000e+00,3.0,0.000000,0.000000e+00,...,0.000000e+00,-7.105427e-15,55.799372,0.0,55.888984,0.000000,0.000000,0.000000,0.0,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1021,18,0,0,2,0,2,-3.552714e-15,1.0,0.018675,6.634497e-04,...,-8.526513e-15,-2.103435e-03,55.818829,0.0,0.000000,0.004478,0.000000,37.673468,1.0,90
715,35,0,0,6,2,7,4.736952e-15,6.0,0.017563,6.653839e-04,...,-3.435940e-03,-9.391861e-04,55.819936,0.0,55.877439,0.009515,0.015099,37.689745,1.0,160
750,18,0,0,0,0,0,0.000000e+00,0.0,0.000000,0.000000e+00,...,0.000000e+00,0.000000e+00,55.792800,0.0,0.000000,0.000000,0.000000,0.000000,0.0,3
514,45,0,0,4,2,5,-3.552714e-15,1.0,0.018229,1.124216e-03,...,-1.396430e-03,-2.216758e-03,55.816645,0.0,55.875589,0.006827,0.020739,37.688601,1.0,170


In [92]:
# Define the objective functions for Optuna
def objective_lr(trial, X, y):
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    model = LinearRegression(fit_intercept=fit_intercept)
    score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
    return score

def objective_catboost(trial, X, y):
    depth = trial.suggest_int('depth', 4, 6)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.3)
    iterations = trial.suggest_int('iterations', 100, 200)
    model = CatBoostRegressor(depth=depth, learning_rate=learning_rate, iterations=iterations, verbose=0, loss_function='RMSE')
    score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1).mean()
    return score

def objective_knn(trial, X, y):
    n_neighbors = trial.suggest_int('n_neighbors', 6, 40)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    # metric = trial.suggest_categorical('metric', ['mahalanobis', 'minkowski'])
    model = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
    score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
    return score

In [93]:
# Optimize hyperparameters using Optuna
study_lr = optuna.create_study(direction='maximize')
study_lr.optimize(lambda trial: objective_lr(trial, X_selected, y), n_trials=50)

study_catboost = optuna.create_study(direction='maximize')
study_catboost.optimize(lambda trial: objective_catboost(trial, X_selected, y), n_trials=50)

study_knn = optuna.create_study(direction='maximize')
study_knn.optimize(lambda trial: objective_knn(trial, X_selected, y), n_trials=50)

[I 2024-07-07 00:57:53,454] A new study created in memory with name: no-name-26119865-3aa6-44b8-94a9-40896e4ac39d
[I 2024-07-07 00:57:53,464] Trial 0 finished with value: -296.87938550380693 and parameters: {'fit_intercept': True}. Best is trial 0 with value: -296.87938550380693.
[I 2024-07-07 00:57:53,473] Trial 1 finished with value: -294.0015776756984 and parameters: {'fit_intercept': False}. Best is trial 1 with value: -294.0015776756984.
[I 2024-07-07 00:57:53,483] Trial 2 finished with value: -296.87938550380693 and parameters: {'fit_intercept': True}. Best is trial 1 with value: -294.0015776756984.
[I 2024-07-07 00:57:53,494] Trial 3 finished with value: -296.87938550380693 and parameters: {'fit_intercept': True}. Best is trial 1 with value: -294.0015776756984.
[I 2024-07-07 00:57:53,503] Trial 4 finished with value: -296.87938550380693 and parameters: {'fit_intercept': True}. Best is trial 1 with value: -294.0015776756984.
[I 2024-07-07 00:57:53,512] Trial 5 finished with value

# 4. Resulting Ensemble

In [94]:
# from scipy.linalg import inv
# 
# cov_matrix = np.cov(X_train.T)
# # Add a small value to the diagonal to regularize the covariance matrix
# regularization_term = 1e-5
# cov_matrix += np.eye(cov_matrix.shape[0]) * regularization_term
# # Compute the inverse of the regularized covariance matrix
# VI = inv(cov_matrix)
# 
# mahalanobis_knn = KNeighborsRegressor(
#     metric='mahalanobis', metric_params={'VI': VI}, n_neighbors=14
# )
# mahalanobis_knn.fit(X_train, y_train)
# y_pred = mahalanobis_knn.predict(X_test)

In [95]:
# Get the best models
best_lr = LinearRegression(**study_lr.best_params)
best_catboost = CatBoostRegressor(**study_catboost.best_params, verbose=0)
best_knn = KNeighborsRegressor(**study_knn.best_params)

# Train the neural network
# input_dim = X_train.shape[1]
# hidden_dim = 64
# output_dim = 1
# nn_model = train_nn(X_train, y_train, input_dim, hidden_dim, output_dim)

# Define the ensemble model
ensemble = VotingRegressor(
    estimators=[
        ('catboost', best_catboost),
        ('log_reg', best_lr),
        ('knn', best_knn)
    ], weights=[0.6, 0.2, 0.2]
)

In [96]:
# Fit the ensemble model
ensemble.fit(X_train, y_train)

# 5. Test Estimation

In [97]:
from sklearn.metrics import (
    mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error
)

In [98]:
# Predict with the ensemble model
best_catboost.fit(X_train, y_train)#, eval_set=(X_test, y_test)
best_lr.fit(X_train, y_train)
best_knn.fit(X_train, y_train)
# mahalanobis_knn.fit(X_train, y_train)
# ensemble
# nn_model
model = ensemble

In [99]:
y_pred = model.predict(X_test)

In [100]:
print('MAPE: ', mean_absolute_percentage_error(y_pred, y_test))
print('R2 Score: ', r2_score(y_pred, y_test))
print('MAE: ', mean_absolute_error(y_pred, y_test))
print('MSE: ', mean_squared_error(y_pred, y_test))

MAPE:  0.5902432498803922
R2 Score:  0.7188610520517098
MAE:  7.551164656174273
MSE:  121.01497272288186


# 6. Save Model

In [None]:
# joblib.dump(model, '../regressor_model.joblib')

## Example of what values are accepted by fastapi model

In [None]:

# Assuming df is your DataFrame
# Drop the 'value' column and select the first row
row_dict = df.drop(columns=['value']).iloc[0].to_dict()

# Convert the dictionary to a JSON string
row_json = pd.Series(row_dict).to_json()

# Read the JSON string into a DataFrame
result_df = pd.read_json(row_json, typ='series').to_frame().T

result_df

In [None]:
row_json