In [1]:
# !pip install

In [2]:
import warnings

warnings.filterwarnings("ignore")
import logging
# import pysnooper
from typing import List, Tuple
import numpy as np
import pandas as pd
from scipy import stats
import geopandas as gpd
from shapely.geometry import Point

from matplotlib import pyplot as plt
import seaborn as sns

import mlflow
import torch
import lightning as L
from torch import nn
from lightning.pytorch import seed_everything
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
from torchmetrics import ExplainedVariance, MeanAbsoluteError, MeanSquaredError, MeanAbsolutePercentageError, R2Score
# from lightning.pytorch.loggers import MLFlowLogger

from sklearn.model_selection import train_test_split

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.backends else 'cpu')


# 1. Data Engineering

In [3]:
df = pd.read_json('train_data.json')
df

Unnamed: 0,hash,targetAudience,points,value
0,41567f28db47bee7,"{'name': 'All 25-45 BC', 'gender': 'all', 'age...","[{'lat': '55.573691', 'lon': '37.631423', 'azi...",23.51
1,94b6df335598a161,"{'name': 'W 30-60', 'gender': 'female', 'ageFr...","[{'lat': '55.656665886902', 'lon': '37.7408534...",1.20
2,2ef4e73f0d2c51d0,"{'name': 'W 30-60', 'gender': 'female', 'ageFr...","[{'lat': '55.680953807163', 'lon': '37.6644265...",4.65
3,b3ebb77f965de304,"{'name': 'W 30+ BC', 'gender': 'female', 'ageF...","[{'lat': '55.741072317672', 'lon': '37.6526972...",32.09
4,76059b6cbb303166,"{'name': 'W 30+ BC', 'gender': 'female', 'ageF...","[{'lat': '55.656665886902', 'lon': '37.7408534...",26.12
...,...,...,...,...
1542,75563c4cce5ca6c0,"{'name': 'All 30-55 BC', 'gender': 'all', 'age...","[{'lat': '55.961879', 'lon': '37.333988', 'azi...",20.55
1543,ed4353d7c33dd21c,"{'name': '35-100 C', 'gender': 'all', 'ageFrom...","[{'lat': '55.752846850282', 'lon': '37.5875834...",59.32
1544,77a82843b71a77ec,"{'name': '35-100 C', 'gender': 'all', 'ageFrom...","[{'lat': '55.752846850282', 'lon': '37.5875834...",41.30
1545,87b45be3fae91c39,"{'name': 'M 25-55 ', 'gender': 'male', 'ageFro...","[{'lat': '55.662899746091', 'lon': '37.4817413...",59.15


## Clasterizer

In [4]:
class Clusterizer:
    def __init__(self):
        self.config = {'min_xval': 55.55, 'max_xval': 55.95, 'min_yval': 37.3, 'max_yval': 37.9, 'x_ngroups': 3, 'y_ngroups': 3}
        # get x-axis grid
        self.x_intervals = self.split_on_intervals(
            self.config['min_xval'], self.config['max_xval'], self.config['x_ngroups']
        )
        # get y-axis grid
        self.y_intervals = self.split_on_intervals(
            self.config['min_yval'], self.config['max_yval'], self.config['y_ngroups']
        )
        # get 2-d grid
        self.groups = self.create_groups(self.x_intervals, self.y_intervals)
        self.n_groups = len(self.groups)
    
    def split_on_intervals(self, min_val, max_val, n):
        # Делит отрезок на равные интервалы
        step = (max_val - min_val) / n
        intervals = [min_val + (step * x) for x in range(n + 1)]
        return intervals
    
    @classmethod
    def create_groups(cls, x_intervals, y_intervals):
        #Создает регионы для поля
        groups = {}
        x_intervals = np.concatenate([[-np.inf], x_intervals, [np.inf]])
        y_intervals = np.concatenate([[-np.inf], y_intervals, [np.inf]])
    
        for x_i in range(len(x_intervals) - 1):
            for y_i in range(len(y_intervals) - 1):
                groups[
                    f'x : {x_intervals[x_i]} - {x_intervals[x_i + 1]} | y : {y_intervals[y_i]} - {y_intervals[y_i + 1]}'] = 0
    
        return groups
    
    @classmethod
    def sort_on_groups(cls, x_vals, y_vals, x_intervals, y_intervals, groups, only_vals=False):
        #Сортирует точки по регионам
        for x, y in zip(x_vals, y_vals):
            for x_i in range(len(x_intervals) - 1):
                for y_i in range(len(y_intervals) - 1):
                    if ((x_intervals[x_i] <= x < x_intervals[x_i + 1]) and (y_intervals[y_i] <= y < y_intervals[y_i + 1])):
                        groups[
                            f'x : {x_intervals[x_i]} - {x_intervals[x_i + 1]} | y : {y_intervals[y_i]} - {y_intervals[y_i + 1]}'] += 1
    
        if only_vals:
            return list(groups.values())
    
        return groups
    
    def assign_clusters(self, row):
        points = np.array([[float(x['lat']), float(x['lon'])] for x in row])
        group_values = self.sort_on_groups(
            points[:, 0], points[:, 1], self.x_intervals, self.y_intervals, self.groups.copy(), only_vals=True
        )
        
        return group_values

    def clusters_distribution(self, df: pd.DataFrame) -> pd.DataFrame:
        cluster_columns = [f'cluster_{i}' for i in range(self.n_groups)]
    
        df_clusters = df.apply(lambda row: self.assign_clusters(row['points']), axis=1)
        df_clusters = pd.DataFrame(df_clusters.tolist(), columns=cluster_columns)
    
        return pd.concat([df, df_clusters], axis=1)
        

In [5]:
def cclusters_distribution(clusterizer, df) -> pd.DataFrame:
    cluster_columns = [f'cluster_{i}' for i in range(clusterizer.n_groups)]

    df_clusters = df.apply(lambda row: clusterizer.assign_clusters(row['points']), axis=1)
    df_clusters = pd.DataFrame(df_clusters.tolist(), columns=cluster_columns)

    return pd.concat([df, df_clusters], axis=1)

In [6]:
# test
# Clusterizer().assign_clusters(df['points'].iloc[0])
clusterizer = Clusterizer()

In [7]:
cclusters_distribution(clusterizer, df)

Unnamed: 0,hash,targetAudience,points,value,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,...,cluster_15,cluster_16,cluster_17,cluster_18,cluster_19,cluster_20,cluster_21,cluster_22,cluster_23,cluster_24
0,41567f28db47bee7,"{'name': 'All 25-45 BC', 'gender': 'all', 'age...","[{'lat': '55.573691', 'lon': '37.631423', 'azi...",23.51,0,0,0,0,0,0,...,0,4,3,3,0,0,0,0,0,0
1,94b6df335598a161,"{'name': 'W 30-60', 'gender': 'female', 'ageFr...","[{'lat': '55.656665886902', 'lon': '37.7408534...",1.20,0,0,0,0,0,0,...,0,1,2,0,0,0,0,0,0,0
2,2ef4e73f0d2c51d0,"{'name': 'W 30-60', 'gender': 'female', 'ageFr...","[{'lat': '55.680953807163', 'lon': '37.6644265...",4.65,0,0,0,0,0,0,...,0,2,2,0,0,0,0,0,0,0
3,b3ebb77f965de304,"{'name': 'W 30+ BC', 'gender': 'female', 'ageF...","[{'lat': '55.741072317672', 'lon': '37.6526972...",32.09,0,0,0,0,0,0,...,0,10,10,0,0,0,0,0,0,0
4,76059b6cbb303166,"{'name': 'W 30+ BC', 'gender': 'female', 'ageF...","[{'lat': '55.656665886902', 'lon': '37.7408534...",26.12,0,0,0,0,0,0,...,0,2,3,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,75563c4cce5ca6c0,"{'name': 'All 30-55 BC', 'gender': 'all', 'age...","[{'lat': '55.961879', 'lon': '37.333988', 'azi...",20.55,0,0,0,0,0,0,...,0,5,2,1,0,0,0,0,0,0
1543,ed4353d7c33dd21c,"{'name': '35-100 C', 'gender': 'all', 'ageFrom...","[{'lat': '55.752846850282', 'lon': '37.5875834...",59.32,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1544,77a82843b71a77ec,"{'name': '35-100 C', 'gender': 'all', 'ageFrom...","[{'lat': '55.752846850282', 'lon': '37.5875834...",41.30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1545,87b45be3fae91c39,"{'name': 'M 25-55 ', 'gender': 'male', 'ageFro...","[{'lat': '55.662899746091', 'lon': '37.4817413...",59.15,0,0,0,0,0,0,...,0,4,4,0,0,0,0,0,0,0


## Data Preprocessor

In [30]:
class DataPreprocessor:
    def __init__(self, clusterizer, df: pd.DataFrame=None):
        self.df = df
        self.moscow_centre_coordinates = [55.751244, 37.618423]
        self.clusterizer = clusterizer
    
    @staticmethod
    def calculate_distances(row, centre_coordinates: List[float]) -> List[float]:
        distances = [
            float(gpd.GeoSeries(Point(float(point['lat']), float(point['lon']))) \
                  .distance(Point(centre_coordinates))) for point in row
        ]
        return distances
    
    @staticmethod
    def inner_cluster_stats(df: pd.DataFrame, cluster_centre=None) -> pd.DataFrame:
        """
        Accepts DataFrame with geo points and returns statistics of this cluster of points
        :param df: pd.DataFrame with two columns: `lat` and `lon`
        :param cluster_centre: centre of the geo cluster that allows for more precise feature engineering
        :return: pd.DataFrame with statistics
        """
        # Ensure the columns are of type float
        df['lat'] = df['lat'].astype(float)
        df['lon'] = df['lon'].astype(float)
        
        # Ensure we always have a cluster centre 
        if not cluster_centre:
            cluster_centre = (df['lat'].mean(), df['lon'].mean())
    
        # Calculate statistics
        cluster_stats = {
            'mean_lat': df['lat'].mean(),
            'mean_lon': df['lon'].mean(),
            'count': df.shape[0],
            # features relative to cluster centre
            'c_mean_lat_trimmed': np.mean(df['lat'] - cluster_centre[0], 0.1),
            'c_mean_lon_trimmed': np.std(df['lon'] - cluster_centre[1]),
            'c_std_lat_trimmed': np.std(df['lat'] - cluster_centre[0]),
            'c_std_lon_trimmed': np.std(df['lon'] - cluster_centre[1]),
            #todo: add Mahalanobis distance
        }
        return pd.DataFrame([cluster_stats])
    
    def _add_distances_function_column(
            self,
            points_col: pd.Series,
            centre_coordinates: List[float],
            func = np.mean,
            **kwargs
    ):
        x = points_col.apply(
            lambda row: func(self.calculate_distances(row, centre_coordinates), **kwargs)
        )
        return x

    def apply_clusters_distribution(self, df) -> pd.DataFrame:
        cluster_columns = [f'cluster_{i}' for i in range(self.clusterizer.n_groups)]
    
        df_clusters = df.apply(lambda row: self.clusterizer.assign_clusters(row['points']), axis=1)
        df_clusters = pd.DataFrame(df_clusters.tolist(), columns=cluster_columns)
    
        return pd.concat([df, df_clusters], axis=1)
    
    def msc_centre_statistics(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculates within-groups statistics: mean distance from the city centre within cluster, number of points in cluster, dispersion of distance between dotes in a cluster, number of boards oriented north, number of boards oriented west, etc.
        :param df: pd.DataFrame with columns `lan`, `lon`, `cluster_1`, `cluster_2`, etc.
        :return: modified pd.DataFrame with added features
        """
        df_res = (
            df
            .assign(
                distance_msc_centre_mean=lambda df_: self._add_distances_function_column(
                    df_['points'], self.moscow_centre_coordinates, np.mean
                )
            )
            .assign(
                distance_msc_centre_median=lambda df_: self._add_distances_function_column(
                    df_['points'], self.moscow_centre_coordinates, np.median
                )
            )
            .assign(
                distance_msc_centre_std=lambda df_: self._add_distances_function_column(
                    df_['points'], self.moscow_centre_coordinates, np.std
                )
            )
            .assign(
                distance_msc_centre_mean_trim=lambda df_: self._add_distances_function_column(
                    df_['points'], self.moscow_centre_coordinates,
                    stats.trim_mean, proportiontocut=0.1
                )
            )
        )
        return df_res
    
    def preprocess(self) -> pd.DataFrame:
        df_clusters = (
            pd.concat([self.df, pd.json_normalize(self.df['targetAudience'])], axis=1)
            .pipe(self.msc_centre_statistics)
            .pipe(self.apply_clusters_distribution)
            
            .assign(salary_a=lambda df_: df_['income'].apply(lambda x: 1 if 'a' in x else 0))
            .assign(salary_b=lambda df_: df_['income'].apply(lambda x: 1 if 'c' in x else 0))
            .assign(salary_c=lambda df_: df_['income'].apply(lambda x: 1 if 'b' in x else 0))
            .assign(male=lambda df_: df_['income'].apply(lambda x: 1 if 'c' in x else 0))
            .assign(female=lambda df_: df_['gender'].apply(lambda x: 1 if x in ['female', 'all'] else 0))
            .assign(num_points=lambda df_: df_['points'].apply(lambda l: len(l)))
            .drop(columns=['hash', 'targetAudience', 'points', 'income', 'name', 'gender', 'id'])
        )
        return df_clusters

processor = DataPreprocessor(clusterizer=clusterizer, df=df)
data = processor.preprocess()

In [31]:
data

Unnamed: 0,value,ageFrom,ageTo,distance_msc_centre_mean,distance_msc_centre_median,distance_msc_centre_std,distance_msc_centre_mean_trim,cluster_0,cluster_1,cluster_2,...,cluster_21,cluster_22,cluster_23,cluster_24,salary_a,salary_b,salary_c,male,female,num_points
0,23.51,25,45,0.207808,0.214798,0.029227,0.209562,0,0,0,...,0,0,0,0,0,1,1,1,1,25
1,1.20,30,60,0.134834,0.145424,0.039100,0.135624,0,0,0,...,0,0,0,0,1,1,1,1,1,24
2,4.65,30,60,0.122133,0.114161,0.044681,0.122618,0,0,0,...,0,0,0,0,1,1,1,1,1,34
3,32.09,30,100,0.134386,0.127367,0.061051,0.131630,0,0,0,...,0,0,0,0,0,1,1,1,1,160
4,26.12,30,100,0.122318,0.110999,0.043082,0.122508,0,0,0,...,0,0,0,0,0,1,1,1,1,44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,20.55,30,55,0.241243,0.226587,0.057310,0.237754,0,0,0,...,0,0,0,0,0,1,1,1,1,27
1543,59.32,35,100,0.154092,0.109710,0.148934,0.123342,0,0,0,...,0,0,0,0,0,1,0,1,1,24
1544,41.30,35,100,0.195301,0.162747,0.171680,0.177899,0,0,0,...,0,0,0,0,0,1,0,1,1,15
1545,59.15,25,55,0.157045,0.167795,0.066993,0.159098,0,0,0,...,0,0,0,0,1,1,1,1,0,37


# 2. Feature Selection

In [32]:
X, y = data.drop(columns=['value']), data['value']

In [33]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

def feature_selection_with_boosting(X: pd.DataFrame, y: pd.Series, random_state: int = 42) -> pd.DataFrame:
    """
    Perform feature selection using CatBoost by comparing existing features to a random feature.
    Remove features that perform worse than the random feature.
    
    :param X: pd.DataFrame with features
    :param y: pd.Series with target variable
    :param random_state: int, random state for reproducibility
    :return: pd.DataFrame with selected features
    """
    # Add a random feature to the dataset
    np.random.seed(random_state)
    _X = X.copy()
    _X['random_feature'] = np.random.random(size=_X.shape[0])

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(_X, y, test_size=0.2, random_state=random_state)

    # Train a CatBoost model
    catboost = CatBoostRegressor(random_state=random_state, verbose=0)
    catboost.fit(X_train, y_train)

    # Get feature importances
    feature_importances = catboost.get_feature_importance()

    # Create a DataFrame for feature importances
    feature_importances_df = pd.DataFrame({
        'feature': _X.columns,
        'importance': feature_importances
    })

    # Get the importance of the random feature
    random_feature_importance = feature_importances_df[feature_importances_df['feature'] == 'random_feature']['importance'].values[0]

    # Select features that have higher importance than the random feature
    selected_features = feature_importances_df[feature_importances_df['importance'] > random_feature_importance]['feature'].tolist()
    
    # Remove the random feature from the selected features if it exists
    if 'random_feature' in selected_features:
        selected_features.remove('random_feature')

    # Return the DataFrame with selected features
    return _X[selected_features]

# Example usage
X_selected = feature_selection_with_boosting(X, y)
print(X_selected.columns)

Index(['ageFrom', 'ageTo', 'distance_msc_centre_median',
       'distance_msc_centre_std', 'distance_msc_centre_mean_trim', 'cluster_6',
       'cluster_7', 'cluster_8', 'cluster_11', 'cluster_12', 'cluster_13',
       'cluster_16', 'cluster_17', 'cluster_18', 'num_points'],
      dtype='object')


# 3. Models Definition & Hyperparams Optimization

In [34]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingRegressor
import pytorch_lightning as pl
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import optuna

In [35]:
# Define a small neural network using PyTorch Lightning
import pytorch_lightning as pl
class SmallNN(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SmallNN, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(hidden_dim, output_dim),
        )
        self.criterion = nn.MSELoss()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)

    def predict(self, X):
        self.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X.values, dtype=torch.float32)
            predictions = self(X_tensor).numpy()
        return predictions


def train_nn(X, y, input_dim, hidden_dim, output_dim, epochs=10):
    dataset = TensorDataset(torch.tensor(X.values, dtype=torch.float32), torch.tensor(y, dtype=torch.float32))
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    model = SmallNN(input_dim, hidden_dim, output_dim)
    trainer = pl.Trainer(max_epochs=epochs)
    trainer.fit(model, dataloader)
    return model


In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) #[X_selecteds.columns]
# X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.3)
X_train

Unnamed: 0,ageFrom,ageTo,distance_msc_centre_mean,distance_msc_centre_median,distance_msc_centre_std,distance_msc_centre_mean_trim,cluster_0,cluster_1,cluster_2,cluster_3,...,cluster_21,cluster_22,cluster_23,cluster_24,salary_a,salary_b,salary_c,male,female,num_points
395,30,99,0.121042,0.117723,0.092874,0.113683,0,0,0,0,...,0,0,0,0,0,1,0,1,1,66
1246,13,25,0.128692,0.081467,0.108036,0.119197,0,0,0,0,...,0,0,0,0,1,1,1,1,1,114
787,18,45,0.203304,0.228690,0.088858,0.203543,0,0,0,0,...,0,0,0,0,1,1,1,1,1,19
1205,25,45,0.093301,0.066569,0.068060,0.086717,0,0,0,0,...,0,0,0,0,0,1,1,1,1,98
144,25,55,0.124705,0.125315,0.049312,0.124168,0,0,0,0,...,0,0,0,0,0,1,1,1,0,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558,25,45,0.114433,0.117147,0.052382,0.113043,0,0,0,0,...,0,0,0,0,0,1,1,1,1,62
1385,18,100,0.135400,0.120580,0.103208,0.122879,0,0,0,0,...,0,0,0,0,1,1,1,1,1,89
1399,25,50,0.120501,0.120303,0.055563,0.117683,0,0,0,0,...,0,0,0,0,0,1,1,1,1,170
1195,18,100,0.164128,0.148807,0.103414,0.148783,0,0,0,0,...,0,0,0,0,1,1,1,1,1,100


In [26]:
# Define the objective functions for Optuna
def objective_lr(trial, X, y):
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    model = LinearRegression(fit_intercept=fit_intercept)
    score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
    return score

def objective_catboost(trial, X, y):
    depth = trial.suggest_int('depth', 4, 6)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.3)
    iterations = trial.suggest_int('iterations', 100, 200)
    model = CatBoostRegressor(depth=depth, learning_rate=learning_rate, iterations=iterations, verbose=0, loss_function='RMSE')
    score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1).mean()
    return score

def objective_knn(trial, X, y):
    n_neighbors = trial.suggest_int('n_neighbors', 3, 7)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    model = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
    score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
    return score

In [27]:
# Optimize hyperparameters using Optuna
study_lr = optuna.create_study(direction='maximize')
study_lr.optimize(lambda trial: objective_lr(trial, X_selected, y), n_trials=50)

study_catboost = optuna.create_study(direction='maximize')
study_catboost.optimize(lambda trial: objective_catboost(trial, X_selected, y), n_trials=50)

study_knn = optuna.create_study(direction='maximize')
study_knn.optimize(lambda trial: objective_knn(trial, X_selected, y), n_trials=50)

[I 2024-07-06 16:34:21,084] A new study created in memory with name: no-name-286bfb95-9cec-49e1-b9b3-fab8558ae48e
[I 2024-07-06 16:34:21,093] Trial 0 finished with value: -572.5785422926549 and parameters: {'fit_intercept': False}. Best is trial 0 with value: -572.5785422926549.
[I 2024-07-06 16:34:21,101] Trial 1 finished with value: -572.123459846555 and parameters: {'fit_intercept': True}. Best is trial 1 with value: -572.123459846555.
[I 2024-07-06 16:34:21,110] Trial 2 finished with value: -572.123459846555 and parameters: {'fit_intercept': True}. Best is trial 1 with value: -572.123459846555.
[I 2024-07-06 16:34:21,119] Trial 3 finished with value: -572.123459846555 and parameters: {'fit_intercept': True}. Best is trial 1 with value: -572.123459846555.
[I 2024-07-06 16:34:21,127] Trial 4 finished with value: -572.5785422926549 and parameters: {'fit_intercept': False}. Best is trial 1 with value: -572.123459846555.
[I 2024-07-06 16:34:21,135] Trial 5 finished with value: -572.1234

# 4. Resulting Ensemble

In [64]:
# Get the best models
best_lr = LinearRegression(**study_lr.best_params)
best_catboost = CatBoostRegressor(**study_catboost.best_params, verbose=0)
best_knn = KNeighborsRegressor(**study_knn.best_params)

# Train the neural network
input_dim = X_train.shape[1]
hidden_dim = 64
output_dim = 1
nn_model = train_nn(X_train, y_train, input_dim, hidden_dim, output_dim)

# Define the ensemble model
ensemble = VotingRegressor(estimators=[
    ('catboost', best_catboost),
    ('knn', best_knn)
], weights=[0.8, 0.2]
)

# Fit the ensemble model
ensemble.fit(X_train, y_train)

2024-07-06 16:40:18,607 - pytorch_lightning.utilities.rank_zero - INFO - GPU available: True (mps), used: True
2024-07-06 16:40:18,607 - pytorch_lightning.utilities.rank_zero - INFO - TPU available: False, using: 0 TPU cores
2024-07-06 16:40:18,608 - pytorch_lightning.utilities.rank_zero - INFO - IPU available: False, using: 0 IPUs
2024-07-06 16:40:18,608 - pytorch_lightning.utilities.rank_zero - INFO - HPU available: False, using: 0 HPUs
2024-07-06 16:40:18,612 - pytorch_lightning.callbacks.model_summary - INFO - 
  | Name      | Type       | Params
-----------------------------------------
0 | model     | Sequential | 2.5 K 
1 | criterion | MSELoss    | 0     
-----------------------------------------
2.5 K     Trainable params
0         Non-trainable params
2.5 K     Total params
0.010     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

2024-07-06 16:40:19,699 - pytorch_lightning.utilities.rank_zero - INFO - `Trainer.fit` stopped: `max_epochs=10` reached.


# 5. Test Estimation

In [65]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

In [66]:
# Predict with the ensemble model
best_catboost.fit(X_train, y_train)#, eval_set=(X_test, y_test)
# best_lr.fit(X_train, y_train)
best_knn.fit(X_train, y_train)
ensemble
# nn_model
y_pred = ensemble.predict(X_test)

In [67]:
print('MAPE: ', mean_absolute_percentage_error(y_pred, y_test))
print('R2 Score: ', r2_score(y_pred, y_test))
print('MAE: ', mean_absolute_error(y_pred, y_test))

MAPE:  0.8817380216652547
R2 Score:  0.7914984493560924
MAE:  6.1908584816597525
