Cell 1: Imports & Configuration
python


In [1]:
# pip install --upgrade scikit-learn
# pip install xgboost lightgbm
# !pip install catboost
# !pip install lightgbm --config-settings=cmake.define.USE_GPU=ON

In [2]:
# linux only
#!pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com

In [3]:
from sklearn.exceptions import ConvergenceWarning
import warnings
import joblib
from datetime import datetime
import time
from sklearn.inspection import permutation_importance
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    AdaBoostClassifier,
)
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc, classification_report, RocCurveDisplay
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier

from joblib import Memory
from sklearn.inspection import permutation_importance

In [4]:
data_path = "C:\\Users\\jerdna\\Documents\\Thesis\\Datasets\\genre\\genre_music.csv\\genre_music.csv"
df_all = pd.read_csv(data_path)
df_all = df_all[df_all['tempo'] != 0]

features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_s', 'time_signature', 'chorus_hit', 'sections'
]
target_col = 'popularity' 
keep_columns = features + [target_col, 'genre']

df_all['duration_ms'] = df_all['duration_s'] * 1000
features.remove('duration_s')
features.append('duration_ms') 
keep_columns.remove('duration_s')
keep_columns.append('duration_ms')

decade_dfs = {}
for decade in df_all['decade'].unique():
    decade_name = f"{decade}s" if isinstance(decade, int) else decade
    
    decade_dfs[decade_name] = df_all[df_all['decade'] == decade][keep_columns].copy()

print("Loaded decades:", sorted(decade_dfs.keys()))
for decade, df in decade_dfs.items():
    print(f"\n{decade}:")
    print(f"• Tracks: {len(df)}")
    print(f"• Features: {len(features)}")
    print(f"• Target range: {df[target_col].min()}-{df[target_col].max()}")
    print("• Genre distribution:")
    print(df['genre'].value_counts().head())

all_dfs = list(decade_dfs.values())
df_combined = pd.concat(all_dfs, ignore_index=True)
df_combined = df_combined[features + [target_col]].dropna()  
X_all = df_combined[features].values
y_all = df_combined[target_col].values

print("\nOverall combined dataset shape:", df_combined.shape)
print("Feature matrix shape:", X_all.shape)
print("Target range:", y_all.min(), "-", y_all.max())

Loaded decades: ['00s', '10s', '60s', '70s', '80s', '90s']

60s:
• Tracks: 8641
• Features: 15
• Target range: 0-1
• Genre distribution:
pop     5425
rock    1860
r&b      598
rap      275
edm      256
Name: genre, dtype: int64

70s:
• Tracks: 7764
• Features: 15
• Target range: 0-1
• Genre distribution:
pop      3712
r&b      1946
rock     1278
latin     332
edm       254
Name: genre, dtype: int64

80s:
• Tracks: 6907
• Features: 15
• Target range: 0-1
• Genre distribution:
r&b      2910
pop      2174
rock      933
latin     488
edm       239
Name: genre, dtype: int64

90s:
• Tracks: 5519
• Features: 15
• Target range: 0-1
• Genre distribution:
r&b      2262
pop      1444
rock      818
latin     384
rap       366
Name: genre, dtype: int64

00s:
• Tracks: 5871
• Features: 15
• Target range: 0-1
• Genre distribution:
r&b      1921
pop      1372
rock     1014
latin     734
rap       585
Name: genre, dtype: int64

10s:
• Tracks: 6396
• Features: 15
• Target range: 0-1
• Genre distribution

In [5]:
output_dir = "C:\\Users\\jerdna\\Documents\\Thesis\\Datasets\\decades\\"
os.makedirs(output_dir, exist_ok=True)
for decade, df in decade_dfs.items():
    clean_name = decade.replace("s", "")  
    df.to_csv(f"{output_dir}dataset-of-{clean_name}.csv", index=False)
df_combined.to_csv(f"{output_dir}dataset-of-all.csv", index=False)


In [6]:
plt.style.use('seaborn-whitegrid')
sns.set_palette("husl")
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['axes.titlesize'] = 16

output_dir = "C:\\Users\\jerdna\\Documents\\Thesis\\Figures\\Genre_Distributions\\"
os.makedirs(output_dir, exist_ok=True)

for decade, df in decade_dfs.items():
    plt.figure(figsize=(10, 6))
    genre_counts = df['genre'].value_counts().sort_values(ascending=True)
    
    ax = genre_counts.plot(kind='barh', 
                         color=sns.color_palette("viridis", len(genre_counts)))
    
    for i, v in enumerate(genre_counts):
        ax.text(v + 3, i, f"{v:,}", color='black', va='center')
    
    plt.title(f'Genre Distribution: {decade}', pad=20)
    plt.xlabel('Number of Tracks', labelpad=10)
    plt.ylabel('Genre', labelpad=10)
    plt.tight_layout()
    
    filename = f"{output_dir}genre_distribution_{decade}.png"
    plt.savefig(filename, dpi=300, bbox_inches='tight', format='png')
    plt.close()
    print(f"Saved: {filename}")

df_combined_with_genre = pd.concat(decade_dfs.values(), ignore_index=True)

plt.figure(figsize=(12, 8))
overall_genres = df_combined_with_genre['genre'].value_counts().sort_values(ascending=False)

ax = overall_genres.plot(kind='bar', 
                        color=sns.color_palette("rocket", len(overall_genres)))

for p in ax.patches:
    ax.annotate(f"{p.get_height():,}", 
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 10),
                textcoords='offset points')

plt.title('Overall Genre Distribution', pad=20)
plt.ylabel('Number of Tracks', labelpad=10)
plt.xlabel('Genre', labelpad=10)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

filename = f"{output_dir}genre_distribution_overall.png"
plt.savefig(filename, dpi=300, bbox_inches='tight', format='png')
plt.close()
print(f"Saved: {filename}")

plt.figure(figsize=(14, 8))

genre_trends = pd.DataFrame()
for decade, df in decade_dfs.items():
    genre_trends[decade] = df['genre'].value_counts(normalize=True) * 100

genre_trends = genre_trends.fillna(0)
decade_order = ['60s', '70s', '80s', '90s', '00s', '10s']
genre_trends = genre_trends[decade_order]  

top_genres = df_combined_with_genre['genre'].value_counts().nlargest(10).index
ax = genre_trends.loc[top_genres].T.plot(kind='area', 
                                       alpha=0.8,
                                       stacked=True,
                                       colormap='Spectral')

plt.title('Genre Popularity Trends by Decade', pad=20)
plt.ylabel('Percentage of Tracks (%)', labelpad=10)
plt.xlabel('Decade', labelpad=10)

ax.set_xticks(range(len(decade_order)))
ax.set_xticklabels(decade_order)

plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Genre')
plt.grid(True, alpha=0.3)
plt.tight_layout()

filename = f"{output_dir}genre_trends_temporal.png"
plt.savefig(filename, dpi=300, bbox_inches='tight', format='png')
plt.close()
print(f"Saved: {filename}")

print("\nAll figures saved in:", output_dir)

  plt.style.use('seaborn-whitegrid')


Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Distributions\genre_distribution_60s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Distributions\genre_distribution_70s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Distributions\genre_distribution_80s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Distributions\genre_distribution_90s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Distributions\genre_distribution_00s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Distributions\genre_distribution_10s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Distributions\genre_distribution_overall.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Distributions\genre_trends_temporal.png

All figures saved in: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Distributions\


<Figure size 1400x800 with 0 Axes>

In [7]:
plt.style.use('seaborn-whitegrid')
sns.set_palette("husl")
plt.rcParams.update({
    'font.family': 'serif',
    'font.size': 12,
    'axes.labelsize': 14,
    'axes.titlesize': 16
})

balance_dir = "C:\\Users\\jerdna\\Documents\\Thesis\\Figures\\Popularity_Balance\\"
os.makedirs(balance_dir, exist_ok=True)

for decade, df in decade_dfs.items():
    plt.figure(figsize=(10, 6))
    
    ax = sns.countplot(x='popularity', data=df, 
                      color='skyblue', edgecolor='black')
    
    total = len(df)
    for p in ax.patches:
        height = p.get_height()
        ax.text(p.get_x() + p.get_width()/2., height + 0.01*total,
                f'{height/total:.1%}',
                ha='center', va='bottom')
    
    ax.set_title(f'Popularity Distribution: {decade}\n(Total Tracks: {total:,})')
    ax.set_xlabel('Popularity (0 = Low, 1 = High)')
    ax.set_ylabel('Count')
    ax.set_xticklabels(['Low (0)', 'High (1)'])
    
    plt.tight_layout()
    filename = f"{balance_dir}popularity_binary_{decade}.png"
    plt.savefig(filename, dpi=300, bbox_inches='tight', format='png')
    plt.close()
    print(f"Saved: {filename}")

plt.figure(figsize=(14, 8))

decade_list = []
popularity_props = []
for decade in ['60s', '70s', '80s', '90s', '00s', '10s']:
    if decade in decade_dfs:
        df = decade_dfs[decade]
        total = len(df)
        popular = df['popularity'].sum()
        decade_list.append(decade)
        popularity_props.append(popular/total)

ax = sns.barplot(x=decade_list, y=popularity_props, 
                order=['60s', '70s', '80s', '90s', '00s', '10s'],
                color='skyblue', edgecolor='black')

for i, prop in enumerate(popularity_props):
    ax.text(i, prop + 0.01, f'{prop:.1%}',
           ha='center', va='bottom', fontweight='bold')

plt.title('Proportion of Popular Tracks (Popularity=1) by Decade')
plt.xlabel('Decade')
plt.ylabel('Proportion of Popular Tracks')
plt.ylim(0, 1)
plt.tight_layout()

filename = f"{balance_dir}popularity_proportions.png"
plt.savefig(filename, dpi=300, bbox_inches='tight', format='png')
plt.close()
print(f"Saved: {filename}")

balance_metrics = pd.DataFrame()
for decade, df in decade_dfs.items():
    balance_metrics.loc[decade, 'Total Tracks'] = len(df)
    balance_metrics.loc[decade, 'Popular Tracks'] = df['popularity'].sum()
    balance_metrics.loc[decade, 'Proportion Popular'] = df['popularity'].mean()
    balance_metrics.loc[decade, 'Std Dev'] = df['popularity'].std()

balance_metrics['Proportion Popular'] = balance_metrics['Proportion Popular'].apply(lambda x: f"{x:.1%}")
balance_metrics['Std Dev'] = balance_metrics['Std Dev'].apply(lambda x: f"{x:.3f}")

balance_metrics.to_csv(f"{balance_dir}popularity_binary_metrics.csv")
print(f"\nSaved binary popularity metrics table to: {balance_dir}popularity_binary_metrics.csv")

print("\nAll binary popularity figures saved in:", balance_dir)

  plt.style.use('seaborn-whitegrid')


Saved: C:\Users\jerdna\Documents\Thesis\Figures\Popularity_Balance\popularity_binary_60s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Popularity_Balance\popularity_binary_70s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Popularity_Balance\popularity_binary_80s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Popularity_Balance\popularity_binary_90s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Popularity_Balance\popularity_binary_00s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Popularity_Balance\popularity_binary_10s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Popularity_Balance\popularity_proportions.png

Saved binary popularity metrics table to: C:\Users\jerdna\Documents\Thesis\Figures\Popularity_Balance\popularity_binary_metrics.csv

All binary popularity figures saved in: C:\Users\jerdna\Documents\Thesis\Figures\Popularity_Balance\


In [8]:
plt.style.use('seaborn-whitegrid')
sns.set_palette("husl")
plt.rcParams.update({
    'font.family': 'serif',
    'font.size': 12,
    'axes.labelsize': 14,
    'axes.titlesize': 16
})

genre_balance_dir = "C:\\Users\\jerdna\\Documents\\Thesis\\Figures\\Genre_Popularity_Balance\\"
os.makedirs(genre_balance_dir, exist_ok=True)

def prepare_genre_plots_binary(df, decade_name="Overall", top_genres=None):
    if top_genres is None:
        top_genres = df['genre'].value_counts().nlargest(15).index

    genre_rates = df[df['genre'].isin(top_genres)].groupby('genre')['popularity'].mean().sort_values()

    plt.figure(figsize=(14, 8))

    ax = genre_rates.plot(kind='barh', color='skyblue', edgecolor='black')

    for i, v in enumerate(genre_rates):
        ax.text(v + 0.02, i - 0.15, f"{v:.1%}", color='black', fontsize=10)

    plt.title(f'Genre Hit Rates: {decade_name}\n(Showing Top {len(top_genres)} Genres)')
    plt.xlabel('Percentage of Hit Songs (Popularity = 1)')
    plt.ylabel('Genre')
    plt.xlim(0, 1)
    plt.grid(True, axis='x', alpha=0.3)
    plt.tight_layout()

    filename = f"{genre_balance_dir}genre_hit_rates_{decade_name.lower()}.png"
    plt.savefig(filename, dpi=300, bbox_inches='tight', format='png')
    plt.close()
    print(f"Saved: {filename}")

df_combined_with_genre = pd.concat([df for df in decade_dfs.values()], ignore_index=True)

top_overall_genres = df_combined_with_genre['genre'].value_counts().nlargest(15).index.tolist()

for decade in ['60s', '70s', '80s', '90s', '00s', '10s']:
    if decade in decade_dfs:
        prepare_genre_plots_binary(decade_dfs[decade], decade_name=decade, top_genres=top_overall_genres)

prepare_genre_plots_binary(df_combined_with_genre, decade_name="Overall", top_genres=top_overall_genres)

genre_stats = pd.DataFrame()

for decade in ['60s', '70s', '80s', '90s', '00s', '10s']:
    if decade in decade_dfs:
        stats = decade_dfs[decade].groupby('genre').agg(
            hit_rate=('popularity', 'mean'),
            total_songs=('popularity', 'count'),
            hits=('popularity', 'sum')
        ).reset_index()
        stats['decade'] = decade
        genre_stats = pd.concat([genre_stats, stats])

overall_stats = df_combined_with_genre.groupby('genre').agg(
    hit_rate=('popularity', 'mean'),
    total_songs=('popularity', 'count'),
    hits=('popularity', 'sum')
).reset_index()
overall_stats['decade'] = 'Overall'
genre_stats = pd.concat([genre_stats, overall_stats])

pivot_stats = genre_stats.pivot_table(
    index='genre',
    columns='decade',
    values=['hit_rate', 'total_songs', 'hits'],
    aggfunc='first'
)

pivot_stats.to_csv(f"{genre_balance_dir}genre_hit_rate_stats.csv")
pivot_stats.to_excel(f"{genre_balance_dir}genre_hit_rate_stats.xlsx")

print(f"\nSaved statistics to:\n{genre_balance_dir}genre_hit_rate_stats.[csv/xlsx]")
print("\nAll genre hit rate figures saved in:", genre_balance_dir)


  plt.style.use('seaborn-whitegrid')


Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\genre_hit_rates_60s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\genre_hit_rates_70s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\genre_hit_rates_80s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\genre_hit_rates_90s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\genre_hit_rates_00s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\genre_hit_rates_10s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\genre_hit_rates_overall.png

Saved statistics to:
C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\genre_hit_rate_stats.[csv/xlsx]

All genre hit rate figures saved in: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\


In [9]:
def plot_feature_by_genre(df, decade_name, feature, top_genres):
    feature_dir = os.path.join(genre_balance_dir, f"{feature}_by_genre")
    os.makedirs(feature_dir, exist_ok=True)

    df_filtered = df[df['genre'].isin(top_genres)]

    feature_stats = df_filtered.groupby('genre')[feature].mean().sort_values()

    plt.figure(figsize=(14, 8))
    ax = feature_stats.plot(kind='barh', color='mediumseagreen', edgecolor='black')

    for i, v in enumerate(feature_stats):
        ax.text(v + 1, i - 0.15, f"{v:.1f}", color='black', fontsize=10)

    plt.title(f"{feature.replace('_', ' ').title()} by Genre: {decade_name}")
    plt.xlabel(f"Mean {feature.replace('_', ' ').title()}")
    plt.ylabel("Genre")
    plt.grid(True, axis='x', alpha=0.3)
    plt.tight_layout()

    filename = f"{feature_dir}/{feature}_{decade_name.lower()}.png"
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved: {filename}")

for feature in ['tempo', 'duration_ms']:
    for decade in ['60s', '70s', '80s', '90s', '00s', '10s']:
        if decade in decade_dfs:
            plot_feature_by_genre(decade_dfs[decade], decade_name=decade, feature=feature, top_genres=top_overall_genres)

    plot_feature_by_genre(df_combined_with_genre, decade_name='Overall', feature=feature, top_genres=top_overall_genres)


Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\tempo_by_genre/tempo_60s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\tempo_by_genre/tempo_70s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\tempo_by_genre/tempo_80s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\tempo_by_genre/tempo_90s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\tempo_by_genre/tempo_00s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\tempo_by_genre/tempo_10s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\tempo_by_genre/tempo_overall.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\duration_ms_by_genre/duration_ms_60s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_Popularity_Balance\duration_ms_by_genre/duration_ms_70s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Genre_

In [10]:
tempo_dur_dir = "C:\\Users\\jerdna\\Documents\\Thesis\\Figures\\Box_Plots\\"
os.makedirs(tempo_dur_dir, exist_ok=True)
def plot_feature_boxplot(df, feature, decade_name):
    plt.figure(figsize=(8, 5))
    sns.boxplot(y=df[feature], color="skyblue")

    plt.title(f"{feature.replace('_', ' ').title()} Boxplot - {decade_name}")
    plt.ylabel(feature.replace('_', ' ').title())
    plt.xlabel("") 
    plt.tight_layout()

    filename = os.path.join(tempo_dur_dir, f"{feature}_boxplot_{decade_name.lower()}.png")
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved: {filename}")
def plot_feature_distribution(df, feature, decade_name):
    plt.figure(figsize=(10, 6))
    sns.histplot(df[feature], kde=True, bins=50, color="steelblue", edgecolor="black")

    plt.title(f"{feature.replace('_', ' ').title()} Distribution - {decade_name}")
    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel("Count")
    plt.tight_layout()

    filename = os.path.join(tempo_dur_dir, f"{feature}_distribution_{decade_name.lower()}.png")
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved: {filename}")

for feature in ['tempo', 'duration_ms','mode', 'time_signature', 'key','valence', 'speechiness', 'liveness', 'instrumentalness', 'acousticness', 'danceability', 'energy']:
    for decade in ['60s', '70s', '80s', '90s', '00s', '10s']:
        if decade in decade_dfs:
            df = decade_dfs[decade]
            if feature in df.columns:
#                 plot_feature_distribution(df, feature, decade)
                plot_feature_boxplot(df, feature, decade)


df_all = pd.concat([df for df in decade_dfs.values()], ignore_index=True)
for feature in ['tempo', 'duration_ms','mode', 'time_signature', 'key','valence', 'speechiness', 'liveness', 'instrumentalness', 'acousticness', 'danceability', 'energy']:
    if feature in df_all.columns:
#         plot_feature_distribution(df_all, feature, "Overall")
        plot_feature_boxplot(df_all, feature, "Overall")


Saved: C:\Users\jerdna\Documents\Thesis\Figures\Box_Plots\tempo_boxplot_60s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Box_Plots\tempo_boxplot_70s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Box_Plots\tempo_boxplot_80s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Box_Plots\tempo_boxplot_90s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Box_Plots\tempo_boxplot_00s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Box_Plots\tempo_boxplot_10s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Box_Plots\duration_ms_boxplot_60s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Box_Plots\duration_ms_boxplot_70s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Box_Plots\duration_ms_boxplot_80s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Box_Plots\duration_ms_boxplot_90s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Box_Plots\duration_ms_boxplot_00s.png
Saved: C:\Users\jerdna\Documents\Thesis\Figures\Box_Plots\duration_ms_boxplot_10s.png
Save

In [11]:
df = pd.read_csv(r"C:\Users\jerdna\Documents\Thesis\Datasets\spotify_data_mil.csv")

tempo_bounds = (df['tempo'].min(), df['tempo'].max())
duration_bounds = (df['duration_ms'].min(), df['duration_ms'].max())

print(f"Tempo bounds (BPM): {tempo_bounds}")
print(f"Duration bounds (ms): {duration_bounds}")


tempo_bounds_ = (df_combined['tempo'].min(), df_combined['tempo'].max())
duration_bounds_ = (df_combined['duration_ms'].min(), df_combined['duration_ms'].max())

print(f"Tempo bounds (BPM): {tempo_bounds_}")
print(f"Duration bounds (ms): {duration_bounds_}")

Tempo bounds (BPM): (0.0, 249.993)
Duration bounds (ms): (2073, 6000495)
Tempo bounds (BPM): (31.988, 241.423)
Duration bounds (ms): (15168.0, 4170227.0)


In [12]:
tempo_sorted = df_combined[df_combined['tempo'] > 0]['tempo'].sort_values()

print(tempo_sorted.head(20))

3429     31.988
3465     32.435
5479     34.333
239      34.496
24520    34.535
13608    35.732
2414     36.520
10837    37.114
22755    39.002
35488    39.369
4215     39.823
36197    40.645
1798     45.053
2644     45.363
11140    46.074
23923    46.185
982      46.315
21880    46.496
12525    46.716
11693    46.744
Name: tempo, dtype: float64


In [13]:
print(f"Unique values in 'key':", df_combined['key'].unique())
print(f"Data type of 'key':", df_combined['key'].dtype)
print(list(range(12)))

Unique values in 'key': [ 3  5  7 11  0  2 10  9  4  6  1  8]
Data type of 'key': int64
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


In [14]:
manual_categories = [
    [0, 1],                      # mode: binary
    [0, 1, 2, 3, 4, 5, 6, 7],    # time_signature: 0–7
    list(range(12))             # key: 0–11
]
nominal_features = ['mode', 'time_signature', 'key']
zero_one_features = ['valence', 'speechiness', 'liveness', 'instrumentalness', 'acousticness', 'danceability', 'energy']
large_scale_features = ['duration_ms', 'tempo']

def custom_minmax_transform(X):
    tempo_min, tempo_max = 30.0, 250.0
    duration_min, duration_max = 2073, 6000495

    X = np.asarray(X)  
    X_scaled = X.copy()

    X_scaled[:, 0] = (X_scaled[:, 0] - duration_min) / (duration_max - duration_min)
    X_scaled[:, 1] = (X_scaled[:, 1] - tempo_min) / (tempo_max - tempo_min)

    return np.clip(X_scaled, 0, 1)


from sklearn.preprocessing import FunctionTransformer
custom_scaler = FunctionTransformer(custom_minmax_transform, feature_names_out='one-to-one')

main_preprocessor = ColumnTransformer([
    ('scale', custom_scaler, large_scale_features),
    ('passthrough', 'passthrough', zero_one_features),
    ('nominal', OneHotEncoder(categories=manual_categories, sparse_output=False), nominal_features)
])

def preprocess_and_save(df, name, preprocessor, output_dir, features, target_col):
    os.makedirs(output_dir, exist_ok=True)
    X = df[features]
    y = df[target_col]
    X_transformed = preprocessor.fit_transform(X)
    joblib.dump((X_transformed, y), os.path.join(output_dir, f"{name}.pkl"))
    print(f"Saved: {name}.pkl")

preprocessors = {
    "main": main_preprocessor,
}
preproc_output_dir = "C:\\Users\\jerdna\\Documents\\Thesis\\Datasets\\Preprocessed"

for decade, df in decade_dfs.items():
    for key, preproc in preprocessors.items():
        preprocess_and_save(df, f"{key}_{decade}", preproc, preproc_output_dir, features, target_col)

for key, preproc in preprocessors.items():
    preprocess_and_save(df_combined, f"{key}_all", preproc, preproc_output_dir, features, target_col)

Saved: main_60s.pkl
Saved: main_70s.pkl
Saved: main_80s.pkl
Saved: main_90s.pkl
Saved: main_00s.pkl
Saved: main_10s.pkl
Saved: main_all.pkl


In [15]:
preproc_output_dir = "C:\\Users\\jerdna\\Documents\\Thesis\\Datasets\\Preprocessed"

for filename in sorted(os.listdir(preproc_output_dir)):
    if filename.endswith(".pkl"):
        path = os.path.join(preproc_output_dir, filename)
        try:
            X, y = joblib.load(path)

            if isinstance(X, np.ndarray):
                X = pd.DataFrame(X)

            print(f"\n📂 File: {filename}")
            print(f"   🔢 Shape: {X.shape}")
            print(f"   🏷️  Columns: {list(X.columns)}")

            if X.isnull().values.any():
                print("   ⚠️  NaNs found in the following columns:")
                print(X.isnull().sum()[X.isnull().sum() > 0])
            else:
                print("   ✅ No NaN values found.")
        
        except Exception as e:
            print(f"❌ Error loading {filename}: {e}")



📂 File: boost_00s.pkl
   🔢 Shape: (5871, 31)
   🏷️  Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
   ✅ No NaN values found.

📂 File: boost_10s.pkl
   🔢 Shape: (6396, 31)
   🏷️  Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
   ✅ No NaN values found.

📂 File: boost_60s.pkl
   🔢 Shape: (8642, 31)
   🏷️  Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
   ✅ No NaN values found.

📂 File: boost_70s.pkl
   🔢 Shape: (7764, 31)
   🏷️  Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
   ✅ No NaN values found.

📂 File: boost_80s.pkl
   🔢 Shape: (6907, 31)
   🏷️  Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
   ✅ No NaN

   ✅ No NaN values found.


In [16]:
from sklearn.linear_model import LogisticRegression

common_boosting_params = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.1, 0.05],
    'model__max_depth': [3, 5]
}

model_grids = {
    
    "LogisticRegression": (
        Pipeline([('model', LogisticRegression(
            solver='liblinear',
            max_iter=1000,
            random_state=42
        ))]),
        {
            'model__C': [0.1, 1, 10],
            'model__penalty': ['l1', 'l2']
        }
    ),
    # Naive Bayes
    "GaussianNB": (
        Pipeline([('model', GaussianNB())]),
        {}
    ),

    # Linear models
    "SGD": (
        Pipeline([('model', SGDClassifier(max_iter=1000, tol=1e-3, random_state=42))]),
        {
            'model__alpha': [1e-4, 1e-3, 1e-2],
            'model__penalty': ['l2', 'l1']
        }
    ),
    "LinearSVC": (
        Pipeline([('model', LinearSVC(max_iter=5000, random_state=42))]),
        {
            'model__C': [0.1, 1, 10]
        }
    ),
    "SVC": (
        Pipeline([('model', SVC(kernel='rbf', probability=True, random_state=42, cache_size=1000))]),
        {
            'model__C': [0.1, 1],
            'model__gamma': ['scale']
        }
    ),

    # Simple trees and bagging
    "DecisionTree": (
        Pipeline([('model', DecisionTreeClassifier(random_state=42))]),
        {
            'model__max_depth': [None, 10, 20],
            'model__min_samples_split': [2, 5]
        }
    ),
    "RandomForest": (
        Pipeline([('model', RandomForestClassifier(random_state=42))]),
        {
            'model__n_estimators': [100, 200],
            'model__max_depth': [None, 10, 20],
            'model__min_samples_split': [2, 5]
        }
    ),
    "ExtraTrees": (
        Pipeline([('model', ExtraTreesClassifier(random_state=42))]),
        {
            'model__n_estimators': [100, 200],
            'model__max_depth': [None, 10, 20],
            'model__min_samples_split': [2, 5]
        }
    ),

    # Boosting models
    "AdaBoost": (
        Pipeline([('model', AdaBoostClassifier(random_state=42))]),
        {
            'model__n_estimators': [100, 200],
            'model__learning_rate': [1.0, 0.5]
        }
    ),
    "GradientBoosting": (
        Pipeline([('model', GradientBoostingClassifier(random_state=42))]),
        common_boosting_params
    ),
    "XGBoost": (
        Pipeline([('model', XGBClassifier(
            random_state=42,
            n_jobs=-1,
            eval_metric='logloss'
        ))]),
        common_boosting_params
    ),
    "HistGradientBoosting": (
    Pipeline([
        ('model', HistGradientBoostingClassifier(random_state=42))
    ]),
    {
        'model__learning_rate': [0.1, 0.05],
        'model__max_iter': [100, 200],      
        'model__max_depth': [None, 10]
    }
)
,
    "LightGBM": (
        Pipeline([('model', LGBMClassifier(
            random_state=42,
            verbose=-1,
            n_jobs=-1
        ))]),
        {
            **common_boosting_params,
            'model__num_leaves': [31, 63],
            'model__min_child_samples': [20, 40]
        }
    ),
    "CatBoost": (
        Pipeline([('model', CatBoostClassifier(
            cat_features=nominal_features,
            random_seed=42,
            verbose=0
        ))]),
        {
            'model__iterations': [100, 200],
            'model__learning_rate': common_boosting_params['model__learning_rate'],
            'model__depth': common_boosting_params['model__max_depth']
        }
    )
}

In [17]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6396 entries, 34703 to 41098
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      6396 non-null   float64
 1   energy            6396 non-null   float64
 2   key               6396 non-null   int64  
 3   loudness          6396 non-null   float64
 4   mode              6396 non-null   int64  
 5   speechiness       6396 non-null   float64
 6   acousticness      6396 non-null   float64
 7   instrumentalness  6396 non-null   float64
 8   liveness          6396 non-null   float64
 9   valence           6396 non-null   float64
 10  tempo             6396 non-null   float64
 11  time_signature    6396 non-null   int64  
 12  chorus_hit        6396 non-null   float64
 13  sections          6396 non-null   int64  
 14  popularity        6396 non-null   int64  
 15  genre             6396 non-null   object 
 16  duration_ms       6396 non-null   flo

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,chorus_hit,sections,popularity,genre,duration_ms
34703,0.741,0.626,1,-4.826,0,0.0886,0.02,0.0,0.0828,0.706,108.029,4,41.18681,10,1,pop,188493.0
34704,0.447,0.247,5,-14.661,0,0.0346,0.871,0.814,0.0946,0.25,155.489,3,33.18083,9,0,pop,176880.0
34705,0.55,0.415,9,-6.557,0,0.052,0.161,0.0,0.108,0.274,172.065,4,44.89147,9,1,edm,205463.0
34706,0.502,0.648,0,-5.698,0,0.0527,0.00513,0.0,0.204,0.291,91.837,4,29.52521,7,0,pop,193043.0
34707,0.807,0.887,1,-3.892,1,0.275,0.00381,0.0,0.391,0.78,160.517,4,24.99199,8,1,rap,144244.0


Cell 4: Storage & Helper Functions

In [18]:
results = {}
cv_results = {}

def tune_and_evaluate_split_from_file(model_name, model, param_grid, data_dir, decade, cv_splits=5):

    filename = "main_" + decade if model_name != "CatBoost" else "raw_" + decade
    data_path = f"{data_dir}\\{filename}.pkl"
    X, y = joblib.load(data_path)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )

    skf = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)
    gs = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=skf,
        scoring="f1",
        n_jobs=-1,
        verbose=0,
        error_score='raise'
    )

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        gs.fit(X_train, y_train)

    best_model = gs.best_estimator_
    y_pred = best_model.predict(X_test)

    try:
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
    except ValueError:
        acc = 0.0
        f1 = 0.0

    return gs.best_params_, acc, f1


def tune_full_cv_from_file(model_name, model, param_grid, data_dir, decade, cv_splits=5):

    filename = "main_" + decade if model_name != "CatBoost" else "raw_" + decade
    data_path = f"{data_dir}\\{filename}.pkl"
    X, y = joblib.load(data_path)

    skf = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)
    gs = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=skf,
        scoring="f1",
        n_jobs=-1,
        verbose=0
    )

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        gs.fit(X, y)

    return gs.best_params_, gs.best_score_


def tune_and_evaluate_split(X_tr, y_tr, X_te, y_te,
                            model_pipeline, param_grid,
                            cv_splits=5, return_model=False):
    gs = GridSearchCV(
        model_pipeline, param_grid,
        cv=cv_splits, scoring="f1", n_jobs=-1, error_score="raise"
    )
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        gs.fit(X_tr, y_tr)

    best_model = gs.best_estimator_
    y_pred = best_model.predict(X_te)
    try:
        y_proba = best_model.predict_proba(X_te)[:, 1]
    except Exception:
        y_proba = None

    acc = accuracy_score(y_te, y_pred)
    f1 = f1_score(y_te, y_pred)

    if return_model:
        return gs.best_params_, acc, f1, best_model, y_pred, y_proba
    return gs.best_params_, acc, f1
from sklearn.model_selection import StratifiedKFold
def tune_full_cv(X, y, model_pipeline, param_grid,
                 cv_splits=5, return_preds=False, return_model=False):
    y = np.array(y)
    gs = GridSearchCV(
        model_pipeline, param_grid,
        cv=cv_splits, scoring="f1", n_jobs=-1
    )
    gs.fit(X, y)
    best_model = gs.best_estimator_

    skf = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)
    f1s, accs = [], []
    y_all, y_pred_all, y_proba_all = [], [], []

    for tr_idx, te_idx in skf.split(X, y):
        X_tr, X_te = (X.iloc[tr_idx], X.iloc[te_idx]) if isinstance(X, pd.DataFrame) else (X[tr_idx], X[te_idx])
        y_tr, y_te = y[tr_idx], y[te_idx]

        best_model.fit(X_tr, y_tr)
        y_pred = best_model.predict(X_te)

        try:
            y_proba = best_model.predict_proba(X_te)[:, 1]
        except Exception:
            y_proba = None

        f1s.append(f1_score(y_te, y_pred))
        accs.append(accuracy_score(y_te, y_pred))

        if return_preds:
            y_all.extend(y_te)
            y_pred_all.extend(y_pred)
            y_proba_all.extend(y_proba if y_proba is not None else [0] * len(y_te))

    result = [
        gs.best_params_,
        np.mean(f1s),
        np.mean(accs)
    ]

    if return_preds:
        result.extend([
            np.array(y_all),
            np.array(y_pred_all),
            np.array(y_proba_all) if y_proba_all else None
        ])
    
    if return_model:
        result.append(best_model)

    return tuple(result)




In [19]:

def save_evaluation_outputs(model_name, decade_name, variant_label, eval_type, y_true, y_pred, y_proba):
    """
    Save classification report and ROC curve for a given evaluation.
    """
    base_dir = f"outputs/{model_name}/{decade_name}/{variant_label}/{eval_type}"
    os.makedirs(base_dir, exist_ok=True)

    report_path = os.path.join(base_dir, "classification_report.txt")
    with open(report_path, "w") as f:
        f.write(classification_report(y_true, y_pred))

    if y_proba is not None:
        fpr, tpr, _ = roc_curve(y_true, y_proba)
        roc_auc = auc(fpr, tpr)
        plt.figure()
        RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, name=model_name).plot()
        plt.title(f"ROC Curve ({eval_type})")
        plt.savefig(os.path.join(base_dir, "roc_curve.png"))
        plt.close()

In [20]:
import os
import json
import joblib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    classification_report,
    roc_curve,
    auc,
    RocCurveDisplay,
    precision_score,
    recall_score,
    accuracy_score,
    f1_score
)

def save_full_checkpoint(
    model_name: str,
    decade_name: str,
    variant_label: str,
    eval_type: str,
    model,
    y_true,
    y_pred,
    y_proba,
    best_params: dict
):
    """
    Save model, params, metrics, ROC curve, and classification report for a given evaluation.
    """
    base_dir = f"outputs/{model_name}/{decade_name}/{variant_label}/{eval_type}"
    os.makedirs(base_dir, exist_ok=True)

    # 1. Save model
    model_path = os.path.join(base_dir, "model.joblib")
    joblib.dump(model, model_path)

    # 2. Save best params
    with open(os.path.join(base_dir, "best_params.json"), "w") as f:
        json.dump(best_params, f, indent=4)

    # 3. Calculate metrics
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred)
    }

    # 4. Save metrics
    with open(os.path.join(base_dir, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=4)

    # 5. Save classification report
    report = classification_report(y_true, y_pred)
    with open(os.path.join(base_dir, "classification_report.txt"), "w") as f:
        f.write(report)

    # 6. Save ROC curve (if available)
    if y_proba is not None and len(np.unique(y_true)) == 2:
        fpr, tpr, _ = roc_curve(y_true, y_proba)
        roc_auc = auc(fpr, tpr)
        plt.figure()
        RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, name=model_name).plot()
        plt.title(f"ROC Curve ({eval_type})")
        plt.savefig(os.path.join(base_dir, "roc_curve.png"))
        plt.close()
def save_pickle_checkpoint(path, data):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        joblib.dump(data, f)

def load_checkpoint(path):
    with open(path, 'rb') as f:
        return joblib.load(f)


In [33]:
import os
import time
from datetime import datetime

results = {}
cv_results = {}

for model_name, (model_pipeline, param_grid) in model_grids.items():
    print(f"\n{'='*60}")
    print(f"🚀 MODEL: {model_name}")
    model_start = time.time()

    results[model_name] = {}
    cv_results[model_name] = {}

    # Handle both decades and overall
    for scope_name, df in list(decade_dfs.items()) + [("All", df_combined)]:
        print(f"\n📀 Scope: {scope_name}")
        scope_name = str(scope_name)  # 🔧 force string
        results[model_name][scope_name] = {}
        cv_results[model_name][scope_name] = {}


        # Load data
        if model_name == "CatBoost":
            df_proc = df[features + [target_col]].dropna()
            X_pre, y_pre = df_proc[features], df_proc[target_col].values
        else:
            dataset_name = "all" if scope_name == "All" else scope_name
            dataset_path = os.path.join(preproc_output_dir, f"main_{dataset_name}.pkl")
            X_pre, y_pre = joblib.load(dataset_path)

        df_raw = df[features + [target_col]].dropna()
        X_raw, y_raw = df_raw[features], df_raw[target_col].values

        for variant_label, X, y in [("preprocessed", X_pre, y_pre), ("raw", X_raw, y_raw)]:
            if model_name == "CatBoost" and variant_label == "preprocessed":
                continue

            print(f"\n🔍 Variant: {variant_label}")
            results[model_name][scope_name][variant_label] = {}
            cv_results[model_name][scope_name][variant_label] = {}

            # --- Train/Test Splits ---
            for split_label, test_size in [('80/20', 0.20), ('70/30', 0.30)]:
                print(f"🔧 Split: {split_label}")

                checkpoint_dir = f"checkpoints/{model_name}/{scope_name}/{variant_label}/{split_label}"
                checkpoint_path = f"{checkpoint_dir}/split_result.pkl"
                if os.path.exists(checkpoint_path):
                    print("🛑 Skipping (checkpoint exists)")
                    results[model_name][scope_name][variant_label][split_label] = load_checkpoint(checkpoint_path)
                    continue

                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=test_size, stratify=y, random_state=42
                )

                best_params, acc, f1, best_model, y_pred, y_proba = tune_and_evaluate_split(
                    X_train, y_train, X_test, y_test,
                    model_pipeline, param_grid, return_model=True
                )

                save_evaluation_outputs(model_name, scope_name, variant_label, split_label,
                                        y_test, y_pred, y_proba)

                result_dict = {
                    'best_params': best_params,
                    'test_accuracy': acc,
                    'test_f1': f1
                }
                save_full_checkpoint(
                    model_name, scope_name, variant_label, split_label,
                    best_model, y_test, y_pred, y_proba, best_params
                )
                save_pickle_checkpoint(checkpoint_path, result_dict)
                joblib.dump(best_model, os.path.join(checkpoint_dir, "best_model.pkl"))

                results[model_name][scope_name][variant_label][split_label] = result_dict
                print(f"✅ {split_label} | F1: {f1:.3f} | Accuracy: {acc:.3f}")

            # --- Cross Validation ---
            for cv_label, k in [('CV5', 5), ('CV10', 10)]:
                print(f"🔄 {cv_label} ...", end=" ", flush=True)
                # 🚫 Skip SVC on 'All' scope and CV10 for both variants
                if model_name == "SVC" and scope_name == "All" and cv_label == "CV10":
                    print("🛑 Skipping SVC for All in CV10 (both raw and preprocessed)")
                    continue
                checkpoint_dir = f"checkpoints/{model_name}/{scope_name}/{variant_label}/{cv_label}"
                checkpoint_path = f"{checkpoint_dir}/cv_result.pkl"
                if os.path.exists(checkpoint_path):
                    print("🛑 Skipping (checkpoint exists)")
                    cv_results[model_name][scope_name][variant_label][cv_label] = load_checkpoint(checkpoint_path)
                    continue

                best_params_cv, mean_cv_f1, acc_cv, y_true_cv, y_pred_cv, y_proba_cv = tune_full_cv(
                    X, y, model_pipeline, param_grid, cv_splits=k, return_preds=True
                )

                save_evaluation_outputs(model_name, scope_name, variant_label, cv_label,
                                        y_true_cv, y_pred_cv, y_proba_cv)

                result_dict = {
                    'best_params': best_params_cv,
                    'mean_cv_f1': mean_cv_f1,
                    'mean_cv_accuracy': acc_cv
                }
                save_full_checkpoint(
                    model_name, scope_name, variant_label, split_label,
                    best_model, y_test, y_pred, y_proba, best_params
                )
                save_pickle_checkpoint(checkpoint_path, result_dict)
                joblib.dump(model_pipeline, os.path.join(checkpoint_dir, "cv_model.pkl"))

                cv_results[model_name][scope_name][variant_label][cv_label] = result_dict
                print(f"✅ Mean CV F1: {mean_cv_f1:.3f} | Accuracy: {acc_cv:.3f}")

    print(f"✅ Finished MODEL: {model_name} in {time.time() - model_start:.1f} seconds")



🚀 MODEL: LogisticRegression

📀 Scope: 60s

🔍 Variant: preprocessed
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

🔍 Variant: raw
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

📀 Scope: 70s

🔍 Variant: preprocessed
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

🔍 Variant: raw
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

📀 Scope: 80s

🔍 Variant: preprocessed
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoi

🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

📀 Scope: 10s

🔍 Variant: preprocessed
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

🔍 Variant: raw
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

📀 Scope: All

🔍 Variant: preprocessed
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

🔍 Variant: raw
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)
✅ Finished MODEL: SGD in 0.1 seconds

🚀 MODEL: LinearSVC

📀 Scope: 60s

🔍 Variant: preprocessed
🔧 Split: 80

🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

🔍 Variant: raw
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

📀 Scope: 90s

🔍 Variant: preprocessed
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

🔍 Variant: raw
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

📀 Scope: 00s

🔍 Variant: preprocessed
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

🔍 Variant: raw
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)

🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

🔍 Variant: raw
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

📀 Scope: 70s

🔍 Variant: preprocessed
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

🔍 Variant: raw
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

📀 Scope: 80s

🔍 Variant: preprocessed
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

🔍 Variant: raw
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)

🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

🔍 Variant: raw
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

📀 Scope: All

🔍 Variant: preprocessed
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

🔍 Variant: raw
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)
✅ Finished MODEL: XGBoost in 0.1 seconds

🚀 MODEL: HistGradientBoosting

📀 Scope: 60s

🔍 Variant: preprocessed
🔧 Split: 80/20
🛑 Skipping (checkpoint exists)
🔧 Split: 70/30
🛑 Skipping (checkpoint exists)
🔄 CV5 ... 🛑 Skipping (checkpoint exists)
🔄 CV10 ... 🛑 Skipping (checkpoint exists)

🔍 Variant: raw
🔧 Split: 80/20
🛑 Sk

  _, ax = plt.subplots()


✅ Mean CV F1: 0.857 | Accuracy: 0.851

📀 Scope: 00s

🔍 Variant: raw
🔧 Split: 80/20


  plt.figure()


✅ 80/20 | F1: 0.874 | Accuracy: 0.868
🔧 Split: 70/30
✅ 70/30 | F1: 0.877 | Accuracy: 0.872
🔄 CV5 ... ✅ Mean CV F1: 0.864 | Accuracy: 0.859
🔄 CV10 ... ✅ Mean CV F1: 0.866 | Accuracy: 0.861

📀 Scope: 10s

🔍 Variant: raw
🔧 Split: 80/20
✅ 80/20 | F1: 0.859 | Accuracy: 0.852
🔧 Split: 70/30
✅ 70/30 | F1: 0.853 | Accuracy: 0.845
🔄 CV5 ... ✅ Mean CV F1: 0.851 | Accuracy: 0.843
🔄 CV10 ... ✅ Mean CV F1: 0.851 | Accuracy: 0.843

📀 Scope: All

🔍 Variant: raw
🔧 Split: 80/20
✅ 80/20 | F1: 0.797 | Accuracy: 0.786
🔧 Split: 70/30
✅ 70/30 | F1: 0.798 | Accuracy: 0.785
🔄 CV5 ... ✅ Mean CV F1: 0.799 | Accuracy: 0.786
🔄 CV10 ... ✅ Mean CV F1: 0.800 | Accuracy: 0.787
✅ Finished MODEL: CatBoost in 2460.5 seconds


<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>


def build_metric_df(results_dict, metric_key, index_labels, split_label, variant):
    rows = []
    for label in index_labels:
        row = {m: results_dict[m][label][variant][split_label][metric_key] for m in model_grids}
        rows.append(row)
    return pd.DataFrame(rows, index=index_labels, columns=model_grids.keys())

def build_cv_df(cv_dict, metric_key, index_labels, cv_label, variant):
    rows = []
    for label in index_labels:
        row = {m: cv_dict[m][label][variant][cv_label][metric_key] for m in model_grids}
        rows.append(row)
    return pd.DataFrame(rows, index=index_labels, columns=model_grids.keys())
def build_best_param_df(results_dict, param_key, index_labels, split_label):
    records = []
    variant_labels = ["raw", "preprocessed"]

    for variant in variant_labels:
        for model in model_grids:
            if variant == "preprocessed" and model == "CatBoost":
                continue
            row = {}
            for label in index_labels:
                try:
                    row[label] = results_dict[model][label][variant][split_label][param_key]
                except KeyError:
                    row[label] = None
            records.append(pd.Series(row, name=f"{model}_{variant}"))
    return pd.DataFrame(records)


decade_labels = list(decade_dfs.keys()) + ["All"]
for variant in ["raw", "preprocessed"]:
    print(f"\n=== Variant: {variant.upper()} ===")
    acc_80_20 = build_metric_df(results, 'test_accuracy', decade_labels, '80/20', variant)
    f1_80_20  = build_metric_df(results, 'test_f1', decade_labels, '80/20', variant)
    display(f1_80_20.idxmax(axis=1).to_frame(name="Best Model (F1)"))

acc_80_20 = build_metric_df(results, 'test_accuracy', decade_labels, '80/20')
f1_80_20  = build_metric_df(results, 'test_f1', decade_labels, '80/20')
acc_70_30 = build_metric_df(results, 'test_accuracy', decade_labels, '70/30')
f1_70_30  = build_metric_df(results, 'test_f1', decade_labels, '70/30')

print("=== Test Accuracy (80/20 Split) ===")
display(acc_80_20)
print("\n=== Test F1 (80/20 Split) ===")
display(f1_80_20)
print("\n=== Test Accuracy (70/30 Split) ===")
display(acc_70_30)
print("\n=== Test F1 (70/30 Split) ===")
display(f1_70_30)

print("\n=== Best Model per Decade (80/20 by F1) ===")
display(f1_80_20.idxmax(axis=1).to_frame(name="Best Model"))
print("\n=== Best Model per Decade (70/30 by F1) ===")
display(f1_70_30.idxmax(axis=1).to_frame(name="Best Model"))

cv5_df  = build_cv_df(cv_results, 'mean_cv_f1', decade_labels, 'CV5')
cv10_df = build_cv_df(cv_results, 'mean_cv_f1', decade_labels, 'CV10')

print("\n=== Mean CV F1 (5-Fold) ===")
display(cv5_df)
print("\n=== Mean CV F1 (10-Fold) ===")
display(cv10_df)

print("\n=== Best Model per Decade (5-Fold CV) ===")
display(cv5_df.idxmax(axis=1).to_frame(name="Best Model"))
print("\n=== Best Model per Decade (10-Fold CV) ===")
display(cv10_df.idxmax(axis=1).to_frame(name="Best Model"))

print("\n=== Best Hyperparameters (80/20) ===")
display(build_best_param_df(results, 'best_params', decade_labels, '80/20'))


print("\n=== Best Hyperparameters (70/30) ===")
display(build_best_param_df(results, 'best_params', decade_labels, '70/30'))

print("\n=== Best Hyperparameters (5-Fold CV) ===")
display(build_best_param_df(cv_results, 'best_params', decade_labels, 'CV5'))

print("\n=== Best Hyperparameters (10-Fold CV) ===")
display(build_best_param_df(results, 'best_params', decade_labels, 'CV10'))

print("Hyperparameter tuning summary done.")


In [36]:
variant_labels = ["raw", "preprocessed"]

best_params_per_decade = {}

for strategy in ['70/30', '80/20', 'CV5', 'CV10']:
    best_params_per_decade[strategy] = {}

    for scope_name in decade_dfs.keys():
        best_params_per_decade[strategy][scope_name] = {}

        for variant in variant_labels:
            if variant == "preprocessed" and "CatBoost" in model_grids:
                continue  # CatBoost only uses raw

            best_params_per_decade[strategy][scope_name][variant] = {}

            for model_name in model_grids:
                try:
                    if strategy in ['70/30', '80/20']:
                        src = results
                    else:
                        src = cv_results

                    best_params = src[model_name][scope_name][variant][strategy]['best_params']
                    best_params_per_decade[strategy][scope_name][variant][model_name] = best_params
                except KeyError:
                    print(f"⚠️  Missing: {model_name} | {scope_name} | {variant} | {strategy}")
                    best_params_per_decade[strategy][scope_name][variant][model_name] = None

best_params_overall = {}

for strategy in ['70/30', '80/20', 'CV5', 'CV10']:
    # Skip strategy if not in any model's cv_results['All']
    if strategy in ['CV5', 'CV10']:
        has_all = any('All' in cv_results.get(model, {}) and strategy in cv_results[model]['All'].get(variant, {}) 
                      for model in model_grids for variant in variant_labels)
        if not has_all:
            best_params_overall[strategy] = None
            continue

    best_params_overall[strategy] = {}

    for variant in variant_labels:
        if variant == "preprocessed" and "CatBoost" in model_grids:
            continue

        best_params_overall[strategy][variant] = {}

        for model_name in model_grids:
            # 🚫 Skip SVC on overall CV10
            if strategy == "CV10" and model_name == "SVC":
                print(f"🛑 Skipping SVC in CV10 overall for variant: {variant}")
                best_params_overall[strategy][variant][model_name] = None
                continue

            try:
                src = results if strategy in ['70/30', '80/20'] else cv_results
                best_params_overall[strategy][variant][model_name] = \
                    src[model_name]['All'][variant][strategy]['best_params']
            except KeyError:
                best_params_overall[strategy][variant][model_name] = None

import os
import json

def save_json(data, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as f:
        json.dump(data, f, indent=4)

# Save per-decade best params
for strategy in ['70/30', '80/20', 'CV5', 'CV10']:
    if best_params_per_decade.get(strategy):
        for decade, variant_dict in best_params_per_decade[strategy].items():
            for variant, model_dict in variant_dict.items():
                save_path = f"best_params/per_decade/{strategy}/{decade}/{variant}.json"
                save_json(model_dict, save_path)

# Save overall best params
for strategy in ['70/30', '80/20', 'CV5', 'CV10']:
    if best_params_overall.get(strategy):
        for variant, model_dict in best_params_overall[strategy].items():
            # 🛑 Skip SVC if CV10
            if strategy == "CV10" and "SVC" in model_dict:
                model_dict.pop("SVC", None)
            save_path = f"best_params/overall/{strategy}/{variant}.json"
            save_json(model_dict, save_path)

print("✅ All best hyperparameters saved to 'best_params/' directory")


model_classes = {
    'RandomForest': RandomForestClassifier,
    'ExtraTrees':   ExtraTreesClassifier,
    'GradientBoosting': GradientBoostingClassifier,
    'HistGradientBoosting': HistGradientBoostingClassifier,
    'AdaBoost':     AdaBoostClassifier,
    'LinearSVC':    LinearSVC,
    'SVC':          SVC,
    'SGD':          SGDClassifier,
    'GaussianNB':   GaussianNB,
    'DecisionTree': DecisionTreeClassifier,
    'XGBoost':      XGBClassifier,
    'LightGBM':     LGBMClassifier,
    'CatBoost': CatBoostClassifier
}

print("\nmodel_classes is defined with all supervised models.")

🛑 Skipping SVC in CV10 overall for variant: raw
✅ All best hyperparameters saved to 'best_params/' directory

model_classes is defined with all supervised models.


In [37]:
import os
import joblib
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.inspection import permutation_importance

# === CONFIG ===
CHECKPOINT_FILE = "importance_checkpoint.pkl"

# === FUNCTIONS ===
def strip_model_prefix(params):
    if params is None:
        return {}
    return {k.replace('model__', ''): v for k, v in params.items()}

def calculate_feature_importance(model, X, y, features, model_name):
    t0 = datetime.now()

    if hasattr(model, "feature_importances_"):
        print(f"{datetime.now()}    • Using built-in feature_importances_ for {model_name}")
        raw_imp = model.feature_importances_

    elif hasattr(model, "coef_"):
        print(f"{datetime.now()}    • Using |coef_| for {model_name}")
        raw_imp = np.abs(model.coef_).flatten()

    else:
        print(f"{datetime.now()}    • Computing permutation importance for {model_name}...")
        t1 = datetime.now()
        r = permutation_importance(
            estimator=model,
            X=X,
            y=y,
            n_repeats=5,
            random_state=42,
            n_jobs=-1,
            scoring="f1"
        )
        raw_imp = r.importances_mean
        print(f"{datetime.now()}    • Permutation importance done (took {(datetime.now()-t1).total_seconds():.1f}s)")

    norm_imp = raw_imp / np.sum(raw_imp)
    return pd.Series(norm_imp, index=features), (datetime.now()-t0).total_seconds()

def save_checkpoint():
    joblib.dump(importance_results, CHECKPOINT_FILE)
    print("💾 Checkpoint saved.")

# === PREP DATA ===
for decade_name, df in decade_dfs.items():
    if 'decade' not in df.columns:
        decade = decade_name.split('-')[-1].replace('.csv', '')
        df['decade'] = decade

df_all = pd.concat(decade_dfs.values(), ignore_index=True)
required_columns = features + [target_col] + ['decade']
df_all = df_all[required_columns].dropna()

# === CONFIG ===
strategies = ['70/30', '80/20', 'CV5', 'CV10']
variant_labels = ["raw", "preprocessed"]

# === LOAD CHECKPOINT OR INIT ===
if os.path.exists(CHECKPOINT_FILE):
    print("🔄 Loading previous checkpoint...")
    importance_results = joblib.load(CHECKPOINT_FILE)
else:
    importance_results = {
        'overall': {variant: {} for variant in variant_labels},
        'decades': {
            decade: {
                variant: {
                    strategy: {} for strategy in strategies
                } for variant in variant_labels
            } for decade in df_all['decade'].unique()
        }
    }

# === OVERALL ===
for strategy in strategies:
    print(f"\n=== Strategy: {strategy} ===")
    X_all = df_all[features].values
    y_all = df_all[target_col].values

    for variant in variant_labels:
        if variant == "preprocessed" and "CatBoost" in model_grids:
            continue

        for model_name, ModelClass in model_classes.items():
            if model_name == "SVC" and strategy == "CV10":
                print(f"🛑 Skipping {model_name} for {strategy} due to resource constraints")
                continue

            if model_name in importance_results['overall'][variant].get(strategy, {}):
                print(f"⏭️  Skipping {model_name} ({variant}) for {strategy}, already completed.")
                continue

            print(f"{datetime.now()} ▶ Starting {model_name} ({variant}) [Strategy: {strategy}]")
            params = best_params_overall.get(strategy, {}).get(variant, {}).get(model_name, {})
            model = ModelClass(**strip_model_prefix(params))

            try:
                model.fit(X_all, y_all)
                imp_series, time_taken = calculate_feature_importance(model, X_all, y_all, features, model_name)
                importance_results['overall'][variant].setdefault(strategy, {})[model_name] = imp_series
                save_checkpoint()
                print(f"{datetime.now()} ✔ Completed {model_name} ({variant}) (took {time_taken:.1f}s)\n")
            except Exception as e:
                print(f"❌ {model_name} ({variant}) failed on overall data: {e}")

# === PER-DECADE ===
for strategy in strategies:
    print(f"\n=== Strategy: {strategy} (Per-Decade) ===")

    for decade in df_all['decade'].unique():
        df_decade = df_all[df_all['decade'] == decade]
        X_decade = df_decade[features].values
        y_decade = df_decade[target_col].values

        for variant in variant_labels:
            if variant == "preprocessed" and "CatBoost" in model_grids:
                continue

            for model_name, ModelClass in model_classes.items():
                if model_name in importance_results['decades'][decade][variant][strategy]:
                    print(f"⏭️  Skipping {model_name} ({variant}) for {decade} [{strategy}], already completed.")
                    continue

                print(f"{datetime.now()} ▶ Starting {model_name} ({variant}) for {decade} [Strategy: {strategy}]")
                decade_key = str(decade) if str(decade) in best_params_per_decade.get(strategy, {}) else decade
                params = (
                    best_params_per_decade.get(strategy, {})
                    .get(decade_key, {})
                    .get(variant, {})
                    .get(model_name, {})
                )

                if not isinstance(params, dict) or not params:
                    print(f"⚠️  Skipping {model_name} ({variant}) for {decade} [Strategy: {strategy}] - No parameters found")
                    print(f"Available decades for {strategy}: {list(best_params_per_decade.get(strategy, {}).keys())}")
                    print(f"Checked key: {decade_key}")
                    continue


                model = ModelClass(**strip_model_prefix(params))

                try:
                    model.fit(X_decade, y_decade)
                    imp_series, time_taken = calculate_feature_importance(model, X_decade, y_decade, features, model_name)
                    importance_results['decades'][decade][variant][strategy][model_name] = imp_series
                    save_checkpoint()
                    print(f"{datetime.now()} ✔ Completed {model_name} ({variant}) for {decade} (took {time_taken:.1f}s)\n")
                except Exception as e:
                    print(f"❌ {model_name} ({variant}) failed for {decade}: {e}")

print(f"{datetime.now()} 🏁 All feature importance calculations completed")


🔄 Loading previous checkpoint...

=== Strategy: 70/30 ===
2025-07-09 12:43:17.044721 ▶ Starting RandomForest (raw) [Strategy: 70/30]
2025-07-09 12:44:01.734984    • Using built-in feature_importances_ for RandomForest
💾 Checkpoint saved.
2025-07-09 12:44:01.922139 ✔ Completed RandomForest (raw) (took 0.1s)

2025-07-09 12:44:01.922139 ▶ Starting ExtraTrees (raw) [Strategy: 70/30]
2025-07-09 12:44:16.372657    • Using built-in feature_importances_ for ExtraTrees
💾 Checkpoint saved.
2025-07-09 12:44:16.624585 ✔ Completed ExtraTrees (raw) (took 0.2s)

2025-07-09 12:44:16.624585 ▶ Starting GradientBoosting (raw) [Strategy: 70/30]
2025-07-09 12:45:48.861821    • Using built-in feature_importances_ for GradientBoosting
💾 Checkpoint saved.
2025-07-09 12:45:49.006075 ✔ Completed GradientBoosting (raw) (took 0.0s)

2025-07-09 12:45:49.006075 ▶ Starting HistGradientBoosting (raw) [Strategy: 70/30]
2025-07-09 12:45:50.584547    • Computing permutation importance for HistGradientBoosting...
2025-07

2025-07-09 13:18:15.697674    • Using built-in feature_importances_ for LightGBM
💾 Checkpoint saved.
2025-07-09 13:18:15.847216 ✔ Completed LightGBM (raw) (took 0.0s)

2025-07-09 13:18:15.847216 ▶ Starting CatBoost (raw) [Strategy: 70/30]
0:	learn: 0.6584523	total: 20.3ms	remaining: 4.04s
1:	learn: 0.6295893	total: 38.4ms	remaining: 3.81s
2:	learn: 0.6074948	total: 56.9ms	remaining: 3.73s
3:	learn: 0.5880858	total: 75.3ms	remaining: 3.69s
4:	learn: 0.5726839	total: 95ms	remaining: 3.71s
5:	learn: 0.5595511	total: 113ms	remaining: 3.64s
6:	learn: 0.5491053	total: 133ms	remaining: 3.65s
7:	learn: 0.5407770	total: 151ms	remaining: 3.63s
8:	learn: 0.5341163	total: 170ms	remaining: 3.6s
9:	learn: 0.5276722	total: 191ms	remaining: 3.63s
10:	learn: 0.5220409	total: 210ms	remaining: 3.6s
11:	learn: 0.5175008	total: 228ms	remaining: 3.56s
12:	learn: 0.5131574	total: 247ms	remaining: 3.56s
13:	learn: 0.5098670	total: 266ms	remaining: 3.54s
14:	learn: 0.5066388	total: 285ms	remaining: 3.51s
15:	l

111:	learn: 0.4459865	total: 2.11s	remaining: 1.66s
112:	learn: 0.4456817	total: 2.13s	remaining: 1.64s
113:	learn: 0.4455028	total: 2.15s	remaining: 1.62s
114:	learn: 0.4451749	total: 2.17s	remaining: 1.6s
115:	learn: 0.4449665	total: 2.19s	remaining: 1.58s
116:	learn: 0.4447059	total: 2.21s	remaining: 1.56s
117:	learn: 0.4444170	total: 2.22s	remaining: 1.54s
118:	learn: 0.4442143	total: 2.24s	remaining: 1.52s
119:	learn: 0.4438298	total: 2.26s	remaining: 1.51s
120:	learn: 0.4435879	total: 2.28s	remaining: 1.49s
121:	learn: 0.4434105	total: 2.3s	remaining: 1.47s
122:	learn: 0.4431543	total: 2.32s	remaining: 1.45s
123:	learn: 0.4427312	total: 2.33s	remaining: 1.43s
124:	learn: 0.4425310	total: 2.35s	remaining: 1.41s
125:	learn: 0.4422688	total: 2.37s	remaining: 1.39s
126:	learn: 0.4419680	total: 2.39s	remaining: 1.37s
127:	learn: 0.4416083	total: 2.41s	remaining: 1.35s
128:	learn: 0.4413147	total: 2.43s	remaining: 1.34s
129:	learn: 0.4410333	total: 2.45s	remaining: 1.32s
130:	learn: 0.



2025-07-09 13:55:15.346158    • Using built-in feature_importances_ for LightGBM
💾 Checkpoint saved.
2025-07-09 13:55:15.486754 ✔ Completed LightGBM (raw) (took 0.0s)

2025-07-09 13:55:15.486754 ▶ Starting CatBoost (raw) [Strategy: 80/20]
0:	learn: 0.6584523	total: 23.5ms	remaining: 4.67s
1:	learn: 0.6295893	total: 45.5ms	remaining: 4.5s
2:	learn: 0.6074948	total: 65.8ms	remaining: 4.32s
3:	learn: 0.5880858	total: 86.9ms	remaining: 4.26s
4:	learn: 0.5726839	total: 111ms	remaining: 4.32s
5:	learn: 0.5595511	total: 132ms	remaining: 4.26s
6:	learn: 0.5491053	total: 152ms	remaining: 4.19s
7:	learn: 0.5407770	total: 173ms	remaining: 4.15s
8:	learn: 0.5341163	total: 193ms	remaining: 4.09s
9:	learn: 0.5276722	total: 215ms	remaining: 4.08s
10:	learn: 0.5220409	total: 237ms	remaining: 4.07s
11:	learn: 0.5175008	total: 266ms	remaining: 4.17s
12:	learn: 0.5131574	total: 288ms	remaining: 4.15s
13:	learn: 0.5098670	total: 312ms	remaining: 4.15s
14:	learn: 0.5066388	total: 335ms	remaining: 4.14s
15:

165:	learn: 0.4335413	total: 3.48s	remaining: 713ms
166:	learn: 0.4333835	total: 3.5s	remaining: 692ms
167:	learn: 0.4332021	total: 3.52s	remaining: 671ms
168:	learn: 0.4329899	total: 3.54s	remaining: 650ms
169:	learn: 0.4327936	total: 3.56s	remaining: 628ms
170:	learn: 0.4326599	total: 3.58s	remaining: 607ms
171:	learn: 0.4324635	total: 3.6s	remaining: 587ms
172:	learn: 0.4322635	total: 3.62s	remaining: 566ms
173:	learn: 0.4321237	total: 3.65s	remaining: 545ms
174:	learn: 0.4319382	total: 3.66s	remaining: 524ms
175:	learn: 0.4317755	total: 3.69s	remaining: 503ms
176:	learn: 0.4316009	total: 3.71s	remaining: 482ms
177:	learn: 0.4314345	total: 3.73s	remaining: 461ms
178:	learn: 0.4312857	total: 3.75s	remaining: 440ms
179:	learn: 0.4310503	total: 3.77s	remaining: 419ms
180:	learn: 0.4308637	total: 3.79s	remaining: 398ms
181:	learn: 0.4306831	total: 3.81s	remaining: 377ms
182:	learn: 0.4305424	total: 3.83s	remaining: 356ms
183:	learn: 0.4303499	total: 3.85s	remaining: 335ms
184:	learn: 0.

💾 Checkpoint saved.
2025-07-09 14:30:14.594161 ✔ Completed LightGBM (raw) (took 0.0s)

2025-07-09 14:30:14.594161 ▶ Starting CatBoost (raw) [Strategy: CV5]
0:	learn: 0.6584523	total: 23.9ms	remaining: 4.76s
1:	learn: 0.6295893	total: 45.7ms	remaining: 4.52s
2:	learn: 0.6074948	total: 67.6ms	remaining: 4.44s
3:	learn: 0.5880858	total: 88.1ms	remaining: 4.31s
4:	learn: 0.5726839	total: 109ms	remaining: 4.25s
5:	learn: 0.5595511	total: 131ms	remaining: 4.22s
6:	learn: 0.5491053	total: 151ms	remaining: 4.15s
7:	learn: 0.5407770	total: 171ms	remaining: 4.1s
8:	learn: 0.5341163	total: 190ms	remaining: 4.03s
9:	learn: 0.5276722	total: 212ms	remaining: 4.02s
10:	learn: 0.5220409	total: 234ms	remaining: 4.01s
11:	learn: 0.5175008	total: 255ms	remaining: 3.99s
12:	learn: 0.5131574	total: 278ms	remaining: 4s
13:	learn: 0.5098670	total: 301ms	remaining: 3.99s
14:	learn: 0.5066388	total: 322ms	remaining: 3.97s
15:	learn: 0.5030874	total: 343ms	remaining: 3.94s
16:	learn: 0.5002090	total: 365ms	rema

167:	learn: 0.4332021	total: 3.37s	remaining: 641ms
168:	learn: 0.4329899	total: 3.39s	remaining: 621ms
169:	learn: 0.4327936	total: 3.4s	remaining: 601ms
170:	learn: 0.4326599	total: 3.42s	remaining: 580ms
171:	learn: 0.4324635	total: 3.44s	remaining: 560ms
172:	learn: 0.4322635	total: 3.46s	remaining: 540ms
173:	learn: 0.4321237	total: 3.47s	remaining: 519ms
174:	learn: 0.4319382	total: 3.49s	remaining: 499ms
175:	learn: 0.4317755	total: 3.51s	remaining: 479ms
176:	learn: 0.4316009	total: 3.53s	remaining: 459ms
177:	learn: 0.4314345	total: 3.55s	remaining: 438ms
178:	learn: 0.4312857	total: 3.56s	remaining: 418ms
179:	learn: 0.4310503	total: 3.58s	remaining: 398ms
180:	learn: 0.4308637	total: 3.6s	remaining: 378ms
181:	learn: 0.4306831	total: 3.62s	remaining: 358ms
182:	learn: 0.4305424	total: 3.64s	remaining: 338ms
183:	learn: 0.4303499	total: 3.65s	remaining: 318ms
184:	learn: 0.4301746	total: 3.67s	remaining: 298ms
185:	learn: 0.4299425	total: 3.69s	remaining: 278ms
186:	learn: 0.

2025-07-09 14:32:38.274010    • Using built-in feature_importances_ for LightGBM
💾 Checkpoint saved.
2025-07-09 14:32:38.422286 ✔ Completed LightGBM (raw) (took 0.0s)

2025-07-09 14:32:38.422286 ▶ Starting CatBoost (raw) [Strategy: CV10]
0:	learn: 0.6584523	total: 22ms	remaining: 4.38s
1:	learn: 0.6295893	total: 42.2ms	remaining: 4.18s
2:	learn: 0.6074948	total: 62.5ms	remaining: 4.11s
3:	learn: 0.5880858	total: 83.2ms	remaining: 4.07s
4:	learn: 0.5726839	total: 104ms	remaining: 4.07s
5:	learn: 0.5595511	total: 123ms	remaining: 3.98s
6:	learn: 0.5491053	total: 143ms	remaining: 3.95s
7:	learn: 0.5407770	total: 164ms	remaining: 3.94s
8:	learn: 0.5341163	total: 183ms	remaining: 3.87s
9:	learn: 0.5276722	total: 203ms	remaining: 3.85s
10:	learn: 0.5220409	total: 224ms	remaining: 3.85s
11:	learn: 0.5175008	total: 243ms	remaining: 3.81s
12:	learn: 0.5131574	total: 265ms	remaining: 3.81s
13:	learn: 0.5098670	total: 286ms	remaining: 3.8s
14:	learn: 0.5066388	total: 308ms	remaining: 3.8s
15:	lea

101:	learn: 0.4482599	total: 2.06s	remaining: 1.98s
102:	learn: 0.4481146	total: 2.08s	remaining: 1.96s
103:	learn: 0.4479256	total: 2.1s	remaining: 1.94s
104:	learn: 0.4477025	total: 2.12s	remaining: 1.92s
105:	learn: 0.4474125	total: 2.14s	remaining: 1.9s
106:	learn: 0.4472086	total: 2.16s	remaining: 1.88s
107:	learn: 0.4470095	total: 2.19s	remaining: 1.86s
108:	learn: 0.4466737	total: 2.21s	remaining: 1.84s
109:	learn: 0.4463939	total: 2.22s	remaining: 1.82s
110:	learn: 0.4462025	total: 2.24s	remaining: 1.8s
111:	learn: 0.4459865	total: 2.26s	remaining: 1.78s
112:	learn: 0.4456817	total: 2.28s	remaining: 1.76s
113:	learn: 0.4455028	total: 2.3s	remaining: 1.74s
114:	learn: 0.4451749	total: 2.32s	remaining: 1.72s
115:	learn: 0.4449665	total: 2.34s	remaining: 1.7s
116:	learn: 0.4447059	total: 2.36s	remaining: 1.68s
117:	learn: 0.4444170	total: 2.38s	remaining: 1.66s
118:	learn: 0.4442143	total: 2.4s	remaining: 1.63s
119:	learn: 0.4438298	total: 2.42s	remaining: 1.61s
120:	learn: 0.4435

2025-07-09 14:34:23.451788    • Using built-in feature_importances_ for LightGBM


💾 Checkpoint saved.
2025-07-09 14:34:23.627705 ✔ Completed LightGBM (raw) for 60s (took 0.0s)

2025-07-09 14:34:23.627705 ▶ Starting CatBoost (raw) for 60s [Strategy: 70/30]
0:	learn: 0.6601379	total: 6.02ms	remaining: 1.2s
1:	learn: 0.6332726	total: 12ms	remaining: 1.18s
2:	learn: 0.6125302	total: 17.7ms	remaining: 1.17s
3:	learn: 0.5936747	total: 23ms	remaining: 1.13s
4:	learn: 0.5784559	total: 28.3ms	remaining: 1.1s
5:	learn: 0.5649885	total: 33.2ms	remaining: 1.07s
6:	learn: 0.5559960	total: 39.5ms	remaining: 1.09s
7:	learn: 0.5473908	total: 45.1ms	remaining: 1.08s
8:	learn: 0.5404374	total: 51.2ms	remaining: 1.09s
9:	learn: 0.5342858	total: 57.2ms	remaining: 1.09s
10:	learn: 0.5294068	total: 62.7ms	remaining: 1.08s
11:	learn: 0.5246463	total: 69.9ms	remaining: 1.09s
12:	learn: 0.5200977	total: 76.6ms	remaining: 1.1s
13:	learn: 0.5159464	total: 82.6ms	remaining: 1.1s
14:	learn: 0.5126841	total: 89.3ms	remaining: 1.1s
15:	learn: 0.5094143	total: 95.6ms	remaining: 1.1s
16:	learn: 0.5

171:	learn: 0.3839859	total: 1.01s	remaining: 165ms
172:	learn: 0.3836515	total: 1.02s	remaining: 159ms
173:	learn: 0.3831626	total: 1.02s	remaining: 153ms
174:	learn: 0.3825221	total: 1.03s	remaining: 147ms
175:	learn: 0.3821162	total: 1.03s	remaining: 141ms
176:	learn: 0.3815414	total: 1.04s	remaining: 135ms
177:	learn: 0.3809369	total: 1.04s	remaining: 129ms
178:	learn: 0.3804774	total: 1.05s	remaining: 123ms
179:	learn: 0.3799728	total: 1.05s	remaining: 117ms
180:	learn: 0.3796415	total: 1.06s	remaining: 111ms
181:	learn: 0.3791448	total: 1.06s	remaining: 105ms
182:	learn: 0.3784260	total: 1.07s	remaining: 99.4ms
183:	learn: 0.3778994	total: 1.08s	remaining: 93.6ms
184:	learn: 0.3775212	total: 1.08s	remaining: 87.7ms
185:	learn: 0.3771286	total: 1.09s	remaining: 81.9ms
186:	learn: 0.3766183	total: 1.09s	remaining: 76.1ms
187:	learn: 0.3760714	total: 1.1s	remaining: 70.2ms
188:	learn: 0.3755399	total: 1.11s	remaining: 64.4ms
189:	learn: 0.3749968	total: 1.11s	remaining: 58.6ms
190:	

💾 Checkpoint saved.
2025-07-09 14:35:59.565418 ✔ Completed LightGBM (raw) for 70s (took 0.0s)

2025-07-09 14:35:59.565418 ▶ Starting CatBoost (raw) for 70s [Strategy: 70/30]
0:	learn: 0.6589696	total: 5.62ms	remaining: 1.12s
1:	learn: 0.6327485	total: 11.2ms	remaining: 1.1s
2:	learn: 0.6121604	total: 17.1ms	remaining: 1.12s
3:	learn: 0.5956490	total: 22.6ms	remaining: 1.11s
4:	learn: 0.5800557	total: 28.2ms	remaining: 1.1s
5:	learn: 0.5694093	total: 33.5ms	remaining: 1.08s
6:	learn: 0.5593082	total: 38.1ms	remaining: 1.05s
7:	learn: 0.5505620	total: 42.8ms	remaining: 1.03s
8:	learn: 0.5417803	total: 48ms	remaining: 1.02s
9:	learn: 0.5351542	total: 52.5ms	remaining: 997ms
10:	learn: 0.5299465	total: 56.9ms	remaining: 978ms
11:	learn: 0.5248109	total: 61.3ms	remaining: 960ms
12:	learn: 0.5201762	total: 65.7ms	remaining: 945ms
13:	learn: 0.5157142	total: 70.4ms	remaining: 935ms
14:	learn: 0.5117580	total: 74.4ms	remaining: 918ms
15:	learn: 0.5088400	total: 78.8ms	remaining: 907ms
16:	lear

172:	learn: 0.3774246	total: 960ms	remaining: 150ms
173:	learn: 0.3767453	total: 966ms	remaining: 144ms
174:	learn: 0.3761223	total: 972ms	remaining: 139ms
175:	learn: 0.3754275	total: 977ms	remaining: 133ms
176:	learn: 0.3749211	total: 983ms	remaining: 128ms
177:	learn: 0.3745402	total: 989ms	remaining: 122ms
178:	learn: 0.3741115	total: 995ms	remaining: 117ms
179:	learn: 0.3734226	total: 1s	remaining: 111ms
180:	learn: 0.3727709	total: 1.01s	remaining: 106ms
181:	learn: 0.3721375	total: 1.01s	remaining: 100ms
182:	learn: 0.3717588	total: 1.02s	remaining: 94.8ms
183:	learn: 0.3713374	total: 1.03s	remaining: 89.2ms
184:	learn: 0.3708698	total: 1.03s	remaining: 83.7ms
185:	learn: 0.3702205	total: 1.04s	remaining: 78.1ms
186:	learn: 0.3697777	total: 1.04s	remaining: 72.6ms
187:	learn: 0.3691617	total: 1.05s	remaining: 67ms
188:	learn: 0.3685917	total: 1.05s	remaining: 61.5ms
189:	learn: 0.3682010	total: 1.06s	remaining: 55.9ms
190:	learn: 0.3675387	total: 1.07s	remaining: 50.3ms
191:	lea

2025-07-09 14:37:11.016531    • Using built-in feature_importances_ for LightGBM


💾 Checkpoint saved.
2025-07-09 14:37:11.179274 ✔ Completed LightGBM (raw) for 80s (took 0.0s)

2025-07-09 14:37:11.179274 ▶ Starting CatBoost (raw) for 80s [Strategy: 70/30]
0:	learn: 0.6575415	total: 5.21ms	remaining: 1.04s
1:	learn: 0.6332170	total: 9.63ms	remaining: 953ms
2:	learn: 0.6114710	total: 27ms	remaining: 1.77s
3:	learn: 0.5957087	total: 30.7ms	remaining: 1.51s
4:	learn: 0.5811073	total: 36.3ms	remaining: 1.42s
5:	learn: 0.5700199	total: 40.9ms	remaining: 1.32s
6:	learn: 0.5602376	total: 45ms	remaining: 1.24s
7:	learn: 0.5509140	total: 50ms	remaining: 1.2s
8:	learn: 0.5416350	total: 54.2ms	remaining: 1.15s
9:	learn: 0.5339259	total: 57.9ms	remaining: 1.1s
10:	learn: 0.5266463	total: 61.3ms	remaining: 1.05s
11:	learn: 0.5214015	total: 64.7ms	remaining: 1.01s
12:	learn: 0.5163900	total: 69.5ms	remaining: 999ms
13:	learn: 0.5134461	total: 73.1ms	remaining: 971ms
14:	learn: 0.5088769	total: 78.3ms	remaining: 965ms
15:	learn: 0.5047838	total: 82.7ms	remaining: 951ms
16:	learn: 0

182:	learn: 0.3899214	total: 773ms	remaining: 71.8ms
183:	learn: 0.3896196	total: 777ms	remaining: 67.5ms
184:	learn: 0.3893251	total: 780ms	remaining: 63.3ms
185:	learn: 0.3889364	total: 787ms	remaining: 59.3ms
186:	learn: 0.3886122	total: 791ms	remaining: 55ms
187:	learn: 0.3883446	total: 794ms	remaining: 50.7ms
188:	learn: 0.3880284	total: 798ms	remaining: 46.5ms
189:	learn: 0.3877672	total: 801ms	remaining: 42.2ms
190:	learn: 0.3874158	total: 806ms	remaining: 38ms
191:	learn: 0.3871308	total: 811ms	remaining: 33.8ms
192:	learn: 0.3868770	total: 815ms	remaining: 29.6ms
193:	learn: 0.3867234	total: 820ms	remaining: 25.4ms
194:	learn: 0.3864578	total: 825ms	remaining: 21.1ms
195:	learn: 0.3860532	total: 829ms	remaining: 16.9ms
196:	learn: 0.3858603	total: 834ms	remaining: 12.7ms
197:	learn: 0.3855467	total: 838ms	remaining: 8.46ms
198:	learn: 0.3853302	total: 842ms	remaining: 4.23ms
199:	learn: 0.3850621	total: 847ms	remaining: 0us
2025-07-09 14:37:12.111333    • Using built-in featur

💾 Checkpoint saved.
2025-07-09 14:37:57.492211 ✔ Completed LightGBM (raw) for 90s (took 0.0s)

2025-07-09 14:37:57.492211 ▶ Starting CatBoost (raw) for 90s [Strategy: 70/30]
0:	learn: 0.6608297	total: 7.23ms	remaining: 1.44s
1:	learn: 0.6307599	total: 13ms	remaining: 1.28s
2:	learn: 0.6049156	total: 18.7ms	remaining: 1.23s
3:	learn: 0.5800756	total: 24.3ms	remaining: 1.19s
4:	learn: 0.5606913	total: 30.5ms	remaining: 1.19s
5:	learn: 0.5435634	total: 35.4ms	remaining: 1.14s
6:	learn: 0.5295114	total: 39.9ms	remaining: 1.1s
7:	learn: 0.5148547	total: 44.2ms	remaining: 1.06s
8:	learn: 0.5026442	total: 48.5ms	remaining: 1.03s
9:	learn: 0.4905679	total: 52.9ms	remaining: 1s
10:	learn: 0.4806768	total: 57.3ms	remaining: 985ms
11:	learn: 0.4713690	total: 61.4ms	remaining: 962ms
12:	learn: 0.4638794	total: 66.2ms	remaining: 952ms
13:	learn: 0.4554325	total: 70.4ms	remaining: 936ms
14:	learn: 0.4480985	total: 75.1ms	remaining: 926ms
15:	learn: 0.4410085	total: 79.4ms	remaining: 913ms
16:	learn:

193:	learn: 0.2944447	total: 962ms	remaining: 29.8ms
194:	learn: 0.2941441	total: 968ms	remaining: 24.8ms
195:	learn: 0.2937607	total: 973ms	remaining: 19.9ms
196:	learn: 0.2935002	total: 978ms	remaining: 14.9ms
197:	learn: 0.2931689	total: 983ms	remaining: 9.93ms
198:	learn: 0.2928616	total: 988ms	remaining: 4.96ms
199:	learn: 0.2925887	total: 992ms	remaining: 0us
2025-07-09 14:37:58.569316    • Using built-in feature_importances_ for CatBoost
💾 Checkpoint saved.
2025-07-09 14:37:58.777303 ✔ Completed CatBoost (raw) for 90s (took 0.0s)

2025-07-09 14:37:58.785442 ▶ Starting RandomForest (raw) for 00s [Strategy: 70/30]
2025-07-09 14:38:03.922697    • Using built-in feature_importances_ for RandomForest
💾 Checkpoint saved.
2025-07-09 14:38:04.320708 ✔ Completed RandomForest (raw) for 00s (took 0.1s)

2025-07-09 14:38:04.320708 ▶ Starting ExtraTrees (raw) for 00s [Strategy: 70/30]
2025-07-09 14:38:05.470707    • Using built-in feature_importances_ for ExtraTrees
💾 Checkpoint saved.
2025-

💾 Checkpoint saved.
2025-07-09 14:38:49.462608 ✔ Completed LightGBM (raw) for 00s (took 0.0s)

2025-07-09 14:38:49.462608 ▶ Starting CatBoost (raw) for 00s [Strategy: 70/30]
0:	learn: 0.6291804	total: 7.63ms	remaining: 1.52s
1:	learn: 0.5840525	total: 13.3ms	remaining: 1.31s
2:	learn: 0.5468364	total: 19.5ms	remaining: 1.28s
3:	learn: 0.5168172	total: 25.6ms	remaining: 1.25s
4:	learn: 0.4945318	total: 31.1ms	remaining: 1.21s
5:	learn: 0.4752991	total: 37.6ms	remaining: 1.22s
6:	learn: 0.4567337	total: 43.7ms	remaining: 1.2s
7:	learn: 0.4432087	total: 49.7ms	remaining: 1.19s
8:	learn: 0.4305474	total: 54.9ms	remaining: 1.16s
9:	learn: 0.4213552	total: 60.1ms	remaining: 1.14s
10:	learn: 0.4128118	total: 65ms	remaining: 1.12s
11:	learn: 0.4061927	total: 69.9ms	remaining: 1.09s
12:	learn: 0.3982092	total: 74.7ms	remaining: 1.07s
13:	learn: 0.3931620	total: 79.1ms	remaining: 1.05s
14:	learn: 0.3884384	total: 83.5ms	remaining: 1.03s
15:	learn: 0.3839255	total: 88.4ms	remaining: 1.02s
16:	lea

192:	learn: 0.2318322	total: 970ms	remaining: 35.2ms
193:	learn: 0.2312157	total: 974ms	remaining: 30.1ms
194:	learn: 0.2309222	total: 979ms	remaining: 25.1ms
195:	learn: 0.2305960	total: 984ms	remaining: 20.1ms
196:	learn: 0.2301873	total: 989ms	remaining: 15.1ms
197:	learn: 0.2296828	total: 995ms	remaining: 10ms
198:	learn: 0.2291251	total: 1s	remaining: 5.03ms
199:	learn: 0.2284602	total: 1.01s	remaining: 0us
2025-07-09 14:38:50.558112    • Using built-in feature_importances_ for CatBoost
💾 Checkpoint saved.
2025-07-09 14:38:50.779108 ✔ Completed CatBoost (raw) for 00s (took 0.0s)

2025-07-09 14:38:50.788107 ▶ Starting RandomForest (raw) for 10s [Strategy: 70/30]
2025-07-09 14:38:57.286852    • Using built-in feature_importances_ for RandomForest
💾 Checkpoint saved.
2025-07-09 14:38:57.545071 ✔ Completed RandomForest (raw) for 10s (took 0.1s)

2025-07-09 14:38:57.545071 ▶ Starting ExtraTrees (raw) for 10s [Strategy: 70/30]
2025-07-09 14:38:59.875391    • Using built-in feature_impor

💾 Checkpoint saved.
2025-07-09 14:39:54.938389 ✔ Completed LightGBM (raw) for 10s (took 0.0s)

2025-07-09 14:39:54.938389 ▶ Starting CatBoost (raw) for 10s [Strategy: 70/30]
0:	learn: 0.6659181	total: 5.98ms	remaining: 1.19s
1:	learn: 0.6349384	total: 11.2ms	remaining: 1.11s
2:	learn: 0.6097384	total: 16.6ms	remaining: 1.09s
3:	learn: 0.5893985	total: 22.1ms	remaining: 1.08s
4:	learn: 0.5733413	total: 26.7ms	remaining: 1.04s
5:	learn: 0.5567765	total: 32.5ms	remaining: 1.05s
6:	learn: 0.5400394	total: 38.4ms	remaining: 1.06s
7:	learn: 0.5260598	total: 43.6ms	remaining: 1.05s
8:	learn: 0.5134171	total: 48.9ms	remaining: 1.04s
9:	learn: 0.5032709	total: 53.3ms	remaining: 1.01s
10:	learn: 0.4941083	total: 58.4ms	remaining: 1s
11:	learn: 0.4861387	total: 63.8ms	remaining: 1000ms
12:	learn: 0.4769323	total: 68.4ms	remaining: 983ms
13:	learn: 0.4699920	total: 72.8ms	remaining: 967ms
14:	learn: 0.4645830	total: 78.3ms	remaining: 966ms
15:	learn: 0.4584101	total: 84.3ms	remaining: 970ms
16:	le

187:	learn: 0.3109288	total: 984ms	remaining: 62.8ms
188:	learn: 0.3106295	total: 989ms	remaining: 57.6ms
189:	learn: 0.3103408	total: 995ms	remaining: 52.3ms
190:	learn: 0.3099779	total: 1s	remaining: 47.1ms
191:	learn: 0.3097030	total: 1s	remaining: 41.9ms
192:	learn: 0.3093405	total: 1.01s	remaining: 36.7ms
193:	learn: 0.3089296	total: 1.02s	remaining: 31.4ms
194:	learn: 0.3085324	total: 1.02s	remaining: 26.2ms
195:	learn: 0.3080833	total: 1.03s	remaining: 21ms
196:	learn: 0.3077279	total: 1.03s	remaining: 15.7ms
197:	learn: 0.3074320	total: 1.04s	remaining: 10.5ms
198:	learn: 0.3070819	total: 1.04s	remaining: 5.24ms
199:	learn: 0.3066969	total: 1.05s	remaining: 0us
2025-07-09 14:39:56.078714    • Using built-in feature_importances_ for CatBoost
💾 Checkpoint saved.
2025-07-09 14:39:56.300550 ✔ Completed CatBoost (raw) for 10s (took 0.0s)


=== Strategy: 80/20 (Per-Decade) ===
2025-07-09 14:39:56.312907 ▶ Starting RandomForest (raw) for 60s [Strategy: 80/20]
2025-07-09 14:40:04.64946

💾 Checkpoint saved.
2025-07-09 14:41:39.894733 ✔ Completed LightGBM (raw) for 60s (took 0.0s)

2025-07-09 14:41:39.894733 ▶ Starting CatBoost (raw) for 60s [Strategy: 80/20]
0:	learn: 0.6601379	total: 6.44ms	remaining: 1.28s
1:	learn: 0.6332726	total: 13ms	remaining: 1.29s
2:	learn: 0.6125302	total: 19.7ms	remaining: 1.3s
3:	learn: 0.5936747	total: 25ms	remaining: 1.23s
4:	learn: 0.5784559	total: 30.6ms	remaining: 1.19s
5:	learn: 0.5649885	total: 36.6ms	remaining: 1.18s
6:	learn: 0.5559960	total: 42.9ms	remaining: 1.18s
7:	learn: 0.5473908	total: 47.7ms	remaining: 1.15s
8:	learn: 0.5404374	total: 54.1ms	remaining: 1.15s
9:	learn: 0.5342858	total: 59.9ms	remaining: 1.14s
10:	learn: 0.5294068	total: 65.4ms	remaining: 1.12s
11:	learn: 0.5246463	total: 71.4ms	remaining: 1.12s
12:	learn: 0.5200977	total: 76.7ms	remaining: 1.1s
13:	learn: 0.5159464	total: 82.2ms	remaining: 1.09s
14:	learn: 0.5126841	total: 87.9ms	remaining: 1.08s
15:	learn: 0.5094143	total: 93.3ms	remaining: 1.07s
16:	learn:

169:	learn: 0.3850230	total: 988ms	remaining: 174ms
170:	learn: 0.3844708	total: 993ms	remaining: 168ms
171:	learn: 0.3839859	total: 998ms	remaining: 162ms
172:	learn: 0.3836515	total: 1s	remaining: 157ms
173:	learn: 0.3831626	total: 1.01s	remaining: 151ms
174:	learn: 0.3825221	total: 1.01s	remaining: 145ms
175:	learn: 0.3821162	total: 1.02s	remaining: 139ms
176:	learn: 0.3815414	total: 1.02s	remaining: 133ms
177:	learn: 0.3809369	total: 1.03s	remaining: 127ms
178:	learn: 0.3804774	total: 1.03s	remaining: 122ms
179:	learn: 0.3799728	total: 1.04s	remaining: 116ms
180:	learn: 0.3796415	total: 1.05s	remaining: 110ms
181:	learn: 0.3791448	total: 1.05s	remaining: 104ms
182:	learn: 0.3784260	total: 1.06s	remaining: 98.5ms
183:	learn: 0.3778994	total: 1.07s	remaining: 92.8ms
184:	learn: 0.3775212	total: 1.07s	remaining: 87.1ms
185:	learn: 0.3771286	total: 1.08s	remaining: 81.3ms
186:	learn: 0.3766183	total: 1.09s	remaining: 75.5ms
187:	learn: 0.3760714	total: 1.09s	remaining: 69.7ms
188:	lear

💾 Checkpoint saved.
2025-07-09 14:43:14.360867 ✔ Completed LightGBM (raw) for 70s (took 0.0s)

2025-07-09 14:43:14.360867 ▶ Starting CatBoost (raw) for 70s [Strategy: 80/20]
0:	learn: 0.6589696	total: 7.52ms	remaining: 1.5s
1:	learn: 0.6327485	total: 14.8ms	remaining: 1.47s
2:	learn: 0.6121604	total: 22ms	remaining: 1.45s
3:	learn: 0.5956490	total: 27.7ms	remaining: 1.35s
4:	learn: 0.5800557	total: 33.7ms	remaining: 1.31s
5:	learn: 0.5694093	total: 40ms	remaining: 1.29s
6:	learn: 0.5593082	total: 45.8ms	remaining: 1.26s
7:	learn: 0.5505620	total: 51.8ms	remaining: 1.24s
8:	learn: 0.5417803	total: 58.1ms	remaining: 1.23s
9:	learn: 0.5351542	total: 64.4ms	remaining: 1.22s
10:	learn: 0.5299465	total: 69.9ms	remaining: 1.2s
11:	learn: 0.5248109	total: 75.5ms	remaining: 1.18s
12:	learn: 0.5201762	total: 81.7ms	remaining: 1.17s
13:	learn: 0.5157142	total: 87.6ms	remaining: 1.16s
14:	learn: 0.5117580	total: 92.9ms	remaining: 1.15s
15:	learn: 0.5088400	total: 97.8ms	remaining: 1.12s
16:	learn:

165:	learn: 0.3815240	total: 958ms	remaining: 196ms
166:	learn: 0.3806921	total: 964ms	remaining: 191ms
167:	learn: 0.3801347	total: 971ms	remaining: 185ms
168:	learn: 0.3796940	total: 977ms	remaining: 179ms
169:	learn: 0.3789696	total: 983ms	remaining: 174ms
170:	learn: 0.3784438	total: 990ms	remaining: 168ms
171:	learn: 0.3779721	total: 995ms	remaining: 162ms
172:	learn: 0.3774246	total: 1s	remaining: 156ms
173:	learn: 0.3767453	total: 1s	remaining: 150ms
174:	learn: 0.3761223	total: 1.01s	remaining: 144ms
175:	learn: 0.3754275	total: 1.01s	remaining: 138ms
176:	learn: 0.3749211	total: 1.02s	remaining: 133ms
177:	learn: 0.3745402	total: 1.02s	remaining: 127ms
178:	learn: 0.3741115	total: 1.03s	remaining: 121ms
179:	learn: 0.3734226	total: 1.03s	remaining: 115ms
180:	learn: 0.3727709	total: 1.04s	remaining: 109ms
181:	learn: 0.3721375	total: 1.05s	remaining: 103ms
182:	learn: 0.3717588	total: 1.05s	remaining: 97.8ms
183:	learn: 0.3713374	total: 1.06s	remaining: 92.1ms
184:	learn: 0.37

💾 Checkpoint saved.
2025-07-09 14:44:34.764355 ✔ Completed LightGBM (raw) for 80s (took 0.0s)

2025-07-09 14:44:34.764355 ▶ Starting CatBoost (raw) for 80s [Strategy: 80/20]
0:	learn: 0.6714080	total: 7.12ms	remaining: 1.42s
1:	learn: 0.6499452	total: 13.5ms	remaining: 1.33s
2:	learn: 0.6330730	total: 18.7ms	remaining: 1.23s
3:	learn: 0.6179654	total: 24.1ms	remaining: 1.18s
4:	learn: 0.6044197	total: 29.1ms	remaining: 1.14s
5:	learn: 0.5909117	total: 34.1ms	remaining: 1.1s
6:	learn: 0.5804610	total: 39.2ms	remaining: 1.08s
7:	learn: 0.5713197	total: 45.2ms	remaining: 1.08s
8:	learn: 0.5619365	total: 51.1ms	remaining: 1.08s
9:	learn: 0.5554648	total: 57.2ms	remaining: 1.09s
10:	learn: 0.5475248	total: 62.9ms	remaining: 1.08s
11:	learn: 0.5406640	total: 68.3ms	remaining: 1.07s
12:	learn: 0.5356838	total: 73.4ms	remaining: 1.05s
13:	learn: 0.5296247	total: 78.4ms	remaining: 1.04s
14:	learn: 0.5247810	total: 84.1ms	remaining: 1.04s
15:	learn: 0.5194852	total: 89.8ms	remaining: 1.03s
16:	l

184:	learn: 0.3851754	total: 976ms	remaining: 79.2ms
185:	learn: 0.3848152	total: 982ms	remaining: 73.9ms
186:	learn: 0.3844986	total: 988ms	remaining: 68.7ms
187:	learn: 0.3841653	total: 994ms	remaining: 63.4ms
188:	learn: 0.3838602	total: 999ms	remaining: 58.2ms
189:	learn: 0.3836332	total: 1s	remaining: 52.9ms
190:	learn: 0.3833657	total: 1.01s	remaining: 47.7ms
191:	learn: 0.3829667	total: 1.02s	remaining: 42.4ms
192:	learn: 0.3827505	total: 1.02s	remaining: 37.1ms
193:	learn: 0.3822916	total: 1.03s	remaining: 31.9ms
194:	learn: 0.3818630	total: 1.03s	remaining: 26.5ms
195:	learn: 0.3815349	total: 1.04s	remaining: 21.2ms
196:	learn: 0.3811697	total: 1.04s	remaining: 15.9ms
197:	learn: 0.3809024	total: 1.05s	remaining: 10.6ms
198:	learn: 0.3806967	total: 1.05s	remaining: 5.3ms
199:	learn: 0.3803844	total: 1.06s	remaining: 0us
2025-07-09 14:44:35.910147    • Using built-in feature_importances_ for CatBoost
💾 Checkpoint saved.
2025-07-09 14:44:36.158365 ✔ Completed CatBoost (raw) for 

💾 Checkpoint saved.
2025-07-09 14:45:33.386060 ✔ Completed LightGBM (raw) for 90s (took 0.0s)

2025-07-09 14:45:33.386060 ▶ Starting CatBoost (raw) for 90s [Strategy: 80/20]
0:	learn: 0.6608297	total: 8.13ms	remaining: 1.62s
1:	learn: 0.6307599	total: 13.8ms	remaining: 1.37s
2:	learn: 0.6049156	total: 19.7ms	remaining: 1.29s
3:	learn: 0.5800756	total: 25.1ms	remaining: 1.23s
4:	learn: 0.5606913	total: 29.6ms	remaining: 1.15s
5:	learn: 0.5435634	total: 35.2ms	remaining: 1.14s
6:	learn: 0.5295114	total: 39.5ms	remaining: 1.09s
7:	learn: 0.5148547	total: 44.6ms	remaining: 1.07s
8:	learn: 0.5026442	total: 50.3ms	remaining: 1.07s
9:	learn: 0.4905679	total: 55.9ms	remaining: 1.06s
10:	learn: 0.4806768	total: 60.4ms	remaining: 1.04s
11:	learn: 0.4713690	total: 64.6ms	remaining: 1.01s
12:	learn: 0.4638794	total: 70.2ms	remaining: 1.01s
13:	learn: 0.4554325	total: 75.7ms	remaining: 1s
14:	learn: 0.4480985	total: 80.5ms	remaining: 993ms
15:	learn: 0.4410085	total: 84.8ms	remaining: 976ms
16:	lea

160:	learn: 0.3056626	total: 779ms	remaining: 189ms
161:	learn: 0.3053776	total: 783ms	remaining: 184ms
162:	learn: 0.3051221	total: 787ms	remaining: 179ms
163:	learn: 0.3047736	total: 792ms	remaining: 174ms
164:	learn: 0.3043153	total: 796ms	remaining: 169ms
165:	learn: 0.3040458	total: 800ms	remaining: 164ms
166:	learn: 0.3038667	total: 804ms	remaining: 159ms
167:	learn: 0.3036220	total: 808ms	remaining: 154ms
168:	learn: 0.3032981	total: 812ms	remaining: 149ms
169:	learn: 0.3027828	total: 818ms	remaining: 144ms
170:	learn: 0.3026208	total: 823ms	remaining: 140ms
171:	learn: 0.3022940	total: 828ms	remaining: 135ms
172:	learn: 0.3017119	total: 833ms	remaining: 130ms
173:	learn: 0.3014529	total: 837ms	remaining: 125ms
174:	learn: 0.3012144	total: 843ms	remaining: 120ms
175:	learn: 0.3008953	total: 848ms	remaining: 116ms
176:	learn: 0.3006480	total: 854ms	remaining: 111ms
177:	learn: 0.3002704	total: 859ms	remaining: 106ms
178:	learn: 0.2999255	total: 864ms	remaining: 101ms
179:	learn: 

💾 Checkpoint saved.
2025-07-09 14:46:36.225741 ✔ Completed LightGBM (raw) for 00s (took 0.0s)

2025-07-09 14:46:36.225741 ▶ Starting CatBoost (raw) for 00s [Strategy: 80/20]
0:	learn: 0.6529041	total: 3.82ms	remaining: 378ms
1:	learn: 0.6209582	total: 7.19ms	remaining: 352ms
2:	learn: 0.5909076	total: 11.7ms	remaining: 377ms
3:	learn: 0.5665622	total: 14.5ms	remaining: 347ms
4:	learn: 0.5467533	total: 17.1ms	remaining: 325ms
5:	learn: 0.5285353	total: 20ms	remaining: 313ms
6:	learn: 0.5120776	total: 23.3ms	remaining: 310ms
7:	learn: 0.4980901	total: 27.4ms	remaining: 315ms
8:	learn: 0.4852473	total: 30.9ms	remaining: 313ms
9:	learn: 0.4721851	total: 34.5ms	remaining: 310ms
10:	learn: 0.4611056	total: 37.7ms	remaining: 305ms
11:	learn: 0.4522464	total: 41.3ms	remaining: 303ms
12:	learn: 0.4430992	total: 44.5ms	remaining: 298ms
13:	learn: 0.4354935	total: 47.9ms	remaining: 294ms
14:	learn: 0.4292484	total: 51.3ms	remaining: 291ms
15:	learn: 0.4241610	total: 54.6ms	remaining: 287ms
16:	le

💾 Checkpoint saved.
2025-07-09 14:47:46.669514 ✔ Completed XGBoost (raw) for 10s (took 0.0s)

2025-07-09 14:47:46.669514 ▶ Starting LightGBM (raw) for 10s [Strategy: 80/20]
[LightGBM] [Info] Number of positive: 3197, number of negative: 3199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2854
[LightGBM] [Info] Number of data points in the train set: 6396, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499844 -> initscore=-0.000625
[LightGBM] [Info] Start training from score -0.000625
2025-07-09 14:47:46.837915    • Using built-in feature_importances_ for LightGBM


💾 Checkpoint saved.
2025-07-09 14:47:47.119678 ✔ Completed LightGBM (raw) for 10s (took 0.0s)

2025-07-09 14:47:47.119678 ▶ Starting CatBoost (raw) for 10s [Strategy: 80/20]
0:	learn: 0.6406880	total: 14.9ms	remaining: 2.96s
1:	learn: 0.5926509	total: 19.9ms	remaining: 1.97s
2:	learn: 0.5533621	total: 25.5ms	remaining: 1.68s
3:	learn: 0.5255492	total: 31.4ms	remaining: 1.54s
4:	learn: 0.5040522	total: 35.7ms	remaining: 1.39s
5:	learn: 0.4876672	total: 41.4ms	remaining: 1.34s
6:	learn: 0.4698106	total: 46.7ms	remaining: 1.29s
7:	learn: 0.4554427	total: 51.5ms	remaining: 1.24s
8:	learn: 0.4428186	total: 56.6ms	remaining: 1.2s
9:	learn: 0.4343138	total: 61.2ms	remaining: 1.16s
10:	learn: 0.4261379	total: 65.3ms	remaining: 1.12s
11:	learn: 0.4205984	total: 69.9ms	remaining: 1.09s
12:	learn: 0.4155794	total: 75.5ms	remaining: 1.09s
13:	learn: 0.4104982	total: 80.6ms	remaining: 1.07s
14:	learn: 0.4059174	total: 86.2ms	remaining: 1.06s
15:	learn: 0.4023441	total: 90.5ms	remaining: 1.04s
16:	l

168:	learn: 0.2658735	total: 798ms	remaining: 146ms
169:	learn: 0.2653849	total: 803ms	remaining: 142ms
170:	learn: 0.2649245	total: 808ms	remaining: 137ms
171:	learn: 0.2644190	total: 812ms	remaining: 132ms
172:	learn: 0.2639448	total: 818ms	remaining: 128ms
173:	learn: 0.2633924	total: 823ms	remaining: 123ms
174:	learn: 0.2630166	total: 828ms	remaining: 118ms
175:	learn: 0.2624843	total: 832ms	remaining: 113ms
176:	learn: 0.2618271	total: 837ms	remaining: 109ms
177:	learn: 0.2613830	total: 841ms	remaining: 104ms
178:	learn: 0.2605704	total: 846ms	remaining: 99.3ms
179:	learn: 0.2599447	total: 850ms	remaining: 94.5ms
180:	learn: 0.2594553	total: 855ms	remaining: 89.8ms
181:	learn: 0.2589447	total: 861ms	remaining: 85.1ms
182:	learn: 0.2583391	total: 866ms	remaining: 80.5ms
183:	learn: 0.2578888	total: 870ms	remaining: 75.7ms
184:	learn: 0.2575422	total: 875ms	remaining: 70.9ms
185:	learn: 0.2571744	total: 879ms	remaining: 66.2ms
186:	learn: 0.2565761	total: 885ms	remaining: 61.5ms
187

2025-07-09 14:49:41.915093    • Using built-in feature_importances_ for LightGBM
💾 Checkpoint saved.
2025-07-09 14:49:42.174948 ✔ Completed LightGBM (raw) for 60s (took 0.0s)

2025-07-09 14:49:42.174948 ▶ Starting CatBoost (raw) for 60s [Strategy: CV5]
0:	learn: 0.6601379	total: 7.15ms	remaining: 1.42s
1:	learn: 0.6332726	total: 13.1ms	remaining: 1.3s
2:	learn: 0.6125302	total: 19.3ms	remaining: 1.26s
3:	learn: 0.5936747	total: 24.9ms	remaining: 1.22s
4:	learn: 0.5784559	total: 31.1ms	remaining: 1.21s
5:	learn: 0.5649885	total: 37.5ms	remaining: 1.21s
6:	learn: 0.5559960	total: 43.4ms	remaining: 1.2s
7:	learn: 0.5473908	total: 49.9ms	remaining: 1.2s
8:	learn: 0.5404374	total: 56.5ms	remaining: 1.2s
9:	learn: 0.5342858	total: 62.2ms	remaining: 1.18s
10:	learn: 0.5294068	total: 67.7ms	remaining: 1.16s
11:	learn: 0.5246463	total: 73ms	remaining: 1.14s
12:	learn: 0.5200977	total: 78.9ms	remaining: 1.13s
13:	learn: 0.5159464	total: 84.1ms	remaining: 1.12s
14:	learn: 0.5126841	total: 90.2ms	

178:	learn: 0.3804774	total: 974ms	remaining: 114ms
179:	learn: 0.3799728	total: 980ms	remaining: 109ms
180:	learn: 0.3796415	total: 985ms	remaining: 103ms
181:	learn: 0.3791448	total: 990ms	remaining: 97.9ms
182:	learn: 0.3784260	total: 995ms	remaining: 92.4ms
183:	learn: 0.3778994	total: 1000ms	remaining: 86.9ms
184:	learn: 0.3775212	total: 1s	remaining: 81.5ms
185:	learn: 0.3771286	total: 1.01s	remaining: 76.1ms
186:	learn: 0.3766183	total: 1.02s	remaining: 70.7ms
187:	learn: 0.3760714	total: 1.02s	remaining: 65.3ms
188:	learn: 0.3755399	total: 1.03s	remaining: 59.8ms
189:	learn: 0.3749968	total: 1.03s	remaining: 54.3ms
190:	learn: 0.3745463	total: 1.04s	remaining: 48.9ms
191:	learn: 0.3739486	total: 1.04s	remaining: 43.4ms
192:	learn: 0.3735286	total: 1.05s	remaining: 38ms
193:	learn: 0.3730717	total: 1.05s	remaining: 32.6ms
194:	learn: 0.3726385	total: 1.06s	remaining: 27.2ms
195:	learn: 0.3719832	total: 1.06s	remaining: 21.7ms
196:	learn: 0.3716175	total: 1.07s	remaining: 16.3ms


💾 Checkpoint saved.
2025-07-09 14:51:12.329403 ✔ Completed LightGBM (raw) for 70s (took 0.0s)

2025-07-09 14:51:12.329403 ▶ Starting CatBoost (raw) for 70s [Strategy: CV5]
0:	learn: 0.6753769	total: 5.77ms	remaining: 1.15s
1:	learn: 0.6601356	total: 10.8ms	remaining: 1.07s
2:	learn: 0.6464859	total: 16.7ms	remaining: 1.09s
3:	learn: 0.6349473	total: 22.3ms	remaining: 1.09s
4:	learn: 0.6233318	total: 26.9ms	remaining: 1.05s
5:	learn: 0.6145778	total: 31.4ms	remaining: 1.01s
6:	learn: 0.6051587	total: 37.2ms	remaining: 1.02s
7:	learn: 0.5959372	total: 42ms	remaining: 1.01s
8:	learn: 0.5888400	total: 47.9ms	remaining: 1.02s
9:	learn: 0.5818583	total: 53.7ms	remaining: 1.02s
10:	learn: 0.5754928	total: 58.4ms	remaining: 1s
11:	learn: 0.5698585	total: 63ms	remaining: 987ms
12:	learn: 0.5644427	total: 67.9ms	remaining: 976ms
13:	learn: 0.5588062	total: 73.1ms	remaining: 971ms
14:	learn: 0.5536516	total: 79.4ms	remaining: 979ms
15:	learn: 0.5491555	total: 86ms	remaining: 989ms
16:	learn: 0.54

181:	learn: 0.4301870	total: 970ms	remaining: 96ms
182:	learn: 0.4298420	total: 974ms	remaining: 90.5ms
183:	learn: 0.4295872	total: 981ms	remaining: 85.3ms
184:	learn: 0.4293678	total: 985ms	remaining: 79.9ms
185:	learn: 0.4288501	total: 992ms	remaining: 74.7ms
186:	learn: 0.4284846	total: 997ms	remaining: 69.3ms
187:	learn: 0.4280111	total: 1s	remaining: 63.9ms
188:	learn: 0.4277790	total: 1.01s	remaining: 58.6ms
189:	learn: 0.4275551	total: 1.01s	remaining: 53.2ms
190:	learn: 0.4273323	total: 1.02s	remaining: 47.9ms
191:	learn: 0.4268234	total: 1.02s	remaining: 42.6ms
192:	learn: 0.4264787	total: 1.03s	remaining: 37.3ms
193:	learn: 0.4261225	total: 1.03s	remaining: 31.9ms
194:	learn: 0.4257550	total: 1.04s	remaining: 26.6ms
195:	learn: 0.4254400	total: 1.04s	remaining: 21.3ms
196:	learn: 0.4250566	total: 1.05s	remaining: 16ms
197:	learn: 0.4247027	total: 1.05s	remaining: 10.6ms
198:	learn: 0.4243829	total: 1.06s	remaining: 5.32ms
199:	learn: 0.4241059	total: 1.06s	remaining: 0us
202

💾 Checkpoint saved.
2025-07-09 14:52:38.047997 ✔ Completed LightGBM (raw) for 80s (took 0.0s)

2025-07-09 14:52:38.047997 ▶ Starting CatBoost (raw) for 80s [Strategy: CV5]
0:	learn: 0.6515681	total: 5.28ms	remaining: 1.05s
1:	learn: 0.6150075	total: 9.57ms	remaining: 948ms
2:	learn: 0.5905486	total: 14.6ms	remaining: 958ms
3:	learn: 0.5673251	total: 20.6ms	remaining: 1.01s
4:	learn: 0.5513991	total: 27ms	remaining: 1.05s
5:	learn: 0.5388097	total: 32.3ms	remaining: 1.04s
6:	learn: 0.5268624	total: 38.1ms	remaining: 1.05s
7:	learn: 0.5178134	total: 42.9ms	remaining: 1.03s
8:	learn: 0.5086848	total: 48.5ms	remaining: 1.03s
9:	learn: 0.5022213	total: 53.7ms	remaining: 1.02s
10:	learn: 0.4948445	total: 58.3ms	remaining: 1s
11:	learn: 0.4895657	total: 63.1ms	remaining: 988ms
12:	learn: 0.4848038	total: 67.7ms	remaining: 973ms
13:	learn: 0.4804124	total: 74.1ms	remaining: 984ms
14:	learn: 0.4772884	total: 79ms	remaining: 974ms
15:	learn: 0.4735255	total: 84.7ms	remaining: 974ms
16:	learn: 0.

174:	learn: 0.3317746	total: 964ms	remaining: 138ms
175:	learn: 0.3313249	total: 969ms	remaining: 132ms
176:	learn: 0.3309443	total: 975ms	remaining: 127ms
177:	learn: 0.3305703	total: 980ms	remaining: 121ms
178:	learn: 0.3300662	total: 986ms	remaining: 116ms
179:	learn: 0.3295780	total: 992ms	remaining: 110ms
180:	learn: 0.3292131	total: 997ms	remaining: 105ms
181:	learn: 0.3286437	total: 1s	remaining: 99.2ms
182:	learn: 0.3280712	total: 1.02s	remaining: 95.1ms
183:	learn: 0.3276584	total: 1.03s	remaining: 89.4ms
184:	learn: 0.3273563	total: 1.03s	remaining: 83.8ms
185:	learn: 0.3267568	total: 1.04s	remaining: 78.1ms
186:	learn: 0.3262571	total: 1.04s	remaining: 72.5ms
187:	learn: 0.3256867	total: 1.05s	remaining: 66.8ms
188:	learn: 0.3248911	total: 1.05s	remaining: 61.2ms
189:	learn: 0.3244311	total: 1.06s	remaining: 55.6ms
190:	learn: 0.3238441	total: 1.06s	remaining: 50ms
191:	learn: 0.3231907	total: 1.07s	remaining: 44.4ms
192:	learn: 0.3226501	total: 1.07s	remaining: 38.8ms
193:	

💾 Checkpoint saved.
2025-07-09 14:53:32.282442 ✔ Completed LightGBM (raw) for 90s (took 0.0s)

2025-07-09 14:53:32.282442 ▶ Starting CatBoost (raw) for 90s [Strategy: CV5]
0:	learn: 0.6608297	total: 29.2ms	remaining: 5.8s
1:	learn: 0.6307599	total: 33.4ms	remaining: 3.31s
2:	learn: 0.6049156	total: 38.9ms	remaining: 2.55s
3:	learn: 0.5800756	total: 44.2ms	remaining: 2.17s
4:	learn: 0.5606913	total: 49.7ms	remaining: 1.94s
5:	learn: 0.5435634	total: 54.4ms	remaining: 1.76s
6:	learn: 0.5295114	total: 57.9ms	remaining: 1.59s
7:	learn: 0.5148547	total: 62.7ms	remaining: 1.5s
8:	learn: 0.5026442	total: 66.6ms	remaining: 1.41s
9:	learn: 0.4905679	total: 71.2ms	remaining: 1.35s
10:	learn: 0.4806768	total: 75.8ms	remaining: 1.3s
11:	learn: 0.4713690	total: 79.8ms	remaining: 1.25s
12:	learn: 0.4638794	total: 83.8ms	remaining: 1.21s
13:	learn: 0.4554325	total: 88ms	remaining: 1.17s
14:	learn: 0.4480985	total: 92.2ms	remaining: 1.14s
15:	learn: 0.4410085	total: 97.1ms	remaining: 1.12s
16:	learn: 

194:	learn: 0.2941441	total: 965ms	remaining: 24.7ms
195:	learn: 0.2937607	total: 970ms	remaining: 19.8ms
196:	learn: 0.2935002	total: 975ms	remaining: 14.8ms
197:	learn: 0.2931689	total: 979ms	remaining: 9.88ms
198:	learn: 0.2928616	total: 984ms	remaining: 4.95ms
199:	learn: 0.2925887	total: 988ms	remaining: 0us
2025-07-09 14:53:33.363962    • Using built-in feature_importances_ for CatBoost
💾 Checkpoint saved.
2025-07-09 14:53:33.665932 ✔ Completed CatBoost (raw) for 90s (took 0.0s)

2025-07-09 14:53:33.673361 ▶ Starting RandomForest (raw) for 00s [Strategy: CV5]
2025-07-09 14:53:36.655929    • Using built-in feature_importances_ for RandomForest
💾 Checkpoint saved.
2025-07-09 14:53:36.962290 ✔ Completed RandomForest (raw) for 00s (took 0.0s)

2025-07-09 14:53:36.962290 ▶ Starting ExtraTrees (raw) for 00s [Strategy: CV5]
2025-07-09 14:53:38.552420    • Using built-in feature_importances_ for ExtraTrees
💾 Checkpoint saved.
2025-07-09 14:53:39.090804 ✔ Completed ExtraTrees (raw) for 00

💾 Checkpoint saved.
2025-07-09 14:54:30.698042 ✔ Completed LightGBM (raw) for 00s (took 0.0s)

2025-07-09 14:54:30.698042 ▶ Starting CatBoost (raw) for 00s [Strategy: CV5]
0:	learn: 0.6596351	total: 6.98ms	remaining: 1.39s
1:	learn: 0.6326966	total: 13.1ms	remaining: 1.3s
2:	learn: 0.6076119	total: 19.1ms	remaining: 1.25s
3:	learn: 0.5845933	total: 25.2ms	remaining: 1.23s
4:	learn: 0.5631776	total: 30.6ms	remaining: 1.2s
5:	learn: 0.5453357	total: 36.5ms	remaining: 1.18s
6:	learn: 0.5287642	total: 42.4ms	remaining: 1.17s
7:	learn: 0.5166055	total: 47.4ms	remaining: 1.14s
8:	learn: 0.5036855	total: 53.9ms	remaining: 1.14s
9:	learn: 0.4931987	total: 59.3ms	remaining: 1.13s
10:	learn: 0.4827881	total: 63.7ms	remaining: 1.09s
11:	learn: 0.4728037	total: 68.3ms	remaining: 1.07s
12:	learn: 0.4639648	total: 72.8ms	remaining: 1.05s
13:	learn: 0.4564013	total: 77.2ms	remaining: 1.02s
14:	learn: 0.4496395	total: 82.4ms	remaining: 1.02s
15:	learn: 0.4434278	total: 87.8ms	remaining: 1.01s
16:	lear

195:	learn: 0.2849270	total: 961ms	remaining: 19.6ms
196:	learn: 0.2844653	total: 965ms	remaining: 14.7ms
197:	learn: 0.2840347	total: 969ms	remaining: 9.79ms
198:	learn: 0.2836896	total: 974ms	remaining: 4.89ms
199:	learn: 0.2833700	total: 979ms	remaining: 0us
2025-07-09 14:54:31.763937    • Using built-in feature_importances_ for CatBoost
💾 Checkpoint saved.
2025-07-09 14:54:32.074053 ✔ Completed CatBoost (raw) for 00s (took 0.0s)

2025-07-09 14:54:32.082969 ▶ Starting RandomForest (raw) for 10s [Strategy: CV5]
2025-07-09 14:54:35.182793    • Using built-in feature_importances_ for RandomForest
💾 Checkpoint saved.
2025-07-09 14:54:35.513782 ✔ Completed RandomForest (raw) for 10s (took 0.0s)

2025-07-09 14:54:35.513782 ▶ Starting ExtraTrees (raw) for 10s [Strategy: CV5]
2025-07-09 14:54:37.730956    • Using built-in feature_importances_ for ExtraTrees
💾 Checkpoint saved.
2025-07-09 14:54:38.077815 ✔ Completed ExtraTrees (raw) for 10s (took 0.1s)

2025-07-09 14:54:38.077815 ▶ Starting 

💾 Checkpoint saved.
2025-07-09 14:55:31.361816 ✔ Completed LightGBM (raw) for 10s (took 0.0s)

2025-07-09 14:55:31.361816 ▶ Starting CatBoost (raw) for 10s [Strategy: CV5]
0:	learn: 0.6659181	total: 7.13ms	remaining: 1.42s
1:	learn: 0.6349384	total: 12.8ms	remaining: 1.26s
2:	learn: 0.6097384	total: 18.9ms	remaining: 1.24s
3:	learn: 0.5893985	total: 25ms	remaining: 1.22s
4:	learn: 0.5733413	total: 29.9ms	remaining: 1.17s
5:	learn: 0.5567765	total: 34.7ms	remaining: 1.12s
6:	learn: 0.5400394	total: 39.2ms	remaining: 1.08s
7:	learn: 0.5260598	total: 44.4ms	remaining: 1.06s
8:	learn: 0.5134171	total: 48.9ms	remaining: 1.04s
9:	learn: 0.5032709	total: 53.4ms	remaining: 1.01s
10:	learn: 0.4941083	total: 57.7ms	remaining: 992ms
11:	learn: 0.4861387	total: 61.9ms	remaining: 970ms
12:	learn: 0.4769323	total: 66.5ms	remaining: 956ms
13:	learn: 0.4699920	total: 70.9ms	remaining: 942ms
14:	learn: 0.4645830	total: 75.3ms	remaining: 928ms
15:	learn: 0.4584101	total: 79.6ms	remaining: 915ms
16:	lear

158:	learn: 0.3207158	total: 790ms	remaining: 204ms
159:	learn: 0.3203929	total: 795ms	remaining: 199ms
160:	learn: 0.3201154	total: 801ms	remaining: 194ms
161:	learn: 0.3197531	total: 806ms	remaining: 189ms
162:	learn: 0.3193324	total: 812ms	remaining: 184ms
163:	learn: 0.3188514	total: 817ms	remaining: 179ms
164:	learn: 0.3185089	total: 823ms	remaining: 175ms
165:	learn: 0.3181954	total: 829ms	remaining: 170ms
166:	learn: 0.3178905	total: 833ms	remaining: 165ms
167:	learn: 0.3175675	total: 838ms	remaining: 160ms
168:	learn: 0.3171919	total: 842ms	remaining: 155ms
169:	learn: 0.3167021	total: 846ms	remaining: 149ms
170:	learn: 0.3163962	total: 851ms	remaining: 144ms
171:	learn: 0.3160170	total: 855ms	remaining: 139ms
172:	learn: 0.3155356	total: 860ms	remaining: 134ms
173:	learn: 0.3153376	total: 865ms	remaining: 129ms
174:	learn: 0.3149773	total: 869ms	remaining: 124ms
175:	learn: 0.3147025	total: 874ms	remaining: 119ms
176:	learn: 0.3143427	total: 878ms	remaining: 114ms
177:	learn: 

💾 Checkpoint saved.
2025-07-09 14:57:23.464168 ✔ Completed LightGBM (raw) for 60s (took 0.0s)

2025-07-09 14:57:23.464168 ▶ Starting CatBoost (raw) for 60s [Strategy: CV10]
0:	learn: 0.6601379	total: 18.9ms	remaining: 3.76s
1:	learn: 0.6332726	total: 24.1ms	remaining: 2.38s
2:	learn: 0.6125302	total: 30.1ms	remaining: 1.97s
3:	learn: 0.5936747	total: 35.5ms	remaining: 1.74s
4:	learn: 0.5784559	total: 41.6ms	remaining: 1.62s
5:	learn: 0.5649885	total: 47.3ms	remaining: 1.53s
6:	learn: 0.5559960	total: 53.2ms	remaining: 1.47s
7:	learn: 0.5473908	total: 58.4ms	remaining: 1.4s
8:	learn: 0.5404374	total: 63.6ms	remaining: 1.35s
9:	learn: 0.5342858	total: 69.9ms	remaining: 1.33s
10:	learn: 0.5294068	total: 76.1ms	remaining: 1.31s
11:	learn: 0.5246463	total: 82.1ms	remaining: 1.28s
12:	learn: 0.5200977	total: 88ms	remaining: 1.26s
13:	learn: 0.5159464	total: 93.5ms	remaining: 1.24s
14:	learn: 0.5126841	total: 100ms	remaining: 1.23s
15:	learn: 0.5094143	total: 106ms	remaining: 1.22s
16:	learn:

168:	learn: 0.3855405	total: 965ms	remaining: 177ms
169:	learn: 0.3850230	total: 969ms	remaining: 171ms
170:	learn: 0.3844708	total: 976ms	remaining: 165ms
171:	learn: 0.3839859	total: 982ms	remaining: 160ms
172:	learn: 0.3836515	total: 988ms	remaining: 154ms
173:	learn: 0.3831626	total: 995ms	remaining: 149ms
174:	learn: 0.3825221	total: 1s	remaining: 143ms
175:	learn: 0.3821162	total: 1.01s	remaining: 137ms
176:	learn: 0.3815414	total: 1.01s	remaining: 131ms
177:	learn: 0.3809369	total: 1.02s	remaining: 126ms
178:	learn: 0.3804774	total: 1.02s	remaining: 120ms
179:	learn: 0.3799728	total: 1.03s	remaining: 114ms
180:	learn: 0.3796415	total: 1.03s	remaining: 108ms
181:	learn: 0.3791448	total: 1.04s	remaining: 103ms
182:	learn: 0.3784260	total: 1.04s	remaining: 96.9ms
183:	learn: 0.3778994	total: 1.05s	remaining: 91.2ms
184:	learn: 0.3775212	total: 1.05s	remaining: 85.4ms
185:	learn: 0.3771286	total: 1.06s	remaining: 79.7ms
186:	learn: 0.3766183	total: 1.06s	remaining: 74ms
187:	learn: 

💾 Checkpoint saved.
2025-07-09 14:59:04.493502 ✔ Completed LightGBM (raw) for 70s (took 0.0s)

2025-07-09 14:59:04.493502 ▶ Starting CatBoost (raw) for 70s [Strategy: CV10]
0:	learn: 0.6589696	total: 8.55ms	remaining: 1.7s
1:	learn: 0.6327485	total: 13.9ms	remaining: 1.38s
2:	learn: 0.6121604	total: 19ms	remaining: 1.25s
3:	learn: 0.5956490	total: 24.5ms	remaining: 1.2s
4:	learn: 0.5800557	total: 30.1ms	remaining: 1.17s
5:	learn: 0.5694093	total: 34.8ms	remaining: 1.13s
6:	learn: 0.5593082	total: 40.3ms	remaining: 1.11s
7:	learn: 0.5505620	total: 45.1ms	remaining: 1.08s
8:	learn: 0.5417803	total: 50.2ms	remaining: 1.06s
9:	learn: 0.5351542	total: 55.1ms	remaining: 1.05s
10:	learn: 0.5299465	total: 60.1ms	remaining: 1.03s
11:	learn: 0.5248109	total: 66ms	remaining: 1.03s
12:	learn: 0.5201762	total: 72ms	remaining: 1.03s
13:	learn: 0.5157142	total: 77.5ms	remaining: 1.03s
14:	learn: 0.5117580	total: 83.5ms	remaining: 1.03s
15:	learn: 0.5088400	total: 89.7ms	remaining: 1.03s
16:	learn: 0.

174:	learn: 0.3761223	total: 969ms	remaining: 138ms
175:	learn: 0.3754275	total: 975ms	remaining: 133ms
176:	learn: 0.3749211	total: 980ms	remaining: 127ms
177:	learn: 0.3745402	total: 985ms	remaining: 122ms
178:	learn: 0.3741115	total: 990ms	remaining: 116ms
179:	learn: 0.3734226	total: 994ms	remaining: 110ms
180:	learn: 0.3727709	total: 999ms	remaining: 105ms
181:	learn: 0.3721375	total: 1s	remaining: 99.3ms
182:	learn: 0.3717588	total: 1.01s	remaining: 93.7ms
183:	learn: 0.3713374	total: 1.01s	remaining: 88.2ms
184:	learn: 0.3708698	total: 1.02s	remaining: 82.6ms
185:	learn: 0.3702205	total: 1.02s	remaining: 77.1ms
186:	learn: 0.3697777	total: 1.03s	remaining: 71.5ms
187:	learn: 0.3691617	total: 1.03s	remaining: 66ms
188:	learn: 0.3685917	total: 1.04s	remaining: 60.5ms
189:	learn: 0.3682010	total: 1.04s	remaining: 55ms
190:	learn: 0.3675387	total: 1.05s	remaining: 49.5ms
191:	learn: 0.3670736	total: 1.06s	remaining: 44ms
192:	learn: 0.3664929	total: 1.06s	remaining: 38.5ms
193:	lear

💾 Checkpoint saved.
2025-07-09 15:00:25.528702 ✔ Completed LightGBM (raw) for 80s (took 0.0s)

2025-07-09 15:00:25.528702 ▶ Starting CatBoost (raw) for 80s [Strategy: CV10]
0:	learn: 0.6575415	total: 4.23ms	remaining: 842ms
1:	learn: 0.6332170	total: 9.05ms	remaining: 896ms
2:	learn: 0.6114710	total: 13.7ms	remaining: 899ms
3:	learn: 0.5957087	total: 17.8ms	remaining: 872ms
4:	learn: 0.5811073	total: 22.4ms	remaining: 876ms
5:	learn: 0.5700199	total: 27ms	remaining: 875ms
6:	learn: 0.5602376	total: 30ms	remaining: 827ms
7:	learn: 0.5509140	total: 33.6ms	remaining: 805ms
8:	learn: 0.5416350	total: 37ms	remaining: 785ms
9:	learn: 0.5339259	total: 40.6ms	remaining: 771ms
10:	learn: 0.5266463	total: 44.3ms	remaining: 762ms
11:	learn: 0.5214015	total: 48.1ms	remaining: 754ms
12:	learn: 0.5163900	total: 52ms	remaining: 749ms
13:	learn: 0.5134461	total: 56.8ms	remaining: 754ms
14:	learn: 0.5088769	total: 61.3ms	remaining: 756ms
15:	learn: 0.5047838	total: 64.3ms	remaining: 739ms
16:	learn: 0.

190:	learn: 0.3874158	total: 753ms	remaining: 35.5ms
191:	learn: 0.3871308	total: 758ms	remaining: 31.6ms
192:	learn: 0.3868770	total: 762ms	remaining: 27.6ms
193:	learn: 0.3867234	total: 766ms	remaining: 23.7ms
194:	learn: 0.3864578	total: 771ms	remaining: 19.8ms
195:	learn: 0.3860532	total: 775ms	remaining: 15.8ms
196:	learn: 0.3858603	total: 779ms	remaining: 11.9ms
197:	learn: 0.3855467	total: 783ms	remaining: 7.91ms
198:	learn: 0.3853302	total: 787ms	remaining: 3.96ms
199:	learn: 0.3850621	total: 791ms	remaining: 0us
2025-07-09 15:00:26.403073    • Using built-in feature_importances_ for CatBoost
💾 Checkpoint saved.
2025-07-09 15:00:26.724906 ✔ Completed CatBoost (raw) for 80s (took 0.0s)

2025-07-09 15:00:26.736967 ▶ Starting RandomForest (raw) for 90s [Strategy: CV10]
2025-07-09 15:00:30.464853    • Using built-in feature_importances_ for RandomForest
💾 Checkpoint saved.
2025-07-09 15:00:30.847906 ✔ Completed RandomForest (raw) for 90s (took 0.1s)

2025-07-09 15:00:30.847906 ▶ St

💾 Checkpoint saved.
2025-07-09 15:01:21.136407 ✔ Completed LightGBM (raw) for 90s (took 0.0s)

2025-07-09 15:01:21.136407 ▶ Starting CatBoost (raw) for 90s [Strategy: CV10]
0:	learn: 0.6413689	total: 4.04ms	remaining: 803ms
1:	learn: 0.6022099	total: 8.49ms	remaining: 841ms
2:	learn: 0.5721209	total: 14.3ms	remaining: 942ms
3:	learn: 0.5451338	total: 18.1ms	remaining: 888ms
4:	learn: 0.5281341	total: 21.2ms	remaining: 826ms
5:	learn: 0.5091635	total: 24.4ms	remaining: 790ms
6:	learn: 0.4940948	total: 27.9ms	remaining: 770ms
7:	learn: 0.4806187	total: 31.8ms	remaining: 763ms
8:	learn: 0.4692237	total: 35.2ms	remaining: 748ms
9:	learn: 0.4588731	total: 38.5ms	remaining: 731ms
10:	learn: 0.4499770	total: 42.6ms	remaining: 732ms
11:	learn: 0.4418435	total: 46.3ms	remaining: 725ms
12:	learn: 0.4355243	total: 49.7ms	remaining: 715ms
13:	learn: 0.4289861	total: 53.2ms	remaining: 707ms
14:	learn: 0.4238799	total: 56.9ms	remaining: 701ms
15:	learn: 0.4195975	total: 60ms	remaining: 690ms
16:	lea

💾 Checkpoint saved.
2025-07-09 15:01:22.296078 ✔ Completed CatBoost (raw) for 90s (took 0.0s)

2025-07-09 15:01:22.305339 ▶ Starting RandomForest (raw) for 00s [Strategy: CV10]
2025-07-09 15:01:28.138715    • Using built-in feature_importances_ for RandomForest
💾 Checkpoint saved.
2025-07-09 15:01:28.525410 ✔ Completed RandomForest (raw) for 00s (took 0.1s)

2025-07-09 15:01:28.525410 ▶ Starting ExtraTrees (raw) for 00s [Strategy: CV10]
2025-07-09 15:01:30.557709    • Using built-in feature_importances_ for ExtraTrees
💾 Checkpoint saved.
2025-07-09 15:01:30.950193 ✔ Completed ExtraTrees (raw) for 00s (took 0.1s)

2025-07-09 15:01:30.950193 ▶ Starting GradientBoosting (raw) for 00s [Strategy: CV10]
2025-07-09 15:01:35.185707    • Using built-in feature_importances_ for GradientBoosting
💾 Checkpoint saved.
2025-07-09 15:01:35.730960 ✔ Completed GradientBoosting (raw) for 00s (took 0.0s)

2025-07-09 15:01:35.730960 ▶ Starting HistGradientBoosting (raw) for 00s [Strategy: CV10]
2025-07-09 

💾 Checkpoint saved.
2025-07-09 15:02:15.081805 ✔ Completed LightGBM (raw) for 00s (took 0.0s)

2025-07-09 15:02:15.081805 ▶ Starting CatBoost (raw) for 00s [Strategy: CV10]
0:	learn: 0.6596351	total: 7.57ms	remaining: 1.51s
1:	learn: 0.6326966	total: 14.1ms	remaining: 1.39s
2:	learn: 0.6076119	total: 18.8ms	remaining: 1.24s
3:	learn: 0.5845933	total: 24.3ms	remaining: 1.19s
4:	learn: 0.5631776	total: 30.3ms	remaining: 1.18s
5:	learn: 0.5453357	total: 35.2ms	remaining: 1.14s
6:	learn: 0.5287642	total: 40.8ms	remaining: 1.13s
7:	learn: 0.5166055	total: 46ms	remaining: 1.1s
8:	learn: 0.5036855	total: 50.5ms	remaining: 1.07s
9:	learn: 0.4931987	total: 55.1ms	remaining: 1.05s
10:	learn: 0.4827881	total: 60.2ms	remaining: 1.03s
11:	learn: 0.4728037	total: 65.6ms	remaining: 1.03s
12:	learn: 0.4639648	total: 70.9ms	remaining: 1.02s
13:	learn: 0.4564013	total: 75.8ms	remaining: 1.01s
14:	learn: 0.4496395	total: 80.8ms	remaining: 997ms
15:	learn: 0.4434278	total: 85.2ms	remaining: 980ms
16:	lear

197:	learn: 0.2840347	total: 952ms	remaining: 9.61ms
198:	learn: 0.2836896	total: 957ms	remaining: 4.81ms
199:	learn: 0.2833700	total: 962ms	remaining: 0us
2025-07-09 15:02:16.136288    • Using built-in feature_importances_ for CatBoost
💾 Checkpoint saved.
2025-07-09 15:02:16.494174 ✔ Completed CatBoost (raw) for 00s (took 0.0s)

2025-07-09 15:02:16.503170 ▶ Starting RandomForest (raw) for 10s [Strategy: CV10]
2025-07-09 15:02:22.920080    • Using built-in feature_importances_ for RandomForest
💾 Checkpoint saved.
2025-07-09 15:02:23.328263 ✔ Completed RandomForest (raw) for 10s (took 0.1s)

2025-07-09 15:02:23.328263 ▶ Starting ExtraTrees (raw) for 10s [Strategy: CV10]
2025-07-09 15:02:25.831867    • Using built-in feature_importances_ for ExtraTrees
💾 Checkpoint saved.
2025-07-09 15:02:26.429167 ✔ Completed ExtraTrees (raw) for 10s (took 0.1s)

2025-07-09 15:02:26.429167 ▶ Starting GradientBoosting (raw) for 10s [Strategy: CV10]
2025-07-09 15:02:33.706463    • Using built-in feature_i

💾 Checkpoint saved.
2025-07-09 15:03:23.847692 ✔ Completed LightGBM (raw) for 10s (took 0.0s)

2025-07-09 15:03:23.847692 ▶ Starting CatBoost (raw) for 10s [Strategy: CV10]
0:	learn: 0.6659181	total: 6.16ms	remaining: 1.23s
1:	learn: 0.6349384	total: 12ms	remaining: 1.19s
2:	learn: 0.6097384	total: 17.9ms	remaining: 1.17s
3:	learn: 0.5893985	total: 23.3ms	remaining: 1.14s
4:	learn: 0.5733413	total: 27.9ms	remaining: 1.09s
5:	learn: 0.5567765	total: 32.7ms	remaining: 1.06s
6:	learn: 0.5400394	total: 37ms	remaining: 1.02s
7:	learn: 0.5260598	total: 41.5ms	remaining: 996ms
8:	learn: 0.5134171	total: 45.9ms	remaining: 974ms
9:	learn: 0.5032709	total: 50.7ms	remaining: 963ms
10:	learn: 0.4941083	total: 56.1ms	remaining: 963ms
11:	learn: 0.4861387	total: 61.5ms	remaining: 964ms
12:	learn: 0.4769323	total: 67.1ms	remaining: 965ms
13:	learn: 0.4699920	total: 72.5ms	remaining: 963ms
14:	learn: 0.4645830	total: 78.5ms	remaining: 969ms
15:	learn: 0.4584101	total: 84.4ms	remaining: 971ms
16:	learn

193:	learn: 0.3089296	total: 984ms	remaining: 30.4ms
194:	learn: 0.3085324	total: 990ms	remaining: 25.4ms
195:	learn: 0.3080833	total: 995ms	remaining: 20.3ms
196:	learn: 0.3077279	total: 1s	remaining: 15.2ms
197:	learn: 0.3074320	total: 1.01s	remaining: 10.2ms
198:	learn: 0.3070819	total: 1.01s	remaining: 5.09ms
199:	learn: 0.3066969	total: 1.02s	remaining: 0us
2025-07-09 15:03:24.960842    • Using built-in feature_importances_ for CatBoost
💾 Checkpoint saved.
2025-07-09 15:03:25.326522 ✔ Completed CatBoost (raw) for 10s (took 0.0s)

2025-07-09 15:03:25.326522 🏁 All feature importance calculations completed


In [38]:
for model_name in cv_results:
    print(f"\n🔍 Model: {model_name}")
    for scope_name in cv_results[model_name]:
        print(f"  📀 Scope: {scope_name}")
        for variant in cv_results[model_name][scope_name]:
            print(f"    🧪 Variant: {variant}")
            for cv_label in ['CV5', 'CV10']:
                result = cv_results[model_name][scope_name][variant].get(cv_label)
                if result is None:
                    print(f"      ❌ Missing best_params for {cv_label}")
                else:
                    best_params = result.get('best_params')
                    if not best_params:
                        print(f"      ⚠️ Empty best_params for {cv_label}")
                    else:
                        print(f"      ✅ {cv_label} best_params found")



🔍 Model: LogisticRegression
  📀 Scope: 60s
    🧪 Variant: preprocessed
      ✅ CV5 best_params found
      ✅ CV10 best_params found
    🧪 Variant: raw
      ✅ CV5 best_params found
      ✅ CV10 best_params found
  📀 Scope: 70s
    🧪 Variant: preprocessed
      ✅ CV5 best_params found
      ✅ CV10 best_params found
    🧪 Variant: raw
      ✅ CV5 best_params found
      ✅ CV10 best_params found
  📀 Scope: 80s
    🧪 Variant: preprocessed
      ✅ CV5 best_params found
      ✅ CV10 best_params found
    🧪 Variant: raw
      ✅ CV5 best_params found
      ✅ CV10 best_params found
  📀 Scope: 90s
    🧪 Variant: preprocessed
      ✅ CV5 best_params found
      ✅ CV10 best_params found
    🧪 Variant: raw
      ✅ CV5 best_params found
      ✅ CV10 best_params found
  📀 Scope: 00s
    🧪 Variant: preprocessed
      ✅ CV5 best_params found
      ✅ CV10 best_params found
    🧪 Variant: raw
      ✅ CV5 best_params found
      ✅ CV10 best_params found
  📀 Scope: 10s
    🧪 Variant: preprocessed
      ✅ 

In [None]:
# import json
# import os

# def load_best_params(strategy: str, variant: str, model_name: str, decade: str = None) -> dict:
#     """
#     Load best parameters for a given strategy, variant, model, and optional decade.
#     - strategy: '70/30', '80/20', 'CV5', or 'CV10'
#     - variant: 'raw' or 'preprocessed'
#     - model_name: model key (e.g., 'RandomForest')
#     - decade: decade name (e.g., '1990s'); if None, loads overall
#     """
#     base_dir = "best_params"
#     subfolder = "per_decade" if decade else "overall"
#     scope = decade if decade else variant  # path logic
    
#     file_path = os.path.join(
#         base_dir, subfolder, strategy.replace('/', ''),  # e.g., '8020'
#         decade if decade else "",  # '1990s' or ''
#         f"{variant}.json"
#     )

#     try:
#         with open(file_path, "r") as f:
#             params = json.load(f)
#             return params.get(model_name, {})
#     except FileNotFoundError:
#         print(f"❌ File not found: {file_path}")
#         return {}
#     except Exception as e:
#         print(f"❌ Failed to load best params from {file_path}: {e}")
#         return {}


In [46]:
output_dir = "feature_importances"
os.makedirs(output_dir, exist_ok=True)


# Convert importance_results to DataFrames
importance_dfs = {
    'overall': {variant: {} for variant in variant_labels},
    'decades': {
        decade: {
            variant: {
                strategy: None for strategy in strategies
            } for variant in variant_labels
        } for decade in importance_results['decades']
    }
}

# === OVERALL ===
for variant in importance_results['overall']:
    for strategy, model_dict in importance_results['overall'][variant].items():
        df = pd.DataFrame(model_dict)
        importance_dfs['overall'][variant][strategy] = df

# === PER-DECADE ===
for decade in importance_results['decades']:
    for variant in importance_results['decades'][decade]:
        for strategy, model_dict in importance_results['decades'][decade][variant].items():
            df = pd.DataFrame(model_dict)
            importance_dfs['decades'][decade][variant][strategy] = df

# === SAVE OVERALL ===
for variant in importance_dfs['overall']:
    for strategy, df in importance_dfs['overall'][variant].items():
        if strategy == 'CV10' and 'SVC' in df.columns:
            df = df.drop(columns=['SVC'])
            print(f"⚠️ Skipped SVC in CV10 for {variant}")
        if variant == 'preprocessed' and 'CatBoost' in df.columns:
            df = df.drop(columns=['CatBoost'])
            print(f"⚠️ Skipped CatBoost (only works with raw) for {variant}")

        file_path = os.path.join(output_dir, f"feature_importance_overall_{variant}_{strategy}.csv")
        os.makedirs(os.path.dirname(file_path), exist_ok=True)  # <- this line fixes the error
        df.to_csv(file_path)
        print(f"✅ Saved: {file_path}")
# === SAVE PER-DECADE ===
# === SAVE PER-DECADE ===
for decade in importance_dfs['decades']:
    for variant in importance_dfs['decades'][decade]:
        for strategy, df in importance_dfs['decades'][decade][variant].items():
            if strategy == 'CV10' and 'SVC' in df.columns:
                df = df.drop(columns=['SVC'])
                print(f"⚠️ Skipped SVC in CV10 for {variant} ({decade})")
            if variant == 'preprocessed' and 'CatBoost' in df.columns:
                df = df.drop(columns=['CatBoost'])
                print(f"⚠️ Skipped CatBoost (only works with raw) for {variant} ({decade})")

            # Ensure full path exists
            csv_path = os.path.join(output_dir, "per_decade", decade, variant)
            os.makedirs(csv_path, exist_ok=True)

            df_path = os.path.join(csv_path, f"{strategy}.csv")
            os.makedirs(os.path.dirname(df_path), exist_ok=True)  # Just to be extra safe
            df.to_csv(df_path)
            print(f"✅ Saved: {df_path}")



✅ Saved: feature_importances\feature_importance_overall_raw_70/30.csv
✅ Saved: feature_importances\feature_importance_overall_raw_80/20.csv
✅ Saved: feature_importances\feature_importance_overall_raw_CV5.csv
✅ Saved: feature_importances\feature_importance_overall_raw_CV10.csv
⚠️ Skipped CatBoost (only works with raw) for preprocessed
✅ Saved: feature_importances\feature_importance_overall_preprocessed_70/30.csv
⚠️ Skipped CatBoost (only works with raw) for preprocessed
✅ Saved: feature_importances\feature_importance_overall_preprocessed_80/20.csv
⚠️ Skipped CatBoost (only works with raw) for preprocessed
✅ Saved: feature_importances\feature_importance_overall_preprocessed_CV5.csv
⚠️ Skipped CatBoost (only works with raw) for preprocessed
✅ Saved: feature_importances\feature_importance_overall_preprocessed_CV10.csv
✅ Saved: feature_importances\per_decade\60s\raw\70/30.csv
✅ Saved: feature_importances\per_decade\60s\raw\80/20.csv
✅ Saved: feature_importances\per_decade\60s\raw\CV5.csv
⚠️

In [48]:
import os
import matplotlib.pyplot as plt

def save_feature_importance_plots(importance_dfs, output_dir="feature_importance_plots"):
    os.makedirs(output_dir, exist_ok=True)

    # === OVERALL PLOTS ===
    if 'overall' in importance_dfs:
        for variant in importance_dfs['overall']:
            variant_dir = os.path.join(output_dir, "overall", variant)
            os.makedirs(variant_dir, exist_ok=True)

            for strategy, df in importance_dfs['overall'][variant].items():
                if df is None or df.empty:
                    continue

                for model_name in df.columns:
                    if strategy == "CV10" and model_name == "SVC":
                        print(f"🛑 Skipping plot for SVC in CV10 ({variant})")
                        continue
                    if variant == "preprocessed" and model_name == "CatBoost":
                        print(f"🛑 Skipping plot for CatBoost (only raw supported) in ({variant})")
                        continue

                    plt.figure(figsize=(10, 4))
                    df[model_name].sort_values(ascending=False).plot(kind='bar')
                    plt.title(f"{model_name} - Overall ({variant}, {strategy})")
                    plt.ylabel("Normalized Importance")
                    plt.tight_layout()

                    path = os.path.join(variant_dir, f"{model_name}_{strategy}.png")
                    os.makedirs(os.path.dirname(path), exist_ok=True)
                    plt.savefig(path)
                    plt.close()

    # === PER-DECADE PLOTS ===
    if 'decades' in importance_dfs:
        for decade in importance_dfs['decades']:
            for variant in importance_dfs['decades'][decade]:
                for strategy, df in importance_dfs['decades'][decade][variant].items():
                    if df is None or df.empty:
                        continue

                    decade_variant_dir = os.path.join(output_dir, "per_decade", decade, variant, strategy)
                    os.makedirs(decade_variant_dir, exist_ok=True)

                    for model_name in df.columns:
                        if strategy == "CV10" and model_name == "SVC":
                            print(f"🛑 Skipping plot for SVC in CV10 ({decade}, {variant})")
                            continue
                        if variant == "preprocessed" and model_name == "CatBoost":
                            print(f"🛑 Skipping plot for CatBoost (only raw supported) in ({decade}, {variant})")
                            continue

                        plt.figure(figsize=(10, 4))
                        df[model_name].sort_values(ascending=False).plot(kind='bar')
                        plt.title(f"{model_name} - {decade} ({variant}, {strategy})")
                        plt.ylabel("Normalized Importance")
                        plt.tight_layout()

                        path = os.path.join(decade_variant_dir, f"{model_name}.png")
                        os.makedirs(os.path.dirname(path), exist_ok=True)
                        plt.savefig(path)
                        plt.close()

                        
                        
                        
                        
                        
                        
                        
                        
                        
                        
                        
                        
                        
                        
                        
                        
                        
                        
                        
                        
                        
# 🚀 Call the function
save_feature_importance_plots(importance_dfs)
print("✅ All feature importance plots saved to folder 'feature_importance_plots'")


🛑 Skipping plot for CatBoost (only raw supported) in (preprocessed)
🛑 Skipping plot for CatBoost (only raw supported) in (preprocessed)
🛑 Skipping plot for CatBoost (only raw supported) in (preprocessed)
🛑 Skipping plot for CatBoost (only raw supported) in (preprocessed)
🛑 Skipping plot for SVC in CV10 (60s, raw)
🛑 Skipping plot for CatBoost (only raw supported) in (60s, preprocessed)
🛑 Skipping plot for CatBoost (only raw supported) in (60s, preprocessed)
🛑 Skipping plot for SVC in CV10 (70s, raw)
🛑 Skipping plot for CatBoost (only raw supported) in (70s, preprocessed)
🛑 Skipping plot for CatBoost (only raw supported) in (70s, preprocessed)
🛑 Skipping plot for SVC in CV10 (80s, raw)
🛑 Skipping plot for CatBoost (only raw supported) in (80s, preprocessed)
🛑 Skipping plot for CatBoost (only raw supported) in (80s, preprocessed)
🛑 Skipping plot for SVC in CV10 (90s, raw)
🛑 Skipping plot for CatBoost (only raw supported) in (90s, preprocessed)
🛑 Skipping plot for CatBoost (only raw suppor

In [None]:
# output_dir = "./saved_results_light_xg"
# os.makedirs(output_dir, exist_ok=True)

# with open(os.path.join(output_dir, "per_decade_and_overall_results.pkl"), "wb") as f:
#     pickle.dump({
#         "results": results,
#         "cv_results": cv_results
#     }, f)

# with open(os.path.join(output_dir, "best_params_per_decade.pkl"), "wb") as f:
#     pickle.dump(best_params_per_decade, f)

# with open(os.path.join(output_dir, "best_params_overall.pkl"), "wb") as f:
#     pickle.dump(best_params_overall, f)

# best_params_80_20.to_csv(os.path.join(output_dir, "best_params_80_20.csv"))
# best_params_70_30.to_csv(os.path.join(output_dir, "best_params_70_30.csv"))
# best_params_cv5.to_csv(os.path.join(output_dir, "best_params_cv5.csv"))
# best_params_cv10.to_csv(os.path.join(output_dir, "best_params_cv10.csv"))

# importance_df_all.to_csv(os.path.join(output_dir, "feature_importances_all_models.csv"))

# print(f"All results saved under “{output_dir}/”")

In [None]:


# plot_output_dir = os.path.join(output_dir, "plots")
# os.makedirs(plot_output_dir, exist_ok=True)

# avg_importance_all_models = importance_dfs['overall'].mean(axis=1).sort_values(ascending=False)

# plt.figure(figsize=(10, 4))
# avg_importance_all_models.plot(kind='bar')
# plt.title("Average Feature Importance (All Models Combined)")
# plt.ylabel("Normalized Importance")
# plt.tight_layout()
# plt.savefig(os.path.join(plot_output_dir, "avg_importance_all_models.png"))
# plt.show()

# top_n = 3  # 5?
# mean_f1_scores = f1_80_20.mean(axis=0).sort_values(ascending=False)
# top_models = mean_f1_scores.head(top_n).index.tolist()

# print(f"Top {top_n} performing models based on mean F1 (80/20): {top_models}")

# for model in top_models:
#     importance_series = importance_dfs['overall'][model].sort_values(ascending=False)
#     plt.figure(figsize=(10, 4))
#     importance_series.plot(kind='bar')
#     plt.title(f"Feature Importance - Top Model: {model}")
#     plt.ylabel("Normalized Importance")
#     plt.tight_layout()
#     plt.savefig(os.path.join(plot_output_dir, f"importance_top_model_{model}.png"))
#     plt.show()

# avg_importance_all_models.to_csv(os.path.join(plot_output_dir, "avg_importance_all_models.csv"))

# print("✅ Feature importance plots and CSVs saved successfully.")
