In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold
from category_encoders import TargetEncoder
import numpy as np
import category_encoders as ce

# Load the cleaned dataset (path is relative to the notebook's working directory).
data = pd.read_csv('clean_data.csv')

# Age brackets to keep, listed youngest -> oldest. The list order also fixes the
# integer codes produced by the OrdinalEncoder below (0 = '10-14 years', ...).
desired_age_groups = [
    '10-14 years',
    '15-19 years',
    '20-24 years',
    '25-29 years',
    '30-34 years',
    '35-39 years',
    '40-44 years',
    '45-49 years',
    '50-54 years',
    '55-59 years',
    '60-64 years',
    '65-69 years',
    '70-74 years',
    '75-79 years',
    '80+ years'
]

# Keep only rows whose age group is in the list above. The explicit .copy()
# makes `data` an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning (assignment into a filtered slice).
data = data[data['age_group'].isin(desired_age_groups)].copy()

# Encode 'gender' as integer labels.
le_sex = LabelEncoder()
data['gender_enc'] = le_sex.fit_transform(data['gender'])

# Encode 'age_group' as an ordered integer, following desired_age_groups.
ord_age = OrdinalEncoder(categories=[desired_age_groups])
data['age_enc'] = ord_age.fit_transform(data[['age_group']]).astype(int)

# Drop the raw categorical columns that now have encoded counterparts,
# together with 'country_code'.
data = data.drop(['gender', 'age_group', 'country_code'], axis=1)

# Define features (X) and target (y)
y = data['suicide_rate/100k']
X = data.drop(['suicide_rate/100k'], axis=1)

# Split into training (80%) and test (20%) sets; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# ---------------------- Target Encoding with K-Fold CV ----------------------
# Out-of-fold scheme: each training row gets its country encoding from an
# encoder that was fitted on the other folds only.

# (a) NaN-initialised holder, indexed like X_train, filled one fold at a time
country_te_train = pd.Series(np.nan, index=X_train.index, dtype=float)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

for tr_idx, val_idx in kf.split(X_train):
    X_tr = X_train.iloc[tr_idx]
    X_val = X_train.iloc[val_idx]
    y_tr = y_train.iloc[tr_idx]

    # Fit on this split's training part, then encode the held-out rows
    te_fold = ce.TargetEncoder(cols=['country_name'])
    te_fold.fit(X_tr[['country_name']], y_tr)
    val_encoded = te_fold.transform(X_val[['country_name']])

    # Positional write: val_idx are row positions within X_train
    country_te_train.iloc[val_idx] = val_encoded['country_name'].to_numpy()

# (b) The test set is encoded by a single encoder fitted on the full training set
final_te = ce.TargetEncoder(cols=['country_name'])
final_te.fit(X_train[['country_name']], y_train)
test_encoded = final_te.transform(X_test[['country_name']])
country_te_test = test_encoded['country_name'].values

# (c) Replace the raw 'country_name' column with its encoded counterpart.
#     The train values are a Series (aligned on index); the test values are a
#     plain array (assigned by position).
X_train_enc = X_train.drop(columns=['country_name']).assign(country_te=country_te_train)
X_test_enc = X_test.drop(columns=['country_name']).assign(country_te=country_te_test)


In [3]:
import os
import pickle
import random
import time
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn_genetic import GAFeatureSelectionCV
from sklearn.preprocessing import StandardScaler

# Reproducibility: seed both Python's and NumPy's global random generators
for seed_fn in (random.seed, np.random.seed):
    seed_fn(42)

# === Load or initialize checkpoint ===
checkpoint_path = "ga_feature_selection_checkpoint_table2.pkl"

# Start from an empty state; replaced below when a checkpoint file is present.
completed_models = set()
cv_results, test_results = [], []

if os.path.exists(checkpoint_path):
    # NOTE(review): pickle.load executes arbitrary code from the file; this
    # assumes the checkpoint was written by this notebook - confirm before
    # loading a file from anywhere else.
    with open(checkpoint_path, "rb") as f:
        saved = pickle.load(f)
    completed_models, cv_results, test_results = (
        saved[key] for key in ('completed', 'cv_results', 'test_results')
    )
    print(f"Checkpoint loaded. Completed models: {list(completed_models)}")

# === Data ===
# Work on copies so the encoded training frames from the previous cell stay intact.
X_all, y_all = X_train_enc.copy(), y_train.copy()
kf_outer = KFold(n_splits=10, shuffle=True, random_state=42)

# === Define models ===
def _scaled(estimator):
    """Return a pipeline that standardises the features before `estimator`."""
    return make_pipeline(StandardScaler(), estimator)


# Insertion order is the order in which the models are processed later on.
# The two tree-based models are used as-is; every other model is preceded by
# a StandardScaler.
models = {
    'KNN': _scaled(KNeighborsRegressor()),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'MLP': _scaled(MLPRegressor(random_state=42, max_iter=500)),
    'Linear Regression': _scaled(LinearRegression()),
    'Ridge Regression': _scaled(Ridge(random_state=42)),
    'Poly SVM': _scaled(SVR(kernel='poly', degree=3, C=1, gamma='scale')),
    'Linear SVM': _scaled(LinearSVR(C=1.0, epsilon=0.1, max_iter=10000, random_state=42)),
    'RBF SVM': _scaled(SVR(kernel='rbf', C=1.0, gamma='scale')),
}

# === Run loop ===
# For every model not yet in the checkpoint:
#   1. run GA feature selection on the full training set,
#   2. cross-validate the model on the selected features,
#   3. refit on the full training set and score on the held-out test set,
#   4. persist the accumulated results so an interrupted run can resume.

# GA settings shared by every model.
ga_common_params = dict(
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2,
    elitism=True,
    crossover_probability=0.8,
    mutation_probability=0.2,
    tournament_size=3,
)
# Search budget: the default, plus lighter per-model overrides for faster runs.
ga_default_budget = dict(cv=10, population_size=10, generations=15, keep_top_k=2)
ga_budget_overrides = {
    'Poly SVM': dict(cv=5, population_size=6, generations=8, keep_top_k=1),
}

# Taken from the splitter so the progress message cannot drift from the
# actual number of folds.
n_outer_folds = kf_outer.get_n_splits()

for name, model in models.items():
    if name in completed_models:
        print(f"\n>>> Skipping {name}, already completed.")
        continue

    print(f"\n=== Starting GA feature selection for {name} ===")
    start_time = time.time()

    ga_budget = ga_budget_overrides.get(name, ga_default_budget)
    selector = GAFeatureSelectionCV(
        estimator=clone(model),  # the GA works on its own unfitted copy
        **ga_common_params,
        **ga_budget,
    )

    print("-> Fitting selector... (this may take a while)")
    try:
        selector.fit(X_all, y_all)
    except Exception as e:
        # Best effort: report the failure and move on to the next model. The
        # model is not marked as completed, so a later run will retry it.
        print(f"!!! Error fitting GA for {name}: {e}")
        continue

    print(f"-> GA complete for {name}")
    selected = X_all.columns[selector.support_]
    print(f"Selected {len(selected)} features for {name}: {list(selected)}")

    # === Nested CV ===
    # NOTE(review): the feature subset above was selected using all of X_all,
    # so these folds are not independent of the selection step and the CV
    # scores may be optimistic - confirm this is acceptable for the report.
    maes, mses, rmses = [], [], []
    print(f"-> Running nested CV for {name}")
    for i, (train_idx, val_idx) in enumerate(kf_outer.split(X_all), 1):
        X_tr, X_val = X_all.iloc[train_idx][selected], X_all.iloc[val_idx][selected]
        y_tr, y_val = y_all.iloc[train_idx], y_all.iloc[val_idx]
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        maes.append(mean_absolute_error(y_val, y_pred))
        mses.append(mean_squared_error(y_val, y_pred))
        rmses.append(np.sqrt(mses[-1]))
        print(f"  Fold {i}/{n_outer_folds}: RMSE = {rmses[-1]:.4f}")

    cv_results.append({'Model': name, 'MAE': np.mean(maes), 'MSE': np.mean(mses), 'RMSE': np.mean(rmses)})

    # === Test evaluation ===
    # Refit on the whole training set (selected features only), then score on
    # the held-out test set. `model` is left fitted on this final training run.
    print(f"-> Evaluating on test set for {name}")
    X_test_sel = X_test_enc[selected]
    model.fit(X_all[selected], y_all)
    y_test_pred = model.predict(X_test_sel)
    test_mse = mean_squared_error(y_test, y_test_pred)  # computed once, reused for RMSE
    test_results.append({
        'Model': name,
        'Test MAE': mean_absolute_error(y_test, y_test_pred),
        'Test MSE': test_mse,
        'Test RMSE': np.sqrt(test_mse)
    })
    print(f"=== Finished {name}: Test RMSE = {test_results[-1]['Test RMSE']:.4f} ===")

    # Print model duration
    elapsed = time.time() - start_time
    print(f"--- {name} took {elapsed/60:.2f} minutes ---")

    # === Save checkpoint ===
    # Write to a temporary file first and swap it in with os.replace, so an
    # interruption in the middle of the dump cannot leave a truncated
    # checkpoint behind and lose the results of earlier models.
    completed_models.add(name)
    tmp_checkpoint_path = checkpoint_path + ".tmp"
    with open(tmp_checkpoint_path, "wb") as f:
        pickle.dump({
            'completed': completed_models,
            'cv_results': cv_results,
            'test_results': test_results
        }, f)
    os.replace(tmp_checkpoint_path, checkpoint_path)
    print(f"Checkpoint saved after {name}")

# === Final summary ===
# Turn the accumulated per-model result dicts into tables and print both.
df_cv, df_test = pd.DataFrame(cv_results), pd.DataFrame(test_results)

print("\n=== All Models Completed ===")
for heading, frame in (
    ("\nCross-Validation Results:", df_cv),
    ("\nTest Set Results:", df_test),
):
    print(heading)
    print(frame)



=== Starting GA feature selection for KNN ===
-> Fitting selector... (this may take a while)
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	10    	-17.1814	8.04748    	-3.06962   	-25.0366   
1  	20    	-9.2717 	6.50146    	-3.18867   	-18.8587   
2  	20    	-3.94449	1.49743    	-3.05978   	-8.27103   
3  	20    	-3.11938	0.335554   	-2.73579   	-3.98973   
4  	20    	-3.01003	0.357997   	-2.73579   	-3.98973   
5  	20    	-2.82121	0.11627    	-2.73579   	-3.05978   
6  	20    	-2.78075	0.0948761  	-2.73579   	-3.05978   
7  	20    	-2.73997	0.0125591  	-2.73579   	-2.77765   
8  	20    	-2.73579	4.44089e-16	-2.73579   	-2.73579   
9  	20    	-2.73579	4.44089e-16	-2.73579   	-2.73579   
10 	20    	-2.73579	4.44089e-16	-2.73579   	-2.73579   
11 	20    	-2.73579	4.44089e-16	-2.73579   	-2.73579   
12 	20    	-2.73579	4.44089e-16	-2.73579   	-2.73579   
13 	20    	-2.73579	4.44089e-16	-2.73579   	-2.73579   
14 	20    	-2.73579	4.44089e-16	-2.73579   	-2.73579   
15 	20    



-> GA complete for MLP
Selected 10 features for MLP: ['year', 'alcohol_consumption_per_capita', 'depression_rate', 'gdp_per_capita', 'HDI', 'life_expectancy', 'unemployment_rate', 'gender_enc', 'age_enc', 'country_te']
-> Running nested CV for MLP




  Fold 1/10: RMSE = 5.3199




  Fold 2/10: RMSE = 5.3079




  Fold 3/10: RMSE = 5.1837




  Fold 4/10: RMSE = 5.1030




  Fold 5/10: RMSE = 5.1552




  Fold 6/10: RMSE = 5.1393
  Fold 7/10: RMSE = 5.2329




  Fold 8/10: RMSE = 5.3024




  Fold 9/10: RMSE = 5.1643




  Fold 10/10: RMSE = 5.1216
-> Evaluating on test set for MLP




=== Finished MLP: Test RMSE = 5.2043 ===
--- MLP took 61.14 minutes ---
Checkpoint saved after MLP

=== Starting GA feature selection for Linear Regression ===
-> Fitting selector... (this may take a while)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	10    	-20.39 	2.16297    	-16.9035   	-23.7781   
1  	20    	-18.1806	1.2938     	-16.902    	-19.7899   
2  	20    	-17.1208	0.651377   	-16.902    	-19.0749   
3  	20    	-16.902 	0.00131921 	-16.8997   	-16.9035   
4  	20    	-16.9007	0.00141897 	-16.8994   	-16.9034   
5  	20    	-16.8999	0.000926586	-16.8993   	-16.902    
6  	20    	-16.8896	0.0196564  	-16.8503   	-16.8997   
7  	20    	-16.8649	0.0225405  	-16.8499   	-16.8994   
8  	20    	-16.85  	0.000172649	-16.8499   	-16.8504   
9  	20    	-16.8548	0.0147532  	-16.8499   	-16.8991   
10 	20    	-16.8548	0.0147593  	-16.8498   	-16.8991   
11 	20    	-16.8499	9.12439e-05	-16.8497   	-16.8499   
12 	20    	-16.8498	0.000106518	-16.8497   	-16.8499   
13 	20    	