In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold
from category_encoders import TargetEncoder
import numpy as np
import category_encoders as ce

# Load the cleaned dataset (path is relative to the notebook's working directory).
data = pd.read_csv('clean_data.csv')

# Age bands to keep, listed from youngest to oldest. The list order is also the
# category order handed to the OrdinalEncoder below, so position 0 is
# '10-14 years' and position 14 is '80+ years'.
desired_age_groups = [
    '10-14 years',
    '15-19 years',
    '20-24 years',
    '25-29 years',
    '30-34 years',
    '35-39 years',
    '40-44 years',
    '45-49 years',
    '50-54 years',
    '55-59 years',
    '60-64 years',
    '65-69 years',
    '70-74 years',
    '75-79 years',
    '80+ years'
]

# Keep only rows whose age band is in the list above. The explicit .copy()
# makes `data` an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning on the filtered result.
data = data[data['age_group'].isin(desired_age_groups)].copy()

# Encode 'gender' as integer labels.
le_sex = LabelEncoder()
data['gender_enc'] = le_sex.fit_transform(data['gender'])

# Encode 'age_group' as its position in `desired_age_groups`.
ord_age = OrdinalEncoder(categories=[desired_age_groups])
data['age_enc'] = ord_age.fit_transform(data[['age_group']]).astype(int)

# Drop the raw categorical columns that now have encoded counterparts, plus
# 'country_code'.
data = data.drop(['gender', 'age_group', 'country_code'], axis=1)

# Define features (X) and target (y)
y = data['suicide_rate/100k']
X = data.drop(['suicide_rate/100k'], axis=1)

# Split into training (80%) and test (20%) sets; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# ---------------------- Target Encoding with K-Fold CV ----------------------
# 'country_name' is replaced by a target-encoded column. Training rows get
# out-of-fold encodings (each row is encoded by an encoder that never saw that
# row's target); test rows are encoded by one encoder fitted on the whole
# training split.
COUNTRY_COL = 'country_name'

# (a) Out-of-fold encodings for the training split
kf = KFold(n_splits=10, shuffle=True, random_state=42)
country_te_train = pd.Series(index=X_train.index, dtype=float)

for fit_pos, holdout_pos in kf.split(X_train):
    # Fit on the nine "in" folds only ...
    te_fold = ce.TargetEncoder(cols=[COUNTRY_COL])
    te_fold.fit(X_train.iloc[fit_pos][[COUNTRY_COL]], y_train.iloc[fit_pos])

    # ... and write the encodings of the held-out fold into place (positional).
    holdout_encoded = te_fold.transform(X_train.iloc[holdout_pos][[COUNTRY_COL]])
    country_te_train.iloc[holdout_pos] = holdout_encoded[COUNTRY_COL].values

# (b) One encoder fitted on *all* training rows, used for the test split
final_te = ce.TargetEncoder(cols=[COUNTRY_COL])
final_te.fit(X_train[[COUNTRY_COL]], y_train)
test_encoded = final_te.transform(X_test[[COUNTRY_COL]])
country_te_test = test_encoded[COUNTRY_COL].values

# (c) Swap the raw 'country_name' column for the encoded 'country_te' column
X_train_enc = X_train.drop(columns=[COUNTRY_COL]).assign(country_te=country_te_train)
X_test_enc = X_test.drop(columns=[COUNTRY_COL]).assign(country_te=country_te_test)


In [2]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import KFold, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler

# Template PCA configuration: keep as many components as needed to explain 95%
# of the variance. It is never fitted itself; every pipeline receives its own
# unfitted copy (see _with_scaling_and_pca).
pca = PCA(n_components=0.95)


def _with_scaling_and_pca(estimator):
    """Wrap `estimator` in a StandardScaler -> PCA -> estimator pipeline.

    make_pipeline stores the step objects it is given without cloning them, so
    passing the same PCA instance to several pipelines would make them share
    one fitted state: fitting any pipeline would silently overwrite the PCA
    used by all the others. A fresh PCA with the template's parameters is
    therefore built for each pipeline.

    Parameters
    ----------
    estimator : scikit-learn regressor
        Final step of the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with independent scaler and PCA steps.
    """
    return make_pipeline(StandardScaler(), PCA(**pca.get_params()), estimator)


# Define models: every candidate regressor shares the same preprocessing recipe.
models = {
    'KNN': _with_scaling_and_pca(KNeighborsRegressor()),
    'Random Forest': _with_scaling_and_pca(RandomForestRegressor(random_state=42)),
    'Decision Tree': _with_scaling_and_pca(DecisionTreeRegressor(random_state=42)),
    'MLP': _with_scaling_and_pca(MLPRegressor(random_state=42, max_iter=500)),
    'Linear Regression': _with_scaling_and_pca(LinearRegression()),
    'Ridge Regression': _with_scaling_and_pca(Ridge(random_state=42)),
    'Poly SVM': _with_scaling_and_pca(SVR(kernel='poly', degree=3, C=1, gamma='scale')),
    'Linear SVM': _with_scaling_and_pca(LinearSVR(C=1.0, epsilon=0.1, max_iter=10000, random_state=42)),
    'RBF SVM': _with_scaling_and_pca(SVR(kernel='rbf', C=1.0, gamma='scale')),
}

# 10-fold cross-validation of every pipeline on the training split.
# Each pipeline is refitted from scratch on nine folds and scored on the tenth;
# the per-fold MAE / MSE / RMSE are averaged into one row per model.
print("Starting 10-Fold cross-validation with PCA...")
kf_cv = KFold(n_splits=10, shuffle=True, random_state=42)
results = []
for name, model in models.items():
    print(f"\n=== Cross-validating {name} ===")
    # Per-fold scores, keyed by the column name used in the summary table.
    fold_scores = {'MAE': [], 'MSE': [], 'RMSE': []}
    fold = 0
    for train_idx, val_idx in kf_cv.split(X_train_enc):
        fold += 1
        print(f"Training fold {fold}/10 for {name}...")
        model.fit(X_train_enc.iloc[train_idx], y_train.iloc[train_idx])

        y_val = y_train.iloc[val_idx]
        y_pred = model.predict(X_train_enc.iloc[val_idx])

        mae = mean_absolute_error(y_val, y_pred)
        mse = mean_squared_error(y_val, y_pred)
        rmse = np.sqrt(mse)
        for metric, value in zip(('MAE', 'MSE', 'RMSE'), (mae, mse, rmse)):
            fold_scores[metric].append(value)
        print(f"Fold {fold} - MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}")

    # Average over the folds once and reuse the values for the table and the log line.
    mean_scores = {metric: np.mean(values) for metric, values in fold_scores.items()}
    results.append({'Model': name, **mean_scores})
    print(f"{name} Mean -> MAE: {mean_scores['MAE']:.4f}, MSE: {mean_scores['MSE']:.4f}, RMSE: {mean_scores['RMSE']:.4f}\n")

# Summarize results in a table (one row per model, indexed by model name)
df_results = pd.DataFrame(results).set_index('Model')
print("=== 10-Fold CV Results with PCA ===")
print(df_results)


Starting 10-Fold cross-validation with PCA...

=== Cross-validating KNN ===
Training fold 1/10 for KNN...
Fold 1 - MAE: 1.8758, MSE: 18.7295, RMSE: 4.3278
Training fold 2/10 for KNN...
Fold 2 - MAE: 1.8218, MSE: 16.6161, RMSE: 4.0763
Training fold 3/10 for KNN...
Fold 3 - MAE: 1.8416, MSE: 17.5514, RMSE: 4.1894
Training fold 4/10 for KNN...
Fold 4 - MAE: 1.8735, MSE: 18.1141, RMSE: 4.2561
Training fold 5/10 for KNN...
Fold 5 - MAE: 1.8417, MSE: 17.5317, RMSE: 4.1871
Training fold 6/10 for KNN...
Fold 6 - MAE: 1.8183, MSE: 17.3824, RMSE: 4.1692
Training fold 7/10 for KNN...
Fold 7 - MAE: 1.8659, MSE: 17.7283, RMSE: 4.2105
Training fold 8/10 for KNN...
Fold 8 - MAE: 1.8124, MSE: 16.1701, RMSE: 4.0212
Training fold 9/10 for KNN...
Fold 9 - MAE: 1.8229, MSE: 18.3373, RMSE: 4.2822
Training fold 10/10 for KNN...
Fold 10 - MAE: 1.7421, MSE: 14.8889, RMSE: 3.8586
KNN Mean -> MAE: 1.8316, MSE: 17.3050, RMSE: 4.1578


=== Cross-validating Random Forest ===
Training fold 1/10 for Random Forest...



Fold 1 - MAE: 3.9998, MSE: 39.5200, RMSE: 6.2865
Training fold 2/10 for MLP...




Fold 2 - MAE: 3.7455, MSE: 35.8340, RMSE: 5.9862
Training fold 3/10 for MLP...




Fold 3 - MAE: 3.7775, MSE: 35.5034, RMSE: 5.9585
Training fold 4/10 for MLP...




Fold 4 - MAE: 3.7604, MSE: 37.6861, RMSE: 6.1389
Training fold 5/10 for MLP...




Fold 5 - MAE: 3.6633, MSE: 33.9496, RMSE: 5.8266
Training fold 6/10 for MLP...




Fold 6 - MAE: 3.7756, MSE: 34.5119, RMSE: 5.8747
Training fold 7/10 for MLP...




Fold 7 - MAE: 3.8016, MSE: 35.1606, RMSE: 5.9296
Training fold 8/10 for MLP...




Fold 8 - MAE: 3.8003, MSE: 37.1312, RMSE: 6.0935
Training fold 9/10 for MLP...




Fold 9 - MAE: 3.7590, MSE: 34.6060, RMSE: 5.8827
Training fold 10/10 for MLP...




Fold 10 - MAE: 3.7204, MSE: 33.8016, RMSE: 5.8139
MLP Mean -> MAE: 3.7803, MSE: 35.7705, RMSE: 5.9791


=== Cross-validating Linear Regression ===
Training fold 1/10 for Linear Regression...
Fold 1 - MAE: 11.1751, MSE: 291.7347, RMSE: 17.0802
Training fold 2/10 for Linear Regression...
Fold 2 - MAE: 11.0474, MSE: 283.6794, RMSE: 16.8428
Training fold 3/10 for Linear Regression...
Fold 3 - MAE: 11.1499, MSE: 297.9565, RMSE: 17.2614
Training fold 4/10 for Linear Regression...
Fold 4 - MAE: 11.3340, MSE: 301.8359, RMSE: 17.3734
Training fold 5/10 for Linear Regression...
Fold 5 - MAE: 11.1846, MSE: 295.1069, RMSE: 17.1787
Training fold 6/10 for Linear Regression...
Fold 6 - MAE: 11.0714, MSE: 282.2101, RMSE: 16.7991
Training fold 7/10 for Linear Regression...
Fold 7 - MAE: 10.8193, MSE: 281.9254, RMSE: 16.7906
Training fold 8/10 for Linear Regression...
Fold 8 - MAE: 10.8183, MSE: 270.9516, RMSE: 16.4606
Training fold 9/10 for Linear Regression...
Fold 9 - MAE: 11.3107, MSE: 316.8653, RMS

In [None]:

# -----------------------------------------------------------------------------
# TEST SET EVALUATION
# -----------------------------------------------------------------------------

# Refit every pipeline on the whole training split and score it once on the
# held-out test split. One dict per model is collected in `test_results`.
test_results = []
for name, model in models.items():
    print(f"\n=== Training {name} on full training set and evaluating on test set ===")
    y_pred_test = model.fit(X_train_enc, y_train).predict(X_test_enc)

    # RMSE is derived from the MSE, so compute the MSE first and reuse it.
    mse_test = mean_squared_error(y_test, y_pred_test)
    test_metrics = {
        'Model': name,
        'Test MAE': mean_absolute_error(y_test, y_pred_test),
        'Test MSE': mse_test,
        'Test RMSE': np.sqrt(mse_test),
    }
    test_results.append(test_metrics)
    print(f"{name} - Test MAE: {test_metrics['Test MAE']:.4f}, MSE: {mse_test:.4f}, RMSE: {test_metrics['Test RMSE']:.4f}")




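# Visualize the CV vs. test comparison side by side.
# NOTE: left disabled. `df_combined` is only created in the summary cell further
# down, and `plt` (matplotlib.pyplot) is not imported in the cells shown here.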
# df_combined.plot(kind='bar', figsize=(14, 7), color=['lightblue', 'salmon'])
# plt.title('CV vs Test Error Comparison')
# plt.tight_layout()
# plt.show()



=== Training KNN on full training set and evaluating on test set ===
KNN - Test MAE: 1.7241, MSE: 14.5774, RMSE: 3.8180

=== Training Random Forest on full training set and evaluating on test set ===
Random Forest - Test MAE: 1.2887, MSE: 7.7536, RMSE: 2.7845

=== Training Decision Tree on full training set and evaluating on test set ===
Decision Tree - Test MAE: 1.5801, MSE: 22.7053, RMSE: 4.7650

=== Training MLP on full training set and evaluating on test set ===




MLP - Test MAE: 3.7018, MSE: 33.9297, RMSE: 5.8249

=== Training Linear Regression on full training set and evaluating on test set ===
Linear Regression - Test MAE: 11.0220, MSE: 271.3364, RMSE: 16.4723

=== Training Ridge Regression on full training set and evaluating on test set ===
Ridge Regression - Test MAE: 11.0220, MSE: 271.3361, RMSE: 16.4723

=== Training Poly SVM on full training set and evaluating on test set ===
Poly SVM - Test MAE: 7.9246, MSE: 218.7989, RMSE: 14.7919

=== Training Linear SVM on full training set and evaluating on test set ===
Linear SVM - Test MAE: 9.5284, MSE: 333.8083, RMSE: 18.2704

=== Training RBF SVM on full training set and evaluating on test set ===
RBF SVM - Test MAE: 4.4083, MSE: 92.3485, RMSE: 9.6098

=== Test Set Results ===
                    Test MAE    Test MSE  Test RMSE
Model                                              
KNN                 1.724081   14.577361   3.818031
Random Forest       1.288728    7.753625   2.784533
Decision Tree 

NameError: name 'df_cv' is not defined

In [4]:
# Test-set metrics as a table: one row per model, indexed by model name.
df_test = pd.DataFrame(test_results).set_index('Model')
print("\n=== Test Set Results ===", df_test, sep="\n")


# Put the cross-validation means and the test metrics side by side. Both frames
# are indexed by 'Model', so the (left) join lines the rows up by model name.
df_combined = df_results.join(df_test, how='left')
print("\n=== CV vs Test Comparison ===", df_combined, sep="\n")


=== Test Set Results ===
                    Test MAE    Test MSE  Test RMSE
Model                                              
KNN                 1.724081   14.577361   3.818031
Random Forest       1.288728    7.753625   2.784533
Decision Tree       1.580071   22.705250   4.765003
MLP                 3.701752   33.929709   5.824921
Linear Regression  11.022036  271.336386  16.472291
Ridge Regression   11.021973  271.336144  16.472284
Poly SVM            7.924602  218.798943  14.791854
Linear SVM          9.528416  333.808330  18.270422
RBF SVM             4.408309   92.348503   9.609813

=== CV vs Test Comparison ===
                         MAE         MSE       RMSE   Test MAE    Test MSE  \
Model                                                                        
KNN                 1.831616   17.304989   4.157839   1.724081   14.577361   
Random Forest       1.443446   10.462263   3.231661   1.288728    7.753625   
Decision Tree       1.778833   28.265591   5.306073   1.580