In [1]:
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# Read the raw table from 'master.csv' (path is relative to the working directory).
data = pd.read_csv('master.csv')

# Swap the original headers for short names.  The assignment is positional:
# the twelve labels below are applied to the twelve columns in file order.
data.columns = [
    'country',
    'year',
    'gender',
    'age_group',
    'suicide_count',
    'population',
    'suicides/100k',
    'country-year',
    'HDI for year',
    'gdp_for_year',
    'gdp_per_capita',
    'generation',
]

# Remove the 'HDI for year' and 'country-year' columns, then discard every
# row that still holds a missing value.
data = data.drop(columns=['HDI for year', 'country-year']).dropna()

#################################################################################
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# 'gdp_for_year' is read as text with thousands separators; strip the commas
# and cast to float.  The column is replaced in place, so its position in the
# frame does not change.
data['gdp_for_year'] = (
    data['gdp_for_year']
    .str.replace(',', '')
    .astype(float)
)

# 'gender' -> integer codes.  The fitted encoder is kept in `le_sex`.
le_sex = LabelEncoder().fit(data['gender'])
data['gender_enc'] = le_sex.transform(data['gender'])

# 'age_group' -> ordinal integer that follows the explicit youngest-to-oldest
# order listed here (0 for "5-14 years" ... 5 for "75+ years").
age_order = [
    "5-14 years",
    "15-24 years",
    "25-34 years",
    "35-54 years",
    "55-74 years",
    "75+ years",
]
ord_age = OrdinalEncoder(categories=[age_order]).fit(data[['age_group']])
data['age_enc'] = ord_age.transform(data[['age_group']]).astype(int)

# 'generation' -> one indicator column per level, named 'gen_<level>',
# stored as integers rather than whatever dtype get_dummies produced.
data = pd.get_dummies(data, columns=['generation'], prefix='gen')
gen_cols = data.columns[data.columns.str.startswith('gen_')].tolist()
data[gen_cols] = data[gen_cols].astype(int)

# The raw text columns are no longer needed once their encoded twins exist.
data = data.drop(columns=['gender', 'age_group'])

##################################################################################

from sklearn.model_selection import train_test_split, KFold
import category_encoders as ce

# Predictors: every column except the target itself and the two raw columns
# 'suicide_count' and 'population' (presumably the rate is derived from
# them -- confirm against the data source).
X = data.drop(['suicides/100k', 'suicide_count', 'population'], axis=1)

# Target: suicides per 100k population.
y = data.loc[:, 'suicides/100k']

# Hold out 20% of the rows for the final test; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# ---------------------- Target Encoding with K-Fold CV ----------------------

# (a) Out-of-fold encoding for the training rows.  Each row's 'country' value
#     is encoded by an encoder fitted on the other nine folds only, so a row's
#     own target never contributes to its own encoded feature.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
country_te_train = pd.Series(np.nan, index=X_train.index, dtype=float)

for fit_pos, hold_pos in kf.split(X_train):
    fold_encoder = ce.TargetEncoder(cols=['country'])
    fold_encoder.fit(X_train.iloc[fit_pos][['country']], y_train.iloc[fit_pos])

    held_out_encoded = fold_encoder.transform(X_train.iloc[hold_pos][['country']])
    # Positional write: `hold_pos` are row positions, not index labels.
    country_te_train.iloc[hold_pos] = held_out_encoded['country'].values

# (b) The test rows are encoded by a single encoder fitted on the complete
#     training split.
final_te = ce.TargetEncoder(cols=['country'])
final_te.fit(X_train[['country']], y_train)
test_encoded = final_te.transform(X_test[['country']])
country_te_test = test_encoded['country'].values

# (c) Replace the raw 'country' column with its encoded counterpart.
#     The train Series aligns on index; the test array is assigned by position.
X_train_enc = X_train.drop(columns=['country']).assign(country_te=country_te_train)
X_test_enc = X_test.drop(columns=['country']).assign(country_te=country_te_test)

############################################################################



# X_train_enc, X_test_enc, y_train and y_test are the modelling inputs from
# here on; print the feature-matrix shapes as a quick sanity check.
for label, frame in (("Train shape:", X_train_enc), ("Test shape: ", X_test_enc)):
    print(label, frame.shape)
#############################################################################


Train shape: (24444, 12)
Test shape:  (6112, 12)


In [2]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import KFold, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import LinearRegression, Ridge

# 1. Split the feature columns into two groups.  Anything not named as
#    continuous is treated as categorical; original column order is kept.
continuous_cols = ['gdp_for_year', 'gdp_per_capita', 'country_te', 'year']
categorical_cols = list(
    X_train_enc.columns[~X_train_enc.columns.isin(continuous_cols)]
)

# 2. Preprocessing: the continuous group is robust-scaled and then reduced by
#    PCA, keeping as many components as needed to explain the requested share
#    of variance; the categorical group is forwarded untouched.
variance_retained = 0.95
cont_pipeline = Pipeline(steps=[
    ('scale', RobustScaler()),
    ('pca', PCA(n_components=variance_retained, random_state=42)),
])
ct = ColumnTransformer(
    transformers=[
        ('cont', cont_pipeline, continuous_cols),
        ('cat', 'passthrough', categorical_cols),
    ],
    verbose=False,
)

# 3. Candidate regressors, keyed by the display name used in the reports.
#    The pair order below is the order in which the models are evaluated.
models = dict([
    ('KNN', KNeighborsRegressor(n_neighbors=11, weights='distance', p=1)),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('SVR (Linear)', LinearSVR(C=10, epsilon=1, max_iter=10000, random_state=42)),
    ('SVR (RBF)', SVR(kernel='rbf', C=100, gamma=0.01)),
    ('SVR (Poly)', SVR(kernel='poly', degree=3, C=1, gamma='scale')),
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge(alpha=0.1, random_state=42)),
    ('MLP', MLPRegressor(random_state=42, max_iter=500)),
])

# 4. K-fold CV on the training set with selective PCA.
#    For every model: fit the preprocessing + model pipeline on the training
#    part of each fold, score it on the held-out part, and record the mean
#    MAE / MSE / RMSE across folds in `cv_results`.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Take the fold count from the splitter so the progress messages cannot
# drift out of sync with `n_splits`.
n_folds = kf.get_n_splits()
cv_results = []
print(f"\n=== {n_folds}-Fold CV with selective PCA on continuous features ===")
for name, estimator in models.items():
    print(f"\n-> {name}")
    maes, mses, rmses = [], [], []
    pipeline = Pipeline([
        ('transform', ct),  # scale + PCA on continuous, passthrough categorical
        ('model', estimator)
    ])
    # perf_counter() is monotonic, so the elapsed time cannot be distorted
    # by a system clock adjustment during a long run.
    start = time.perf_counter()
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_enc), 1):
        X_tr = X_train_enc.iloc[train_idx]
        X_val = X_train_enc.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        # The same pipeline object is refitted on each fold's training rows.
        pipeline.fit(X_tr, y_tr)
        y_pred = pipeline.predict(X_val)
        maes.append(mean_absolute_error(y_val, y_pred))
        mses.append(mean_squared_error(y_val, y_pred))
        rmses.append(np.sqrt(mses[-1]))
        print(f"  Fold {fold}/{n_folds} RMSE: {rmses[-1]:.4f}")
    elapsed = time.perf_counter() - start
    # Note: the reported RMSE is the mean of per-fold RMSEs, not the square
    # root of the mean MSE.
    cv_results.append({'Model': name,
                       'MAE': np.mean(maes),
                       'MSE': np.mean(mses),
                       'RMSE': np.mean(rmses)})
    print(f"-> Completed CV for {name} in {elapsed/60:.2f} minutes")

# 5. Test set evaluation: refit every cross-validated model on the complete
#    training split and score it once on the held-out test split.
print("\n=== Test Set Evaluation ===")
test_results = []
for name in [record['Model'] for record in cv_results]:
    pipeline = Pipeline(steps=[('transform', ct), ('model', models[name])])
    print(f"\n-> Training final {name} on full training set")
    pipeline.fit(X_train_enc, y_train)

    y_pred_test = pipeline.predict(X_test_enc)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_results.append({
        'Model': name,
        'Test MAE': mean_absolute_error(y_test, y_pred_test),
        'Test MSE': test_mse,
        'Test RMSE': np.sqrt(test_mse),
    })
    print(f"-> {name} Test RMSE: {test_results[-1]['Test RMSE']:.4f}")

# 6. Summaries: turn the per-model records into tables and print each one
#    under its heading.
cv_df, test_df = (pd.DataFrame(rows) for rows in (cv_results, test_results))

for heading, table in (("\nCross-Validation Results:", cv_df),
                       ("\nTest Set Results:", test_df)):
    print(heading)
    print(table)



=== 10-Fold CV with selective PCA on continuous features ===

-> KNN
  Fold 1/10 RMSE: 17.6374
  Fold 2/10 RMSE: 17.5582
  Fold 3/10 RMSE: 16.8454
  Fold 4/10 RMSE: 19.0670
  Fold 5/10 RMSE: 16.7360
  Fold 6/10 RMSE: 18.9715
  Fold 7/10 RMSE: 17.6718
  Fold 8/10 RMSE: 17.3725
  Fold 9/10 RMSE: 18.1148
  Fold 10/10 RMSE: 17.1584
-> Completed CV for KNN in 0.01 minutes

-> Random Forest
  Fold 1/10 RMSE: 18.0301
  Fold 2/10 RMSE: 17.6742
  Fold 3/10 RMSE: 17.5261
  Fold 4/10 RMSE: 19.6821
  Fold 5/10 RMSE: 17.4338
  Fold 6/10 RMSE: 19.2678
  Fold 7/10 RMSE: 17.1005
  Fold 8/10 RMSE: 18.1242
  Fold 9/10 RMSE: 18.1623
  Fold 10/10 RMSE: 17.1722
-> Completed CV for Random Forest in 0.57 minutes

-> Decision Tree
  Fold 1/10 RMSE: 21.5040
  Fold 2/10 RMSE: 22.1443
  Fold 3/10 RMSE: 21.1517
  Fold 4/10 RMSE: 26.2087
  Fold 5/10 RMSE: 20.0608
  Fold 6/10 RMSE: 23.6865
  Fold 7/10 RMSE: 20.1242
  Fold 8/10 RMSE: 21.7842
  Fold 9/10 RMSE: 22.1367
  Fold 10/10 RMSE: 20.7771
-> Completed CV for D



  Fold 1/10 RMSE: 19.9513




  Fold 2/10 RMSE: 20.0221




  Fold 3/10 RMSE: 18.7158




  Fold 4/10 RMSE: 22.3071




  Fold 5/10 RMSE: 21.4008




  Fold 6/10 RMSE: 21.2098




  Fold 7/10 RMSE: 18.0897




  Fold 8/10 RMSE: 18.1432




  Fold 9/10 RMSE: 21.0822




  Fold 10/10 RMSE: 21.7926
-> Completed CV for SVR (Linear) in 0.76 minutes

-> SVR (RBF)
  Fold 1/10 RMSE: 17.4216
  Fold 2/10 RMSE: 17.3476
  Fold 3/10 RMSE: 16.4839
  Fold 4/10 RMSE: 18.2907
  Fold 5/10 RMSE: 17.4727
  Fold 6/10 RMSE: 18.3157
  Fold 7/10 RMSE: 17.8332
  Fold 8/10 RMSE: 17.1588
  Fold 9/10 RMSE: 18.3395
  Fold 10/10 RMSE: 17.0736
-> Completed CV for SVR (RBF) in 2.09 minutes

-> SVR (Poly)
  Fold 1/10 RMSE: 18.7133
  Fold 2/10 RMSE: 18.5681
  Fold 3/10 RMSE: 18.0352
  Fold 4/10 RMSE: 19.2417
  Fold 5/10 RMSE: 18.9128
  Fold 6/10 RMSE: 20.0397
  Fold 7/10 RMSE: 19.0295
  Fold 8/10 RMSE: 19.0898
  Fold 9/10 RMSE: 19.9213
  Fold 10/10 RMSE: 18.3787
-> Completed CV for SVR (Poly) in 6.77 minutes

-> Linear Regression
  Fold 1/10 RMSE: 18.3157
  Fold 2/10 RMSE: 18.5235
  Fold 3/10 RMSE: 17.2856
  Fold 4/10 RMSE: 20.3365
  Fold 5/10 RMSE: 19.0287
  Fold 6/10 RMSE: 18.5906
  Fold 7/10 RMSE: 17.1253
  Fold 8/10 RMSE: 17.4260
  Fold 9/10 RMSE: 19.2826
  Fold 10/10 RMSE: 19.19



-> SVR (Linear) Test RMSE: 21.4252

-> Training final SVR (RBF) on full training set
-> SVR (RBF) Test RMSE: 17.4132

-> Training final SVR (Poly) on full training set
-> SVR (Poly) Test RMSE: 18.1744

-> Training final Linear Regression on full training set
-> Linear Regression Test RMSE: 18.7912

-> Training final Ridge Regression on full training set
-> Ridge Regression Test RMSE: 18.7912

-> Training final MLP on full training set
-> MLP Test RMSE: 15.8854

Cross-Validation Results:
               Model        MAE         MSE       RMSE
0                KNN   9.283927  314.335292  17.713306
1      Random Forest   9.537167  325.286773  18.017341
2      Decision Tree  11.039073  485.182148  21.957814
3       SVR (Linear)   9.609012  413.047319  20.271479
4          SVR (RBF)   8.571940  309.177396  17.573733
5         SVR (Poly)   9.980645  361.093592  18.993015
6  Linear Regression  10.905969  343.594667  18.511175
7   Ridge Regression  10.905943  343.594659  18.511175
8            