In [76]:
# Data Analysis
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import KFold, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [77]:
def preprocess_classification(dataset, threshold_factor=1.0):
    data = pd.read_csv(dataset)
    
    # Remove duplicate attributes
    data.drop(["suicides_no", "population", "country-year", "gdp_for_year ($)"], axis=1, inplace=True)
    data.drop("HDI for year", axis=1, inplace=True)
    
    # Remove 'years' text from 'age' and convert age strings to integers
    data["age"] = data["age"].str.strip(" years")
    def convert_age(age):
        if '75+' in age:
            return 80
        elif '55-74' in age:
            return 65
        elif '35-54' in age:
            return 45
        elif '25-34' in age:
            return 30
        elif '15-24' in age:
            return 20
        elif '5-14' in age:
            return 10
        else:
            return None
    data['age'] = data['age'].apply(convert_age)
    
    # Encode sex using a mapping
    sex_mapping = {'female': 0, 'male': 1}
    data['sex'] = data['sex'].map(sex_mapping)
    
    # Apply one-hot encoding to 'generation'
    data = pd.get_dummies(data, columns=['generation'])
    
    # Calculate the mean suicide rate and adjust by the threshold_factor
    mean_suicide_rate = data["suicides/100k pop"].mean()
    adjusted_threshold = mean_suicide_rate * threshold_factor

    # Create a new attribute, Over_threshold_suicides
    # 1 if the suicide rate is higher than the adjusted threshold, 0 otherwise
    data["Over_threshold_suicides"] = np.where(data["suicides/100k pop"] > adjusted_threshold, 1, 0)
    
    # Perform target encoding using KFold for 'country'
    kf = KFold(n_splits=10, shuffle=True, random_state=33)
    data['country_encoded'] = 0.0
    for train_index, val_index in kf.split(data):
        train_fold, val_fold = data.iloc[train_index], data.iloc[val_index]
        mean_encoded = train_fold.groupby('country')['suicides/100k pop'].mean()
        data.loc[val_index, 'country_encoded'] = val_fold['country'].map(mean_encoded)
    
    # Fill missing 'country_encoded' values with global mean
    global_mean = data['suicides/100k pop'].mean()
    data['country_encoded'].fillna(global_mean, inplace=True)
    
    # Remove the original 'country' column
    data.drop(columns=['country'], inplace=True)
    
    # Split the dataset into training and testing datasets
    train_data, test_data = train_test_split(data, test_size=0.3, random_state=33)
    
    # Drop unnecessary columns
    columns_to_drop = ["suicides/100k pop", "Over_threshold_suicides"]
    X_train = train_data.drop(columns=columns_to_drop)
    Y_train = train_data["Over_threshold_suicides"]
    X_test = test_data.drop(columns=columns_to_drop)
    Y_test = test_data["Over_threshold_suicides"]
    
    return X_train, Y_train, X_test, Y_test

In [78]:
def adjust_weights(weights):
    # Calculate the sum of all provided weights
    total = sum(weights.values())
    
    # Normalize each weight by dividing by the total sum to adjust their proportions
    adjusted_weights = {key: value / total for key, value in weights.items()}
    
    return adjusted_weights

# Example weights input from the user
input_weights = {
    'test_accuracy': 1,  # Weight for test accuracy
    'precision': 3,      # Weight for precision
    'recall': 3,         # Weight for recall
    'f1_positive': 1,    # Weight for F1 score of the positive class
    'f1_negative' : 4,   # Weight for F1 score of the negative class
    'cv_accuracy': 1     # Weight for cross-validation accuracy
}

# Adjust the weights to ensure their sum equals 1
WEIGHTS = adjust_weights(input_weights)

# Print the adjusted weights
print("Adjusted Weights:")
for key, value in WEIGHTS.items():
    print(f"{key}: {value:.2f}")


Adjusted Weights:
test_accuracy: 0.08
precision: 0.23
recall: 0.23
f1_positive: 0.08
f1_negative: 0.31
cv_accuracy: 0.08


In [79]:
def evaluate_logistic_regression(X_train, Y_train, X_test, Y_test):
    # Create a logistic regression pipeline with feature scaling
    logreg = make_pipeline(StandardScaler(), LogisticRegression(solver='lbfgs', max_iter=1000))
    logreg.fit(X_train, Y_train)  # Train the model
    
    # Make predictions and calculate accuracy on the test set
    Y_pred_test = logreg.predict(X_test)
    test_accuracy = accuracy_score(Y_test, Y_pred_test)

    # Compute precision and recall for the model using macro averaging
    precision = precision_score(Y_test, Y_pred_test, average='macro')
    recall = recall_score(Y_test, Y_pred_test, average='macro')
    
    # Compute F1 scores for each class and extract separately
    f1_scores = f1_score(Y_test, Y_pred_test, average=None)
    f1_positive, f1_negative = f1_scores[0], f1_scores[1]
    
    # Conduct 10-fold cross-validation to estimate model stability
    kf = KFold(n_splits=10, shuffle=True, random_state=33)
    cv_accuracy = cross_val_score(logreg, X_train, Y_train, cv=kf, scoring='accuracy').mean()
    
    # Calculate integrated score using weighted averages from WEIGHTS
    integrated_score = (
        test_accuracy * WEIGHTS['test_accuracy'] +
        precision * WEIGHTS['precision'] +
        recall * WEIGHTS['recall'] +
        f1_positive * WEIGHTS['f1_positive'] +
        f1_negative * WEIGHTS['f1_negative'] +
        cv_accuracy * WEIGHTS['cv_accuracy']
    )
    
    return integrated_score * 100 # Convert to percentage

In [80]:
def evaluate_decision_tree(X_train, Y_train, X_test, Y_test):
    # Initialize and train a Decision Tree model
    decision_tree = DecisionTreeClassifier(random_state=33)
    decision_tree.fit(X_train, Y_train)

    # Predict and evaluate accuracy on both training and test sets
    Y_pred_test = decision_tree.predict(X_test)
    test_accuracy = accuracy_score(Y_test, Y_pred_test)

    # Calculate precision and recall using macro averaging to treat all classes equally
    precision = precision_score(Y_test, Y_pred_test, average='macro', zero_division=0)
    recall = recall_score(Y_test, Y_pred_test, average='macro', zero_division=0)
    
    # Compute F1 scores for positive and negative classes individually
    f1_scores = f1_score(Y_test, Y_pred_test, average=None)
    f1_positive, f1_negative = f1_scores[0], f1_scores[1]  # F1 scores for class 0 and class 1
    
    # Perform 10-fold cross-validation to estimate model robustness
    kf = KFold(n_splits=10, shuffle=True, random_state=33)
    cv_accuracy = cross_val_score(decision_tree, X_train, Y_train, cv=kf, scoring='accuracy').mean()
    
    # Calculate an integrated score based on predefined weights
    integrated_score = (test_accuracy * WEIGHTS['test_accuracy'] +
                        precision * WEIGHTS['precision'] +
                        recall * WEIGHTS['recall'] +
                        f1_positive * WEIGHTS['f1_positive'] +
                        f1_negative * WEIGHTS['f1_negative'] +
                        cv_accuracy * WEIGHTS['cv_accuracy'])

    return integrated_score * 100 # Convert to percentage

In [81]:

def evaluate_knn(X_train, Y_train, X_test, Y_test):
    # Initialize and train a K-Nearest Neighbors model with 3 neighbors
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, Y_train)

    # Predict and evaluate accuracy on both training and test sets
    Y_pred_train = knn.predict(X_train)
    Y_pred_test = knn.predict(X_test)
    train_accuracy = accuracy_score(Y_train, Y_pred_train)
    test_accuracy = accuracy_score(Y_test, Y_pred_test)

    # Calculate precision, recall, and F1 scores using macro averaging
    precision = precision_score(Y_test, Y_pred_test, average='macro')
    recall = recall_score(Y_test, Y_pred_test, average='macro')
    
    # Compute F1 scores for positive and negative classes individually
    f1_scores = f1_score(Y_test, Y_pred_test, average=None)
    f1_positive, f1_negative = f1_scores[0], f1_scores[1]  # F1 scores for class 0 and class 1

    # Perform 10-fold cross-validation to estimate model robustness
    kf = KFold(n_splits=10, shuffle=True, random_state=33)
    cv_accuracy = cross_val_score(knn, X_train, Y_train, cv=kf, scoring='accuracy').mean()

    # Calculate an integrated score based on predefined weights
    integrated_score = (test_accuracy * WEIGHTS['test_accuracy'] +
                        precision * WEIGHTS['precision'] +
                        recall * WEIGHTS['recall'] +
                        f1_positive * WEIGHTS['f1_positive'] +
                        f1_negative * WEIGHTS['f1_negative'] +
                        cv_accuracy * WEIGHTS['cv_accuracy'])

    return integrated_score * 100 # Convert to percentage     

In [82]:
def evaluate_gaussian_nb(X_train, Y_train, X_test, Y_test):
    # Initialize and train Gaussian Naive Bayes model
    gaussian = GaussianNB()
    gaussian.fit(X_train, Y_train)

    # Predict on the test set and calculate accuracy
    Y_pred_test = gaussian.predict(X_test)
    test_accuracy = accuracy_score(Y_test, Y_pred_test)

    # Calculate precision, recall, and F1 score using macro averaging
    precision = precision_score(Y_test, Y_pred_test, average='macro', zero_division=0)
    recall = recall_score(Y_test, Y_pred_test, average='macro', zero_division=0)
    
    # Compute F1 scores for positive and negative classes individually
    f1_scores = f1_score(Y_test, Y_pred_test, average=None)
    f1_positive, f1_negative = f1_scores[0], f1_scores[1]  # F1 scores for class 0 and class 1

    # Perform 10-fold cross-validation to estimate the model's robustness
    kf = KFold(n_splits=10, shuffle=True, random_state=33)
    cv_accuracy = cross_val_score(gaussian, X_train, Y_train, cv=kf, scoring='accuracy').mean()

    # Calculate an integrated score based on predefined weights
    integrated_score = (test_accuracy * WEIGHTS['test_accuracy'] +
                        precision * WEIGHTS['precision'] +
                        recall * WEIGHTS['recall'] +
                        f1_positive * WEIGHTS['f1_positive'] +
                        f1_negative * WEIGHTS['f1_negative'] +
                        cv_accuracy * WEIGHTS['cv_accuracy'])

    return integrated_score * 100  # Convert to percentage

In [83]:
def evaluate_random_forest(X_train, Y_train, X_test, Y_test):
    # Initialize and train a Random Forest classifier
    random_forest = RandomForestClassifier(n_estimators=100, random_state=33)
    random_forest.fit(X_train, Y_train)

    # Predict and evaluate metrics on both training and test sets
    Y_pred_test = random_forest.predict(X_test)
    test_accuracy = accuracy_score(Y_test, Y_pred_test)

    # Calculate precision, recall, and F1 scores using macro averaging
    precision = precision_score(Y_test, Y_pred_test, average='macro')
    recall = recall_score(Y_test, Y_pred_test, average='macro')
    
    # Compute F1 scores for positive and negative classes individually
    f1_scores = f1_score(Y_test, Y_pred_test, average=None)
    f1_positive, f1_negative = f1_scores[0], f1_scores[1]  # F1 scores for class 0 and class 1

    # Perform 10-fold cross-validation to estimate model robustness
    kf = KFold(n_splits=10, shuffle=True, random_state=33)
    cv_accuracy = cross_val_score(random_forest, X_train, Y_train, cv=kf, scoring='accuracy').mean()

    # Calculate an integrated score based on predefined weights
    integrated_score = (test_accuracy * WEIGHTS['test_accuracy'] +
                        precision * WEIGHTS['precision'] +
                        recall * WEIGHTS['recall'] +
                        f1_positive * WEIGHTS['f1_positive'] +
                        f1_negative * WEIGHTS['f1_negative'] +
                        cv_accuracy * WEIGHTS['cv_accuracy'])

    return integrated_score * 100  # Convert to percentage

In [85]:

class ModelEvaluator:
    def __init__(self, dataset, models, threshold_factors, top_n=5):
        self.dataset = dataset
        self.models = models
        self.threshold_factors = threshold_factors
        self.top_n = top_n

    def evaluate_models(self):
        results = []
        for threshold in self.threshold_factors:
            X_train, Y_train, X_test, Y_test = preprocess_classification(self.dataset, threshold)
            for model_name, model_func in self.models.items():
                score = model_func(X_train, Y_train, X_test, Y_test)
                results.append({
                    'model': model_name,
                    'threshold': threshold,
                    'score': score
                })
        # Sorting and returning top n results
        return sorted(results, key=lambda x: x['score'], reverse=True)[:self.top_n]

# Model functions need to be defined elsewhere in the script or as imports

# Configuration
dataset_path = 'master.csv'
threshold_range = np.linspace(0.5, 1.5, 21)  # Range of thresholds from 0.5 to 1.5
top_results_count = 20  # Number of top models to return

# Dictionary of models to evaluate
models_to_evaluate = {
    'Logistic Regression': evaluate_logistic_regression,
    'Decision Tree': evaluate_decision_tree,

}

# Creating an instance of the evaluator
evaluator = ModelEvaluator(dataset_path, models_to_evaluate, threshold_range, top_results_count)
best_models = evaluator.evaluate_models()

# Display results
for idx, result in enumerate(best_models, start=1):
    print(f"Rank {idx}:")
    print(f"\tModel - {result['model']}")
    print(f"\tThreshold Factor - {result['threshold']:.5f}")
    print(f"\tIntegrated Score - {result['score']:.5f}")
    print()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['country_encoded'].fillna(global_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['country_encoded'].fillna(global_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

Rank 1:
	Model - Decision Tree
	Threshold Factor - 0.70000
	Integrated Score - 89.12256

Rank 2:
	Model - Decision Tree
	Threshold Factor - 0.75000
	Integrated Score - 88.93362

Rank 3:
	Model - Decision Tree
	Threshold Factor - 0.55000
	Integrated Score - 88.90082

Rank 4:
	Model - Decision Tree
	Threshold Factor - 1.00000
	Integrated Score - 88.83696

Rank 5:
	Model - Decision Tree
	Threshold Factor - 0.60000
	Integrated Score - 88.77400

Rank 6:
	Model - Decision Tree
	Threshold Factor - 0.50000
	Integrated Score - 88.69644

Rank 7:
	Model - Decision Tree
	Threshold Factor - 0.95000
	Integrated Score - 88.63702

Rank 8:
	Model - Decision Tree
	Threshold Factor - 0.65000
	Integrated Score - 88.63049

Rank 9:
	Model - Decision Tree
	Threshold Factor - 1.05000
	Integrated Score - 88.12560

Rank 10:
	Model - Decision Tree
	Threshold Factor - 0.80000
	Integrated Score - 88.11487

Rank 11:
	Model - Decision Tree
	Threshold Factor - 0.90000
	Integrated Score - 87.81475

Rank 12:
	Model - D