# Ordinal Logistic Regression (Thresholds)


In [1]:
# Importing Libraries
import os
import numpy as np
import pandas as pd
import warnings
from sklearn.metrics import accuracy_score
from scipy.stats import pearsonr, kendalltau
from sklearn.model_selection import GroupKFold
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from statsmodels.miscmodels.ordinal_model import OrderedModel
from statsmodels.tools.sm_exceptions import HessianInversionWarning

In [2]:
project_directory = r'C:\Users\marco\OneDrive\Desktop\Final Year Project'
os.chdir(project_directory)
base_dir = os.getcwd() 

In [5]:
def label_percentile(y_values):
    
    all_thresholds = []

    
    all_thresholds.extend([
        np.percentile(y_values, 80),
        np.percentile(y_values, 60),
        np.percentile(y_values, 40),
        np.percentile(y_values, 20),
    ])

    
    all_thresholds.extend([
       np.percentile(y_values, 86),
       np.percentile(y_values, 72),
       np.percentile(y_values, 58),
       np.percentile(y_values, 43),
       np.percentile(y_values, 28),
       np.percentile(y_values, 14),
    ])

    
    all_thresholds = sorted(set(all_thresholds))
    labels = np.digitize(y_values, all_thresholds)
    return labels 

# File paths
input_file = os.path.join('AGAIN Ranking Algorithms', 'intervals_data.csv')
output_file = os.path.join('AGAIN Ranking Algorithms', 'Data_Percentiles', 'ordinal_logistic_regression_percentiles.csv')

df = pd.read_csv(input_file)

def concordance_correlation_coefficient(y_true, y_pred):
    cor = np.corrcoef(y_true, y_pred)[0][1]
    mean_true, mean_pred = np.mean(y_true), np.mean(y_pred)
    var_true, var_pred = np.var(y_true), np.var(y_pred)
    sd_true, sd_pred = np.std(y_true), np.std(y_pred)
    numerator = 2 * cor * sd_true * sd_pred
    denominator = var_true + var_pred + (mean_true - mean_pred)**2
    ccc = numerator / denominator
    return ccc

def pearson_correlation_coefficient(y_true, y_pred):
    pcc, _ = pearsonr(y_true, y_pred)
    return pcc

def kendalls_tau_coefficient(y_true, y_pred):
    tau, _ = kendalltau(y_true, y_pred)
    return tau

def evaluate_individual_performance(X_test, Y_test, group_labels, linpred_test, game_name):
    evaluation_results = []
    for participant_id in np.unique(group_labels):
        idx = group_labels == participant_id
        participant_labels = Y_test[idx]
        participant_predictions = linpred_test[idx]
        pcc_value = pearsonr(participant_labels, participant_predictions)[0]
        ccc_value = concordance_correlation_coefficient(participant_labels, participant_predictions)
        kendall_tau_value = kendalltau(participant_labels, participant_predictions)[0]

        evaluation_results.append({
            'Game Name': game_name,
            'Participant ID': participant_id,
            'PCC': pcc_value,
            'CCC': ccc_value,
            'KendallTau': kendall_tau_value
        })
    return pd.DataFrame(evaluation_results)

# Label arousal values using percentile based categories and save in a new column
df['arousal_label']= label_percentile(df['[output]arousal'].values)

# Save the dataframe with new labels into a new csv file
df.to_csv(output_file, index=False)
print("The csv file with percentile labels has been created successfully.")

# Initialize lists to store train and test accuracies
evaluation_results = []
train_accuracies = []
test_accuracies = []

# Implement GroupKFold
group_kfold = GroupKFold(n_splits=10)
games = df['[control]game'].unique()

# Loop through each game
for game in games:
    game_df = df[df['[control]game'] == game]
    feature_cols = [col for col in df.columns if '[general]' in col]
    X = game_df[feature_cols]
    Y = game_df['arousal_label']
    groups = game_df['[control]player_id']

    train_accuracies_game = []
    test_accuracies_game = []

    # Split the data into training and testing sets based on groups
    for train_index, test_index in group_kfold.split(X, Y, groups):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        group_labels_test = game_df['[control]player_id'].iloc[test_index].values

        # Remove the warnings
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=HessianInversionWarning)

            # Ordinal Logistic Regression using OrderedModel from stastmodels
            model = OrderedModel(Y_train, X_train, distr='logit')
            result = model.fit(method='bfgs', disp=False)
            
            # Prediction
            prob_train = result.predict(X_train)
            prob_test = result.predict(X_test)
            linpred_test = result.predict(X_test, which='linpred')

            Y_pred_train = np.argmax(prob_train.values, axis=1) + 1
            Y_pred_test = np.argmax(prob_test.values, axis=1) + 1

            train_accuracies_game.append(accuracy_score(Y_train, Y_pred_train) * 100)
            test_accuracies_game.append(accuracy_score(Y_test, Y_pred_test) * 100)

            individual_results_df = evaluate_individual_performance(X_test, Y_test, group_labels_test, linpred_test, game)
            evaluation_results.append(individual_results_df)
    
    # Store average accuracies for this game
    train_accuracies.append(np.mean(train_accuracies_game))
    test_accuracies.append(np.mean(test_accuracies_game))

output_folder = 'AGAIN Ranking Algorithms/Evaluation/Ordinal Logistic Regression'
output_file = 'OLR_evaluation_result.csv'
combined_results_df = pd.concat(evaluation_results, ignore_index=True)
combined_results_df.to_csv(os.path.join(output_folder, output_file), index=False)

print("Combined individual performance evaluation results saved.")

print("\nAverage Training and Testing Accuracies by Game:")
for i, game in enumerate(games):
    print(f"{game}: Training Accuracy = {train_accuracies[i]:.2f}%, Testing Accuracy = {test_accuracies[i]:.2f}%")

# Calculate and print the overall average accuracies
overall_train_accuracy = np.mean(train_accuracies)
overall_test_accuracy = np.mean(test_accuracies)
print(f"\nOverall Average Training Accuracy: {overall_train_accuracy:.2f}%")
print(f"Overall Average Testing Accuracy: {overall_test_accuracy:.2f}%")

The csv file with percentile labels has been created successfully.
Combined individual performance evaluation results saved.

Average Training and Testing Accuracies by Game:
Heist!: Training Accuracy = 4.13%, Testing Accuracy = 4.01%
Shootout: Training Accuracy = 5.00%, Testing Accuracy = 4.80%
TopDown: Training Accuracy = 4.13%, Testing Accuracy = 3.99%

Overall Average Training Accuracy: 4.42%
Overall Average Testing Accuracy: 4.27%


## Regression

In [17]:
input_file = os.path.join('AGAIN Ranking Algorithms', 'intervals_data.csv')
df = pd.read_csv(input_file)

group_kfold = GroupKFold(n_splits=10)
games = df['[control]game'].unique()

linear_results = []
random_forest_results = []
mlp_results = []

def concordance_correlation_coefficient(y_true, y_pred):
    cor = np.corrcoef(y_true, y_pred)[0][1]
    mean_true, mean_pred = np.mean(y_true), np.mean(y_pred)
    var_true, var_pred = np.var(y_true), np.var(y_pred)
    sd_true, sd_pred = np.std(y_true), np.std(y_pred)
    numerator = 2 * cor * sd_true * sd_pred
    denominator = var_true + var_pred + (mean_true - mean_pred)**2
    ccc = numerator / denominator
    return ccc

def pearson_correlation_coefficient(y_true, y_pred):
    pcc, _ = pearsonr(y_true, y_pred)
    return pcc

def kendalls_tau_coefficient(y_true, y_pred):
    tau, _ = kendalltau(y_true, y_pred)
    return tau

def evaluation_function(y_test, group_labels, predictions, results_list, game_name):
    for participant_id in np.unique(group_labels):
        idx = group_labels == participant_id
        pcc = pearsonr(y_test[idx], predictions[idx])[0]
        ccc = concordance_correlation_coefficient(y_test[idx], predictions[idx])
        tau = kendalltau(y_test[idx], predictions[idx])[0]
        results_list.append({
            'Game Name': game_name,
            'Participant ID': participant_id,
            'PCC': pcc,
            'CCC': ccc,
            'KendallTau': tau
        })

def calculate_averages(results_list):
    game_results = pd.DataFrame(results_list)
    numeric_results = game_results.select_dtypes(include=[np.number])
    game_average = numeric_results.mean()
    results_list.append({
        'Participant ID': 'Average',
        'PCC': game_average['PCC'],
        'CCC': game_average['CCC'],
        'KendallTau': game_average['KendallTau']
    })

for game in games:
    game_df = df[df['[control]game'] == game]
    exclude_columns = ['[general]time_passed'] 
    feature_cols = [col for col in df.columns if '[general]' in col and col not in exclude_columns]
    X = game_df[feature_cols]
    y_continuous = game_df['[output]arousal']
    groups = game_df['[control]player_id']

    for train_index, test_index in group_kfold.split(X, y_continuous, groups):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y_continuous.iloc[train_index], y_continuous.iloc[test_index]
        group_labels_test = groups.iloc[test_index].values

        reg_lin = LinearRegression().fit(X_train, y_train)
        y_lin_pred = reg_lin.predict(X_test)
        evaluation_function(y_test, group_labels_test, y_lin_pred, linear_results, game)

        reg_rf = RandomForestRegressor(random_state=0).fit(X_train, y_train)
        y_rf_pred = reg_rf.predict(X_test)
        evaluation_function(y_test, group_labels_test, y_rf_pred, random_forest_results, game)

        reg_mlp = MLPRegressor(hidden_layer_sizes=(64, 32), random_state=1, max_iter=1000).fit(X_train, y_train)
        y_mlp_pred = reg_mlp.predict(X_test)
        evaluation_function(y_test, group_labels_test, y_mlp_pred, mlp_results, game)

    calculate_averages(linear_results)
    calculate_averages(random_forest_results)
    calculate_averages(mlp_results)

pd.DataFrame(linear_results).to_csv('AGAIN Ranking Algorithms/Evaluation/Regression/linear_regression_evaluation_results.csv', index=False)
pd.DataFrame(random_forest_results).to_csv('AGAIN Ranking Algorithms/Evaluation/Regression/random_forest_evaluation_results.csv', index=False)
pd.DataFrame(mlp_results).to_csv('AGAIN Ranking Algorithms/Evaluation/Regression/mlp_evaluation_results.csv', index=False)