# LambdaMART



In [None]:
# pip install lightgbm

In [2]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from scipy.stats import pearsonr, kendalltau
from sklearn.model_selection import GroupKFold


In [3]:
# Switch the working directory to the project root; the data and output paths
# built later in this file are relative and resolve against it.
# NOTE(review): this is a machine-specific absolute path - adjust when running elsewhere.
project_directory = r'C:\Users\marco\OneDrive\Desktop\Final Year Project'
os.chdir(project_directory)
# Working directory as reported by the OS after the change.
base_dir = os.getcwd() 

In [4]:

# To print the evaluation results
def print_evaluation_callback(period=1, show_stdv=True):
    """Build a callback that prints the evaluation results of an iteration.

    Args:
        period: Results are printed when ``env.iteration`` is a multiple of
            this value. A value of 0 or less disables printing.
        show_stdv: When true, each value is printed with six decimals;
            otherwise the value's default formatting is used.

    Returns:
        A function taking the callback environment ``env``. Its ``order``
        attribute is set to 10.
    """
    # An empty format spec is what a bare ``{value}`` placeholder uses.
    value_spec = ".6f" if show_stdv else ""

    def callback(env):
        # Checking ``period`` first also avoids a modulo by zero.
        if period <= 0 or env.iteration % period != 0:
            return
        # Every entry is followed by a single space, including the last one.
        line = ''.join(
            f"{data_name}'s {eval_name}: {format(value, value_spec)} "
            for data_name, eval_name, value, _ in env.evaluation_result_list
        )
        print(line)

    callback.order = 10
    return callback

# Input CSV, addressed relative to the current working directory.
input_file = os.path.join(
    'AGAIN Ranking Algorithms',
    'Data_Percentiles',
    'ordinal_logistic_regression_percentiles.csv',
)

# Read the whole table into memory; everything below works on this frame.
df = pd.read_csv(input_file)

def concordance_correlation_coefficient(y_true, y_pred):
    """Return the concordance correlation coefficient (CCC) of two series.

    The CCC is ``2 * cov(y_true, y_pred)`` divided by
    ``var(y_true) + var(y_pred) + (mean(y_true) - mean(y_pred)) ** 2``,
    using population (``ddof=0``) statistics.

    The covariance is computed directly rather than as
    ``pearson_r * sd_true * sd_pred``: the Pearson coefficient is undefined
    (NaN) when either series is constant, whereas the covariance is simply 0,
    so a constant series yields a CCC of 0 instead of NaN.

    Args:
        y_true: Sequence of reference values.
        y_pred: Sequence of predicted values, same length as ``y_true``.

    Returns:
        The CCC as a float. NaN is returned when the denominator is zero
        (both series constant and equal), where the ratio is undefined.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mean_true, mean_pred = y_true.mean(), y_pred.mean()
    covariance = np.mean((y_true - mean_true) * (y_pred - mean_pred))
    denominator = y_true.var() + y_pred.var() + (mean_true - mean_pred) ** 2

    # 0/0: no variation and no offset, so the coefficient is undefined.
    if denominator == 0:
        return float('nan')
    return 2 * covariance / denominator

def pearson_correlation_coefficient(y_true, y_pred):
    """Return the Pearson correlation coefficient of ``y_true`` and ``y_pred``.

    Only the first element of the ``scipy.stats.pearsonr`` result is
    returned; the second element is discarded.
    """
    coefficient, _ = pearsonr(y_true, y_pred)
    return coefficient

def kendalls_tau_coefficient(y_true, y_pred):
    """Return Kendall's tau of ``y_true`` and ``y_pred``.

    Only the first element of the ``scipy.stats.kendalltau`` result is
    returned; the second element is discarded.
    """
    tau, _ = kendalltau(y_true, y_pred)
    return tau

def evaluate_individual_performance(X_test, y_test, groups_test, ranker):
    """Score a fitted ranker separately for each group of the test split.

    For every distinct value in ``groups_test`` the matching rows are
    selected, the features are centred on that group's own column means,
    scores are predicted and compared with the labels.

    NOTE(review): the model elsewhere in this file is fitted on features
    that are not centred this way - confirm that centring only at
    evaluation time is intended.

    Args:
        X_test: Feature matrix; rows are selected with a boolean mask.
        y_test: Labels aligned with the rows of ``X_test``.
        groups_test: Group identifier of each row.
        ranker: Object exposing ``predict(features)``.

    Returns:
        A DataFrame with one row per group and the columns
        ``Participant ID``, ``PCC``, ``CCC`` and ``KendallTau``.
    """
    def score_group(group):
        # Rows belonging to this group only.
        mask = groups_test == group
        features, labels = X_test[mask], y_test[mask]

        # Centre each feature column on the group's own mean.
        centred = features - features.mean(axis=0)
        scores = ranker.predict(centred)

        return {
            'Participant ID': group,
            'PCC': pearson_correlation_coefficient(labels, scores),
            'CCC': concordance_correlation_coefficient(labels, scores),
            'KendallTau': kendalls_tau_coefficient(labels, scores),
        }

    return pd.DataFrame([score_group(group) for group in np.unique(groups_test)])

# Every column whose name contains '[general]' is used as a model feature.
feature_cols = [name for name in df.columns if '[general]' in name]

# Whole-dataset arrays: features, labels and the player id of each row.
X = df[feature_cols].values
y = df['arousal_label'].values
groups = df['[control]player_id'].values

# Accumulators filled by the per-game cross-validation further down.
eval_results = []
all_evaluation_results = []

# Distinct games found in the data.
games = df['[control]game'].unique()

# 10-fold splitter that takes a group label per row when splitting.
group_kfold = GroupKFold(n_splits=10)

# Grouped cross-validation, run separately on the rows of each game.
for game in games:
    game_df = df[df['[control]game'] == game]
    X_game = game_df[feature_cols].values
    y_game = game_df['arousal_label'].values
    groups_game = game_df['[control]player_id'].values

    for train_index, test_index in group_kfold.split(X_game, y_game, groups_game):
        # LightGBM applies the `group` sizes to consecutive rows, so the rows
        # of one participant must be contiguous and appear in the same order
        # as the sizes. A stable sort by participant id guarantees both while
        # keeping the original row order inside each participant.
        train_index = train_index[np.argsort(groups_game[train_index], kind='stable')]
        test_index = test_index[np.argsort(groups_game[test_index], kind='stable')]

        X_train, X_test = X_game[train_index], X_game[test_index]
        y_train, y_test = y_game[train_index], y_game[test_index]
        groups_train, groups_test = groups_game[train_index], groups_game[test_index]

        # Query sizes in ascending participant-id order, i.e. the row order
        # established by the sort above.
        _, train_group = np.unique(groups_train, return_counts=True)
        _, test_group = np.unique(groups_test, return_counts=True)

        ranker = lgb.LGBMRanker(
            objective="lambdarank",
            boosting_type="gbdt",
            n_estimators=10,
            importance_type="gain",
            metric="ndcg",
            num_leaves=10,
            learning_rate=0.05,
            force_col_wise=True,
            max_depth=-1,
            # One gain entry per label value from 0 up to the largest label
            # seen in this fold (train or test).
            label_gain=list(range(max(y_train.max(), y_test.max()) + 1)),
        )

        ranker.fit(
            X=X_train,
            y=y_train,
            group=train_group,
            eval_set=[(X_test, y_test)],
            eval_group=[test_group],
            eval_at=[4, 8, 15],
            callbacks=[print_evaluation_callback(period=1)],
        )

        # Collect evaluation results
        print("========================================================================================")
        fold_scores = ranker.best_score_['valid_0']
        eval_result = fold_scores['ndcg@4'], fold_scores['ndcg@8'], fold_scores['ndcg@15']
        eval_results.append(eval_result)

        # Per-participant correlation metrics for this fold, tagged with the game.
        individual_results = evaluate_individual_performance(X_test, y_test, groups_test, ranker)
        individual_results['Game'] = game
        all_evaluation_results.append(individual_results)

output_base = os.path.join('AGAIN Ranking Algorithms', 'Evaluation', 'LambdaMart')

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_base, exist_ok=True)

individual_results_csv = os.path.join(output_base, 'individual_evaluation_results.csv')
game_averages_csv = os.path.join(output_base, 'averages_evaluation_results.csv')

# Concatenate and save all individual evaluations
final_results_df = pd.concat(all_evaluation_results, ignore_index=True)
final_results_df.to_csv(individual_results_csv, index=False)

print("Final individual evaluation results saved.")

# Mean NDCG over all folds of all games; each entry of eval_results holds
# the scores at cut-offs 4, 8 and 15, in that order.
avg_ndcg_at_4, avg_ndcg_at_8, avg_ndcg_at_15 = (
    np.mean([fold_scores[position] for fold_scores in eval_results])
    for position in range(3)
)

print(f"Average NDCG@4 Score: {avg_ndcg_at_4:.4f}")
print(f"Average NDCG@8 Score: {avg_ndcg_at_8:.4f}")
print(f"Average NDCG@15 Score: {avg_ndcg_at_15:.4f}")


# Mean of each evaluation metric over all participants of a game.
# pandas' mean skips NaN entries by default.
game_averages = []
for game in games:
    game_subset = final_results_df[final_results_df['Game'] == game]
    avg_pcc = game_subset['PCC'].mean()
    avg_ccc = game_subset['CCC'].mean()
    avg_kendall_tau = game_subset['KendallTau'].mean()

    game_averages.append({
        'Game': game,
        'Average PCC': avg_pcc,
        'Average CCC': avg_ccc,
        'Average KendallTau': avg_kendall_tau,
    })

# Convert to DataFrame
game_averages_df = pd.DataFrame(game_averages)
game_averages_df.to_csv(game_averages_csv, index=False)

print("Game averages evaluation results saved.")


found 0 physical cores < 1
  File "c:\Users\marco\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


[LightGBM] [Info] Total Bins 2477
[LightGBM] [Info] Number of data points in the train set: 4042, number of used features: 11
valid_0's ndcg@4: 0.569734 valid_0's ndcg@8: 0.608355 valid_0's ndcg@15: 0.706280 
valid_0's ndcg@4: 0.669138 valid_0's ndcg@8: 0.751442 valid_0's ndcg@15: 0.755153 
valid_0's ndcg@4: 0.705482 valid_0's ndcg@8: 0.761335 valid_0's ndcg@15: 0.781411 
valid_0's ndcg@4: 0.702994 valid_0's ndcg@8: 0.733086 valid_0's ndcg@15: 0.767057 
valid_0's ndcg@4: 0.737140 valid_0's ndcg@8: 0.751213 valid_0's ndcg@15: 0.778335 
valid_0's ndcg@4: 0.725222 valid_0's ndcg@8: 0.753276 valid_0's ndcg@15: 0.782463 
valid_0's ndcg@4: 0.707542 valid_0's ndcg@8: 0.754682 valid_0's ndcg@15: 0.781217 
valid_0's ndcg@4: 0.740331 valid_0's ndcg@8: 0.772492 valid_0's ndcg@15: 0.795134 
valid_0's ndcg@4: 0.718476 valid_0's ndcg@8: 0.767949 valid_0's ndcg@15: 0.797729 
valid_0's ndcg@4: 0.725247 valid_0's ndcg@8: 0.773714 valid_0's ndcg@15: 0.799091 
[LightGBM] [Info] Total Bins 2475
[LightGBM]