In [27]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr
from sklearn.metrics import ndcg_score


Set `output_filename` below to the path of the results CSV file to load.

In [28]:
# Path to the results CSV. A later cell loads it with pd.read_csv and requires the
# "Actual Score", "Predicted Score" and "Query" columns to be present.
output_filename = "./Results/results_enhanced_5.csv"

### Metrics

1. **Extracting True and Predicted Scores:**
    - Extract the true scores (`Actual Score`) from the `df_test` DataFrame.
    - Extract the predicted scores (`Predicted Score`) from the `df_test` DataFrame.

2. **Adjusting Predictions and True Scores:**
    - A margin of 0.05 is defined to determine how close the predicted values should be to the actual values.
    - `y_pred_adjusted` checks whether the absolute difference between the predicted and actual scores is within the margin, i.e. whether each prediction counts as "correct."
    - `y_true_adjusted` checks whether the true scores are within the margin of themselves (this is always `True`, so the step does not affect the results).
    - **Note:** the code cell below does not define the margin, `y_pred_adjusted`, or `y_true_adjusted`; all metrics are computed directly from the raw `Actual Score` and `Predicted Score` values.


3. **Calculating Regression Metrics:**
    - `mean_squared_error (MSE)` calculates the average squared difference between predicted and true values, indicating the overall error of the predictions.
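    - `r2_score (R²)` measures the proportion of variance in the true values that is explained by the model, with values closer to 1 indicating a better fit. The score can be negative, which means the predictions fit worse than always predicting the mean of the true scores.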
    - `spearmanr` calculates Spearman’s rank correlation coefficient, measuring the monotonic relationship between predicted and true values. A value close to 1 indicates a strong positive correlation.

4. **Calculating NDCG (Normalized Discounted Cumulative Gain):**
    - The true and predicted scores are grouped by the "Query" column to calculate the ranking scores for each query.
    - The `ndcg_score` is calculated for each query by comparing the true and predicted ranked lists. It measures how well the model's ranking matches the true ranking.
    - The average NDCG score across all queries is then computed using `np.mean()`.
    - The individual NDCG score of each query is also shown.




In [29]:
# Load the prediction results written by the evaluation run.
df_test = pd.read_csv(output_filename)

# Fail fast when a column the metrics below depend on is absent.
if not {"Actual Score", "Predicted Score", "Query"}.issubset(df_test.columns):
    raise ValueError("Missing necessary columns in the dataset!")

y_true = df_test["Actual Score"]
y_pred = df_test["Predicted Score"]

# Regression and rank-correlation metrics computed over all rows at once.
mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
spearman_corr, _ = spearmanr(y_true, y_pred)

# Collect the query labels and the per-query score lists in a single groupby pass
# so that the three lists are aligned by construction. Taking the labels from
# df_test["Query"].unique() instead would pair scores with the wrong query name
# whenever the CSV rows are not already sorted by "Query": groupby yields its
# keys sorted, whereas unique() keeps the order of first appearance.
queries = []
y_true_grouped = []
y_pred_grouped = []
for query, group in df_test.groupby("Query"):
    queries.append(query)
    y_true_grouped.append(group["Actual Score"].tolist())
    y_pred_grouped.append(group["Predicted Score"].tolist())

# Each query is scored on its own: its lists are wrapped in an outer list so that
# ndcg_score receives one 2-D row (a single ranking) per call.
ndcg_scores = [
    ndcg_score(np.array([true]), np.array([pred]))
    for true, pred in zip(y_true_grouped, y_pred_grouped)
]
ndcg_mean = np.mean(ndcg_scores)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R²): {r2:.4f}")
print(f"Spearman's Rank Correlation: {spearman_corr:.4f}")
print(f"NDCG Mean Score: {ndcg_mean:.4f}")

# Reuse the scores computed above instead of recomputing NDCG for every query.
for query, ndcg in zip(queries, ndcg_scores):
    print(f"- NDCG for '{query}': {ndcg:.4f}")

Mean Squared Error (MSE): 0.0191
R-squared (R²): -0.6009
Spearman's Rank Correlation: 0.4615
NDCG Mean Score: 0.9517
- NDCG for 'bilirubin in plasma': 0.9499
- NDCG for 'calcium in serum': 0.9637
- NDCG for 'cells in urine': 0.9448
- NDCG for 'glucose in blood': 0.9663
- NDCG for 'white blood cells count': 0.9339
