In [431]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, r2_score
from scipy.stats import spearmanr


In [432]:
df = pd.read_csv("dataset_with_scores.csv")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df["Query_encoded"] = df["Query"].astype("category").cat.codes
df["Name_encoded"] = df["Name"].astype("category").cat.codes

df['Score_label'] = (df['Normalized_Score'] * 10).astype(int) 

print(df.head())

                     Query LOINC Code  \
0      bilirubin in plasma     1971-1   
1         glucose in blood     1920-8   
2         glucose in blood     1751-7   
3  white blood cells count     2069-3   
4      bilirubin in plasma    54439-5   

                                      Name                     Component  \
0          bilirubin indirect serum plasma  bilirubin non glucuronidated   
1  aspartate aminotransferase serum plasma    aspartate aminotransferase   
2                     albumin serum plasma                       albumin   
3                           chloride blood                      chloride   
4         calcium bilirubinate total stone    calcium bilirubinate total   

         System                 Property                Measurement  \
0  serum plasma       mass concentration                mass volume   
1  serum plasma       cell concentration  enzymatic activity volume   
2  serum plasma       mass concentration                mass volume   
3         bl

In [433]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)


X_train = train_data[[
    "Query_encoded",   
    "Name_encoded"
]]
y_train = train_data["Score_label"]
q_train = train_data.groupby("Query_encoded").size().values  

X_test = test_data[[
    "Query_encoded",   
    "Name_encoded"
]]
y_test = test_data["Score_label"]
q_test = test_data.groupby("Query_encoded").size().values  

train_data = lgb.Dataset(X_train, label=y_train, group=q_train)
test_data = lgb.Dataset(X_test, label=y_test, group=q_test, reference=train_data)

print(X_train.tail())
print(X_test.tail())

     Query_encoded  Name_encoded
106              1            61
14               2            52
92               0            30
179              2            61
102              2            51
     Query_encoded  Name_encoded
135              2            65
137              0            44
164              1            31
76               1            16
79               0             5


In [None]:
params = {
    "objective": "lambdarank",
    "metric": "ndcg",  
    "boosting_type": "gbdt",
    "num_leaves": 20,
    "learning_rate": 0.01,
    "max_depth": 20,
    "verbosity": -1,
    "lambda_l1": 0.05,
    "lambda_l2": 0.05,
    "colsample_bytree": 0.9
}


model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=10000)


In [435]:
y_pred = model.predict(X_test)  

scaler = MinMaxScaler(feature_range=(0, 1))
y_pred = scaler.fit_transform(y_pred.reshape(-1, 1)).flatten()

df_test = X_test.copy()

df_test["Predicted_Score"] = y_pred
df_test["Actual_Score"] = df.loc[X_test.index, "Normalized_Score"]
df_test["Name"] = df.loc[X_test.index, "Name"]
df_test["Query"] = df.loc[X_test.index, "Query"]

df_test = df_test[["Query", "Name", "Predicted_Score", "Actual_Score"]]

df_test = df_test.sort_values(by=["Query", "Predicted_Score"], ascending=[True, False])

df_test.to_csv("ranked_results.csv", index=False)

print("Ranked results saved to 'ranked_results.csv'")


Ranked results saved to 'ranked_results.csv'


In [436]:
y_true = df_test.loc[X_test.index, "Actual_Score"]  

margin = 0.1

y_pred_adjusted = np.abs(y_pred - y_true) <= margin  
y_true_adjusted = np.abs(y_true - y_true) <= margin  

accuracy = accuracy_score(y_true_adjusted, y_pred_adjusted)
f1 = f1_score(y_true_adjusted, y_pred_adjusted)

mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
spearman_corr, _ = spearmanr(y_true, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R²): {r2:.4f}")
print(f"Spearman's Rank Correlation: {spearman_corr:.4f}")


Accuracy: 0.5122
F1 Score: 0.6774
Mean Squared Error (MSE): 0.0385
R-squared (R²): 0.1863
Spearman's Rank Correlation: 0.4294
