In [935]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation, reset_parameter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr
from sklearn.metrics import ndcg_score
import os



### Data processing

1. **Reading the CSV File:**
    - The code reads a CSV file into a Pandas DataFrame.

2. **Shuffling the Data:**
    - It shuffles the DataFrame's rows randomly using `sample()` and resets the index.

3. **Encoding Categorical Columns:**
    - A loop encodes several categorical columns (e.g., "Query", "Name", etc.) into integer values and creates new columns with the encoded values.

4. **Creating a 'Score_label' Column:**
    - It generates a new `Score_label` column by scaling and converting the `Normalized_Score` to integers.

5. **Displaying the Data:**
    - Finally, the first few rows of the updated DataFrame are printed for verification.


In [None]:
df = pd.read_csv("dataset_scores_basic.csv")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

for col in ["Query", "Name", "Component", "System", "Property", "Measurement"]:
    df[f"{col}_encoded"] = df[col].astype("category").cat.codes

df['Score_label'] = (df['Normalized_Score'] * 10).astype(int).clip(0, 4) 

print(df.head())

                     Query LOINC Code  \
0         glucose in blood    62245-6   
1           cells in urine    53227-5   
2         glucose in blood      807-8   
3  white blood cells count    26471-3   
4           cells in urine    99865-8   

                                                Name  \
0  nucleated erythrocyte leukocyte ratio blood fe...   
1              leukocyte area cervix wet preparation   
2     leukocyte volume pleural fluid automated count   
3                          leukocyte leukocyte blood   
4  acanthocyte presence urine sediment computer a...   

                         Component       System              Property  \
0  erythrocyte nucleated leukocyte  blood fetus                 ratio   
1                        leukocyte          cvx                 naric   
2                        leukocyte    plr field  number concentration   
3              leukocyte leukocyte        blood                   nfr   
4                      acanthocyte    urine sed    

### Data Split

1. **Splitting the Data:**
    - The `train_test_split()` function is used to split the DataFrame `df` into training and testing sets.
    - 80% of the data is used for training (`train_data`), and 20% is reserved for testing (`test_data`).

2. **Displaying Data Sizes:**
    - The sizes of the training and testing sets are printed using `shape[0]`, which gives the number of rows in each dataset.


In [937]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

print("Train data size: ", train_data.shape[0])
print("Test data size: ", test_data.shape[0])


Train data size:  5779
Test data size:  1445


### Data preparation for trainning

1. **Defining Features:**
    - A list of feature columns (`features`) is created, which includes the encoded versions of categorical columns (e.g., "Query_encoded", "Name_encoded", etc.).

2. **Preparing Training Data:**
    - `X_train` contains the feature values from `train_data`.
    - `y_train` contains the target values (`Score_label`) from `train_data`.
    - `q_train` calculates the size of each group based on the "Query_encoded" column using `groupby()`.

3. **Preparing Testing Data:**
    - `X_test` contains the feature values from `test_data`.
    - `y_test` contains the target values (`Score_label`) from `test_data`.
    - `q_test` calculates the size of each group based on the "Query_encoded" column for the test data.

4. **Creating LightGBM Datasets:**
    - `train_data` and `test_data` are converted into LightGBM datasets (`lgb.Dataset`), which are required for training a LightGBM model. These datasets include the feature data (`X_train`, `X_test`), target labels (`y_train`, `y_test`), and group sizes (`q_train`, `q_test`).

5. **Displaying Training and Testing Data:**
    - The first few rows of `X_train` and `X_test` are printed to verify the data.


In [938]:
features = [
    "Query_encoded", "Name_encoded", "Component_encoded",
    "System_encoded", "Property_encoded", "Measurement_encoded"
]

X_train = train_data[features]
y_train = train_data["Score_label"]
q_train = train_data.groupby("Query_encoded").size().values  

X_test = test_data[features]
y_test = test_data["Score_label"]
q_test = test_data.groupby("Query_encoded").size().values  

train_data = lgb.Dataset(X_train, label=y_train, group=q_train)
test_data = lgb.Dataset(X_test, label=y_test, group=q_test, reference=train_data)

print(X_train.head())
print(X_test.head())

      Query_encoded  Name_encoded  Component_encoded  System_encoded  \
6110              1          1046                273               8   
4962              2           580                182              34   
4152              3           628                199              29   
911               2           646                203              16   
6541              2           928                242              29   

      Property_encoded  Measurement_encoded  
6110                17                   -1  
4962                16                    0  
4152                17                   -1  
911                 21                   -1  
6541                17                   -1  
      Query_encoded  Name_encoded  Component_encoded  System_encoded  \
1593              1           414                138              45   
6952              2            61                  8              14   
1790              3            22                253              14   
434

### Model architecture

1. **Setting Hyperparameters:**
    - A dictionary `params` is defined to specify the hyperparameters for training a LightGBM model. These include:
        - `objective`: The type of learning task ("rank_xendcg" for ranking tasks).
        - `metric`: Evaluation metric used ("ndcg" for normalized discounted cumulative gain).
        - `boosting_type`: Boosting method ("gbdt" for Gradient Boosting Decision Trees).
        - `num_leaves`: Number of leaves in the tree.
        - `learning_rate`: Step size for each iteration.
        - `max_depth`: Maximum depth of the tree.
        - `verbosity`: Controls the amount of output during training.
        - `lambda_l1` and `lambda_l2`: L1 and L2 regularization terms.
        - `colsample_bytree`: Fraction of features to be used for each tree.

2. **Training the LightGBM Model:**
    - The `lgb.train()` function is used to train the LightGBM model with the specified `params`, using `train_data` as the training dataset.
    - `valid_sets=[test_data]` specifies the test dataset to be used for validation during training


In [939]:
params = {
    "objective": "rank_xendcg",  
    "metric": "ndcg",
    "boosting_type": "gbdt",
    "num_leaves": 20,  
    "learning_rate": 0.05,  
    "max_depth": 6,  
    "verbosity": -1,
    "lambda_l1": 0.001,  
    "lambda_l2": 0.001,
    "colsample_bytree": 0.9,  
    "subsample": 0.8,
    "min_child_samples": 20,  
    "min_child_weight": 0.01,
    "max_position": 10,
    "feature_fraction": 0.9,
    "label_gain": [0, 1, 3, 7, 15],  
    "n_estimators": 10000,
}

model = lgb.train(
    params,
    train_data,
    valid_sets=[test_data],
    num_boost_round=5000,
    callbacks=[
        early_stopping(stopping_rounds=500), 
        log_evaluation(100),
        reset_parameter(learning_rate=lambda iter: 0.05 * (0.99 ** iter))
]
)


Training until validation scores don't improve for 500 rounds
[100]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[200]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[300]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[400]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[500]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
Early stopping, best iteration is:
[1]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 0.855575	valid_0's ndcg@3: 0.889463	valid_0's ndcg@4: 0.843038	valid_0's ndcg@5: 0.863632


### Test

1. **Making Predictions:**
    - `y_pred = model.predict(X_test)` uses the trained LightGBM model to make predictions on the test data (`X_test`).

2. **Scaling the Predictions:**
    - A `MinMaxScaler` is initialized to scale the predictions (`y_pred`) to a range between 0 and 1.
    - The predictions are reshaped, scaled, and then flattened back to the original shape.

3. **Creating a DataFrame for the Test Data:**
    - `df_test` is created by copying `X_test`.
    - New columns are added to `df_test`:
        - `Predicted_Score`: The scaled predictions.
        - `Actual_Score`: The actual values from the `Normalized_Score` column in the original DataFrame (`df`).
        - `Name` and `Query`: The corresponding values from the original DataFrame (`df`).

4. **Sorting the Results:**
    - The DataFrame is sorted by `Query` and `Predicted_Score` in ascending order by `Query` and descending order by `Predicted_Score`.

5. **Saving the Results:**
    - The sorted DataFrame `df_test` is saved to a CSV file called "ranked_results.csv".


In [940]:
output_filename = "results.csv"
y_pred = model.predict(X_test)  

scaler = MinMaxScaler(feature_range=(0, 1))
y_pred = scaler.fit_transform(y_pred.reshape(-1, 1)).flatten()

df_test = X_test.copy()

df_test["Predicted Score"] = y_pred
df_test["Actual Score"] = df.loc[X_test.index, "Normalized_Score"]
df_test["Name"] = df.loc[X_test.index, "Name"]
df_test["Query"] = df.loc[X_test.index, "Query"]
df_test["LOINC Code"] = df.loc[X_test.index, "LOINC Code"]

df_test = df_test[["LOINC Code", "Query", "Name", "Predicted Score", "Actual Score"]]

df_test = df_test.sort_values(by=["Query", "Predicted Score"], ascending=[True, False])

df_test.to_csv(output_filename, mode='a', index=False, header=not os.path.exists(output_filename))

print("Ranked results saved to {output_filename}")


Ranked results saved to {output_filename}


### Metrics

1. **Extracting True Scores:**
    - Extract the true scores (`Actual_Score`) from the `df_test` DataFrame corresponding to the indices of `X_test`.

2. **Adjusting Predictions and True Scores:**
    - A margin of 0.05 is defined to determine how close the predicted values should be to the actual values.
    - `y_pred_adjusted` checks if the absolute difference between predicted and actual scores is within the margin, essentially marking whether the prediction is considered "correct."
    - `y_true_adjusted` checks if the true scores are within the margin of themselves (which will always be `True`, so this step doesn’t affect the results).

3. **Calculating Accuracy and F1 Score:**
    - `accuracy_score` calculates the proportion of predictions that are correct based on the margin.
    - `f1_score` calculates the F1 score, which is the harmonic mean of precision and recall. It provides a balance between precision and recall.

4. **Calculating Regression Metrics:**
    - `mean_squared_error (MSE)` calculates the average squared difference between predicted and true values, indicating the overall error of the predictions.
    - `r2_score (R²)` measures the proportion of variance in the true values that is explained by the model, with values closer to 1 indicating better fit.
    - `spearmanr` calculates Spearman’s rank correlation coefficient, measuring the monotonic relationship between predicted and true values. A value close to 1 indicates a strong positive correlation.

5. **Calculating NDCG (Normalized Discounted Cumulative Gain):**
    - The true and predicted scores are grouped by the "Query" column to calculate the ranking scores for each query.
    - The `ndcg_score` is calculated for each query by comparing the true and predicted ranked lists. It measures how well the model's ranking matches the true ranking.
    - The average NDCG score across all queries is then computed using `np.mean()`.




In [941]:
y_true = df_test.loc[X_test.index, "Actual Score"]  

mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
spearman_corr, _ = spearmanr(y_true, y_pred)


y_true_grouped = df_test.groupby("Query")["Actual Score"].apply(list).tolist()
y_pred_grouped = df_test.groupby("Query")["Predicted Score"].apply(list).tolist()

ndcg_scores = [ndcg_score([true], [pred]) for true, pred in zip(y_true_grouped, y_pred_grouped)]
ndcg_mean = np.mean(ndcg_scores) 


print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R²): {r2:.4f}")
print(f"Spearman's Rank Correlation: {spearman_corr:.4f}")
print(f"NDCG Mean Score: {ndcg_mean:.4f}")

queries = df_test.groupby("Query").first().index.tolist()

for query, true, pred in zip(queries, y_true_grouped, y_pred_grouped):
    ndcg = ndcg_score([true], [pred])
    print(f"- NDCG for '{query}': {ndcg:.4f}")

Mean Squared Error (MSE): 0.0285
R-squared (R²): -0.0209
Spearman's Rank Correlation: 0.6285
NDCG Mean Score: 0.8845
- NDCG for 'bilirubin in plasma': 0.8615
- NDCG for 'calcium in serum': 0.9149
- NDCG for 'cells in urine': 0.8793
- NDCG for 'glucose in blood': 0.9091
- NDCG for 'white blood cells count': 0.8577
