In [1419]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation, reset_parameter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os



### Data processing

1. **Reading the CSV File:**
    - The code reads a CSV file into a Pandas DataFrame.

2. **Shuffling the Data:**
    - It shuffles the DataFrame's rows randomly using `sample()` and resets the index.

3. **Encoding Categorical Columns:**
    - A loop encodes several categorical columns (e.g., "Query", "Name", etc.) into integer values and creates new columns with the encoded values.

4. **Creating a 'Score_label' Column:**
    - It generates a new `Score_label` column by scaling and converting the `Normalized_Score` to integers.

5. **Displaying the Data:**
    - Finally, the first few rows of the updated DataFrame are printed for verification.


In [1420]:
df = pd.read_csv("./Datasets_with_scores/dataset_enhanced_5.csv")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

for col in ["Query", "Name", "Component", "System", "Property", "Measurement"]:
    df[f"{col}_encoded"] = df[col].astype("category").cat.codes

df['Score'] = (df['Normalized_Score'] * 10).astype(int).clip(0, 4) 

print(df.head())

                     Query LOINC Code  \
0         calcium in serum   104877-6   
1  white blood cells count    14859-3   
2         glucose in blood     2612-0   
3      bilirubin in plasma    54352-0   
4           cells in urine    12733-2   

                                                Name  \
0  hematocrit volume fraction blood 1 5 hour post...   
1  leukocyte leukocyte peritoneal fluid manual count   
2             methemalbumin mass volume serum plasma   
3                         nitrogen mass volume urine   
4    tryptase enzymatic activity volume serum plasma   

                           Component         System            Property  \
0  hematocrit 1 5h post dose glucose          blood                 vfr   
1                leukocyte leukocyte  periton fluid                 nfr   
2                      methemalbumin   serum plasma  mass concentration   
3                           nitrogen          urine  mass concentration   
4                           tryptase   se

### Data Split

1. **Splitting the Data:**
    - The `train_test_split()` function is used to split the DataFrame `df` into training and testing sets.
    - 80% of the data is used for training (`train_data`), and 20% is reserved for testing (`test_data`).

2. **Displaying Data Sizes:**
    - The sizes of the training and testing sets are printed using `shape[0]`, which gives the number of rows in each dataset.


In [1421]:
train_data, test_data = train_test_split(df, test_size=0.25, random_state=42)

print("Train data size: ", train_data.shape[0])
print("Test data size: ", test_data.shape[0])


Train data size:  21603
Test data size:  7202


### Data preparation for trainning

1. **Defining Features:**
    - A list of feature columns (`features`) is created, which includes the encoded versions of categorical columns (e.g., "Query_encoded", "Name_encoded", etc.).

2. **Preparing Training Data:**
    - `X_train` contains the feature values from `train_data`.
    - `y_train` contains the target values (`Score_label`) from `train_data`.
    - `q_train` calculates the size of each group based on the "Query_encoded" column using `groupby()`.

3. **Preparing Testing Data:**
    - `X_test` contains the feature values from `test_data`.
    - `y_test` contains the target values (`Score_label`) from `test_data`.
    - `q_test` calculates the size of each group based on the "Query_encoded" column for the test data.

4. **Creating LightGBM Datasets:**
    - `train_data` and `test_data` are converted into LightGBM datasets (`lgb.Dataset`), which are required for training a LightGBM model. These datasets include the feature data (`X_train`, `X_test`), target labels (`y_train`, `y_test`), and group sizes (`q_train`, `q_test`).

5. **Displaying Training and Testing Data:**
    - The first few rows of `X_train` and `X_test` are printed to verify the data.


In [1422]:
features = [
    "Query_encoded", "Name_encoded", "Component_encoded",
    "System_encoded", "Property_encoded", "Measurement_encoded"
]

X_train = train_data[features]
y_train = train_data["Score"]
q_train = train_data.groupby("Query_encoded").size().values  

X_test = test_data[features]
y_test = test_data["Score"]
q_test = test_data.groupby("Query_encoded").size().values  

train_data = lgb.Dataset(X_train, label=y_train, group=q_train, free_raw_data=False)
test_data = lgb.Dataset(X_test, label=y_test, group=q_test, reference=train_data, free_raw_data=False)

print(X_train.head())
print(X_test.head())

       Query_encoded  Name_encoded  Component_encoded  System_encoded  \
7417               1          4465               2111              32   
11031              4           813                371             113   
24328              4          1222                561             150   
14999              2          3003               1505             113   
5504               1          2926               1396             113   

       Property_encoded  Measurement_encoded  
7417                 35                   -1  
11031                24                   22  
24328                35                   -1  
14999                24                   22  
5504                 24                   22  
       Query_encoded  Name_encoded  Component_encoded  System_encoded  \
6011               0          4466               2111              32   
26323              2           493                214             140   
18149              1          1644                774       

### Model architecture

1. **Setting Hyperparameters:**
    - The model uses `rank_xendcg` as the objective and `ndcg` as the evaluation metric
    - A dictionary `params` is defined to specify the hyperparameters for training a LightGBM model. These include:
        - `objective`: The type of learning task ("rank_xendcg" for ranking tasks).
        - `metric`: Evaluation metric used ("ndcg" for normalized discounted cumulative gain).
        - `boosting_type`: Boosting method ("gbdt" for Gradient Boosting Decision Trees).
        - `num_leaves`: Number of leaves in the tree.
        - `learning_rate`: Step size for each iteration.
        - `max_depth`: Maximum depth of the tree.
        - `verbosity`: Controls the amount of output during training.
        - `lambda_l1` and `lambda_l2`: L1 and L2 regularization terms.
        - `colsample_bytree`: Fraction of features to be used for each tree.
        - `label_gain`: Prioritizes higher-ranking samples.
        - `n_estimators`: sets a high number of rounds for better performance

2. **Adaptive Training with AdaRank:**
    
    Since LightGBM doesn’t directly support *AdaRank*, this code simulates its boosting and reweighting mechanism using *LightGBM's* as the base model.  
    - The model trains over multiple iterations (`n_iterations`), updating weights based on prediction errors.  
    - Misclassified samples get higher weights, forcing the model to focus on harder cases.  
    - **Early stopping** prevents overfitting, while **log evaluation** tracks performance every 100 rounds.  


In [1423]:
base_params = {
    "objective": "rank_xendcg",  
    "metric": "ndcg",
    "boosting_type": "gbdt",
    "num_leaves": 31,  
    "learning_rate": 0.005,  
    "max_depth": 7,  
    "verbosity": -1,
    "lambda_l1": 0.001,  
    "lambda_l2": 0.001,
    "colsample_bytree": 0.9,  
    "subsample": 0.8,
    "min_child_samples": 20,  
    "min_child_weight": 0.01,
    "max_position": 10,
    "feature_fraction": 0.95,
    "label_gain": [0, 1, 3, 7, 15],
    "n_estimators": 10000,
}

n_iterations = 50
weights = np.ones(len(X_train))

for i in range(n_iterations):
    print(f"AdaRank Iteration {i + 1}")

    model = lgb.train(
        base_params,
        train_data,
        valid_sets=[test_data],
        num_boost_round=500,
        callbacks=[
            early_stopping(stopping_rounds=50),
            log_evaluation(100)
        ]
    )
    
    y_pred = model.predict(X_train)
    errors = np.abs(y_train - y_pred)
    weights *= 1 + (errors / (errors.std() + 1e-6)) * 0.001
    weights = np.log1p(weights)
    weights = (weights - weights.min()) / (weights.max() - weights.min())
    train_data = lgb.Dataset(X_train, label=y_train, group=q_train, weight=weights)



AdaRank Iteration 1
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 0.975508
AdaRank Iteration 2
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 0.767888	valid_0's ndcg@3: 0.647153	valid_0's ndcg@4: 0.639226	valid_0's ndcg@5: 0.66032
AdaRank Iteration 3
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
AdaRank Iteration 4
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
AdaRank Iteration 5
Training until validation scores don't improve for 50 rounds
Early

### Test

1. **Making Predictions:**
    - `y_pred = model.predict(X_test)` uses the trained LightGBM model to make predictions on the test data (`X_test`).

2. **Scaling the Predictions:**
    - A `MinMaxScaler` is initialized to scale the predictions (`y_pred`) to a range between 0 and 1.
    - The predictions are reshaped, scaled, and then flattened back to the original shape.

3. **Creating a DataFrame for the Test Data:**
    - `df_test` is created by copying `X_test`.
    - New columns are added to `df_test`:
        - `Predicted Score`: The scaled predictions.
        - `Actual Score`: The actual values from the `Normalized_Score` column in the original DataFrame (`df`).
        - `Name` and `Query`: The corresponding values from the original DataFrame (`df`).

4. **Sorting the Results:**
    - The DataFrame is sorted by `Query` and `Predicted Score` in ascending order by `Query` and descending order by `Predicted Score`.

5. **Saving the Results:**
    - The sorted DataFrame `df_test` is saved to a CSV file called "results.csv".


In [1424]:
output_filename = "./Results/results.csv"
y_pred = model.predict(X_test)  

scaler = MinMaxScaler(feature_range=(0, 1))
y_pred = scaler.fit_transform(y_pred.reshape(-1, 1)).flatten()

df_test = X_test.copy()

df_test["Predicted Score"] = y_pred
df_test["Actual Score"] = df.loc[X_test.index, "Normalized_Score"]
df_test["Name"] = df.loc[X_test.index, "Name"]
df_test["Query"] = df.loc[X_test.index, "Query"]
df_test["LOINC Code"] = df.loc[X_test.index, "LOINC Code"]

df_test = df_test[["LOINC Code", "Query", "Name", "Predicted Score", "Actual Score"]]

df_test = df_test.sort_values(by=["Query", "Predicted Score"], ascending=[True, False])

df_test.to_csv(output_filename, mode='a', index=False, header=not os.path.exists(output_filename))

print(f"Ranked results saved to {output_filename}")


Ranked results saved to ./Results/results.csv
