In [1]:
import pandas as pd

# import polars as pl
import numpy as np
import xgboost
import os
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from functools import reduce
import pkg_resources
import types


In [2]:
from src.metrics import evaluate_models, get_final_predictions, train_models,smape1
from src.params import (
    NUM_CANDIDATES,
    INPUT_DIR,
    KAGGLE_INFERENCE,
    TRAIN_CLINICAL_FILE,
    TRAIN_PEPTIDES_FILE,
    TRAIN_PROTEINS_FILE,
    MODELS_DICT,
    FEATURES_MONTH,
    MODEL_USE,
    SVR_PARAMS
)
from src.utils import (
    analyze_patients,
    categorize_patients,
    categorize_test_patients,
    create_train_test_split,
    create_X_y_dict,
    get_peptide_candidates,
    get_protein_candidates,
    preprocessing_data,
    print_requirements,
)

In [3]:
train_clinical_df = pd.read_csv(os.path.join(INPUT_DIR, TRAIN_CLINICAL_FILE ))
train_peptides = pd.read_csv(os.path.join(INPUT_DIR,   TRAIN_PEPTIDES_FILE))
train_proteins = pd.read_csv(os.path.join(INPUT_DIR,  TRAIN_PROTEINS_FILE))


In [4]:
def get_imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package,
            # not just imported function
            name = val.__name__.split(".")[0]

        elif isinstance(val, type):
            name = val.__module__.split(".")[0]

        # Some packages are weird and have different
        # imported names vs. system names
        if name == "PIL":
            name = "Pillow"
        elif name == "sklearn":
            name = "scikit-learn"

        yield name


imports = list(set(get_imports()))

requirements = []
for m in pkg_resources.working_set:
    if m.project_name in imports and m.project_name != "pip":
        requirements.append((m.project_name, m.version))

for r in requirements:
    print("{}=={}".format(*r))

tqdm==4.67.1
xgboost==2.0.0
numpy==1.24.4
pandas==1.5.0
scikit-learn==1.3.1


In [5]:
# get peptide and protein features, which at the end are not used because of the lack of predictive power
peptide_candidates =  get_peptide_candidates(train_peptides, num_candidates=NUM_CANDIDATES)
protein_candidates = get_protein_candidates(train_proteins, num_candidates=NUM_CANDIDATES)
train_df, FEATURES_ewn, features = preprocessing_data(
    train_clinical_df,
    train_peptides,
    train_proteins,
    peptide_candidates,
    protein_candidates,
)
print("train_df:")
display(
    train_df.head(10).style.set_properties(
        **{
            "background-color": "#212636",
            "color": "white",
            "border": "1.5px solid white",
        }
    )
)

train_df:


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,P01861,P16152,P30086,P98160,Q8IWV7,ADDKETC(UniMod_4)FAEEGK,LFDSDPITVTVPVEVSR,LPPTSAHGNVAEGETKPDPDVTER,SC(UniMod_4)SPELQQK,TTPPVLDSDGSFFLYSK
0,55_0,55,0,10.0,6.0,15.0,,,18430.1,47171.0,89747.3,21970.1,57246.2,70412.0,130107.0,129775.0,47171.0,80617.8
1,55_3,55,3,10.0,7.0,25.0,,,,,,,,,,,,
2,55_6,55,6,8.0,10.0,34.0,,,31112.4,44669.0,72686.0,20700.7,158574.0,63052.4,288345.0,874897.0,44669.0,78095.3
3,55_9,55,9,8.0,9.0,30.0,0.0,On,,,,,,,,,,
4,55_12,55,12,10.0,10.0,41.0,0.0,On,22094.8,44159.5,86064.9,19547.8,152944.0,72325.0,282297.0,828847.0,44159.5,81248.8
5,55_18,55,18,7.0,13.0,38.0,0.0,On,,,,,,,,,,
6,55_24,55,24,16.0,9.0,49.0,0.0,On,,,,,,,,,,
7,55_30,55,30,14.0,13.0,49.0,0.0,On,,,,,,,,,,
8,55_36,55,36,17.0,18.0,51.0,0.0,On,21304.2,48076.2,90811.9,21385.7,177998.0,537430.0,71400.1,960251.0,48076.2,6875.79
9,55_42,55,42,12.0,20.0,41.0,0.0,On,,,,,,,,,,


In [6]:
#create one dataframe for each target (updrs_1, updrs_2, updrs_3, updrs_4), for each horizon ( 4, 6,12)
X_dict, y_dict = create_X_y_dict(train_df)

Creating datasets: 100%|██████████| 4/4 [00:26<00:00,  6.67s/it]


In [7]:
# divide train patients in 2 splits based on nº of visits of the patients
patients_array = []
for key, X in X_dict.items():
    X = X_dict[key]
    print(
        key,
        X_dict[key].shape,
        len(X_dict[key].patient_id.unique()),
        X_dict[key].visit_month.unique(),
    )
    patients = X.patient_id.unique()
    patients_array.append(patients)
intersection = reduce(lambda a, b: set(a).intersection(set(b)), patients_array)
union = reduce(lambda a, b: set(a).union(set(b)), patients_array)

intersection = list(intersection)
intersection.sort()
len(intersection), len(union)

updrs_1_plus_0_months (2614, 14) 248 [  0   3   6   9  12  18  24  30  36  42  48  54  60  72  84  96 108]
updrs_1_plus_6_months (1681, 14) 197 [ 0  3  6 12 18 24 30 36 42 48 54]
updrs_1_plus_12_months (1881, 14) 248 [ 0  6 12 18 24 30 36 42 48 60 72 84 96]
updrs_1_plus_24_months (1473, 14) 246 [ 0  6 12 18 24 30 36 48 60 72 84]
updrs_2_plus_0_months (2613, 14) 248 [  0   3   6   9  12  18  24  30  36  42  48  54  60  72  84  96 108]
updrs_2_plus_6_months (1680, 14) 197 [ 0  3  6 12 18 24 30 36 42 48 54]
updrs_2_plus_12_months (1881, 14) 248 [ 0  6 12 18 24 30 36 42 48 60 72 84 96]
updrs_2_plus_24_months (1473, 14) 246 [ 0  6 12 18 24 30 36 48 60 72 84]
updrs_3_plus_0_months (2590, 14) 248 [  0   3   6   9  12  18  24  30  36  42  48  54  60  72  84  96 108]
updrs_3_plus_6_months (1655, 14) 197 [ 0  3  6 12 18 24 30 36 42 48 54]
updrs_3_plus_12_months (1848, 14) 248 [ 0  6 12 18 24 30 36 42 48 60 72 84 96]
updrs_3_plus_24_months (1443, 14) 246 [ 0  6 12 18 24 30 36 48 60 72 84]
updrs_4

(167, 248)

In [8]:

patients_train, patients_test= create_train_test_split(X_dict)

patients_train_good, patients_train_bad, patients_good = categorize_patients(patients_train, intersection, X_dict, train_df)

In [9]:


patients_test_good, patients_test_bad= categorize_test_patients(patients_test, train_df, None)

In [10]:
## train models: 1model for each split (good patients and bad patients), for each target updrs_1, updrs_2, updrs_3, updrs_4, and for each horizon (4,6,12)
## i.e 24 models
models = {"bad": {}, "good": {}}

features = FEATURES_MONTH
df_pred = None
for key, X in X_dict.items():
    target = key.rsplit("_", 3)[0]
    y = y_dict[key]
    y["visit_month"] = y["visit_id"].transform(lambda x: x.split("_")[1]).astype(int)

    model = MODELS_DICT[MODEL_USE]["model"](**MODELS_DICT[MODEL_USE]["params"])
    X_train_good = X.loc[X.patient_id.isin(patients_train_good)]
    y_train_good = y.loc[y.patient_id.isin(patients_train_good)]
    model.fit(X_train_good[features], y_train_good[target])

    X_test_good = X.loc[(X.patient_id.isin(patients_test_good)) | (X.visit_month <= 6)]
    y_test_good = y.loc[(y.patient_id.isin(patients_test_good)) | (X.visit_month <= 6)]

    y_pred_good = model.predict(X_test_good[features])
    y_pred_good[y_pred_good < 0] = 0
    y_pred_good = np.round(y_pred_good)
    df_pred = pd.concat(
        [
            df_pred,
            pd.DataFrame(
                {
                    "real": y_test_good[target],
                    "pred": y_pred_good,
                    "key": key.rsplit("_", 3)[0],
                }
            ),
        ]
    )
    score = smape1(y_test_good[target], y_pred_good)
    print(f"Key {key}, score good  {score}")
    print(X_test_good.shape, y_test_good.shape)

    model_bad = MODELS_DICT[MODEL_USE]["model"](**MODELS_DICT[MODEL_USE]["params"])
    X_train_bad = X.loc[X.patient_id.isin(patients_train_bad)]
    y_train_bad = y.loc[y.patient_id.isin(patients_train_bad)]
    model_bad.fit(X_train_bad[features], y_train_bad[target])

    X_test_bad = X.loc[(X.patient_id.isin(patients_test_bad)) & (X.visit_month > 6)]
    y_test_bad = y.loc[(y.patient_id.isin(patients_test_bad)) & (X.visit_month > 6)]
    if len(X_test_bad) > 0:
        if key.rsplit("_", 3)[0] == "updrs_1":
            y_pred_bad = 3
        elif key.rsplit("_", 3)[0] == "updrs_2":
            y_pred_bad = 1
        elif key.rsplit("_", 3)[0] == "updrs_3":
            y_pred_bad = 1
        df_pred = pd.concat(
            [
                df_pred,
                pd.DataFrame(
                    {
                        "real": y_test_bad[target],
                        "pred": y_pred_bad,
                        "key": key.rsplit("_", 3)[0],
                    }
                ),
            ]
        )
        score = smape1(y_test_bad[target], y_pred_bad)
        print(f"Key {key}, score  bad {score}")
        print(X_test_good.shape, y_test_good.shape)

    X_good = X.loc[(X.patient_id.isin(patients_good))]
    y_good = y.loc[(y.patient_id.isin(patients_good))]
    # X=X.loc[X.patient_id.isin(intersection)]
    # y=y.loc[y.patient_id.isin(intersection)]
    model.fit(X[features], y[target])
    models["good"][key] = model
    models["bad"][key] = model_bad

Key updrs_1_plus_0_months, score good  51.78574370883297
(833, 14) (833, 4)
Key updrs_1_plus_0_months, score  bad 56.88963131586083
(833, 14) (833, 4)
Key updrs_1_plus_6_months, score good  50.946652638874056
(669, 14) (669, 4)
Key updrs_1_plus_6_months, score  bad 68.13852813852813
(669, 14) (669, 4)
Key updrs_1_plus_12_months, score good  49.95667070807212
(628, 14) (628, 4)
Key updrs_1_plus_12_months, score  bad 52.939441510870076
(628, 14) (628, 4)
Key updrs_1_plus_24_months, score good  49.83954966107597
(560, 14) (560, 4)
Key updrs_1_plus_24_months, score  bad 50.008150579579144
(560, 14) (560, 4)
Key updrs_2_plus_0_months, score good  64.97296643152275
(832, 14) (832, 4)
Key updrs_2_plus_0_months, score  bad 61.087218792136824
(832, 14) (832, 4)
Key updrs_2_plus_6_months, score good  61.70049599740553
(668, 14) (668, 4)
Key updrs_2_plus_6_months, score  bad 33.33333333333333
(668, 14) (668, 4)
Key updrs_2_plus_12_months, score good  68.09938342860106
(628, 14) (628, 4)
Key updrs

In [11]:
df_pred.loc[df_pred.key == "updrs_4", "pred"] = 0
print(f'Smape with test date {smape1(df_pred["real"], df_pred["pred"])}')

Smape with test date 58.40665518807901


In [12]:
# this is to infer the kagle data
# the notebook runs on a Kaggle instance in a process you cannot see, and you also cannot view the data.
#to submit the predictionss to kagle competition using kagle specific api
if KAGGLE_INFERENCE:
    import amp_pd_peptide

    env = amp_pd_peptide.make_env()
    iter_test = env.iter_test()

    i = 0
    samples = []
    test_clinical_df_acumulated = None
    train_peptides_df_acumulated = None
    train_proteins_df_acumulated = None
    for (
        test_clinical_df,
        train_peptides_df,
        train_proteins_df,
        sample_submission_df,
    ) in iter_test:
        print(f"Iteration {i}")
        test_clinical_df_acumulated = pd.concat([test_clinical_df_acumulated, test_clinical_df])
        train_peptides_df_acumulated = pd.concat([train_peptides_df_acumulated, train_peptides_df])
        train_proteins_df_acumulated = pd.concat([train_proteins_df_acumulated, train_proteins_df])
        sample_submission_df_old = sample_submission_df.copy()
        sample_submission_df["updrs_test"] = sample_submission_df["prediction_id"].transform(
            lambda x: x.split("_plus")[0].split("_", 2)[2]
        )
        sample_submission_df["visit_id"] = sample_submission_df["prediction_id"].transform(
            lambda x: x.rsplit("_", 5)[0]
        )
        sample_submission_df["model_key"] = sample_submission_df["prediction_id"].transform(
            lambda x: x.split("_", 2)[2]
        )
        sample_submission_df = sample_submission_df.drop("rating", axis=1)
        test_df, _, _ = preprocessing_data(
            test_clinical_df_acumulated,
            train_peptides_df_acumulated,
            train_proteins_df_acumulated,
            peptide_candidates,  # defined in training
            protein_candidates,  # defined in training
            train=False,
        )
        if test_clinical_df.visit_month.unique().min() > 6:
            groups_test = test_df.groupby(["patient_id", "visit_month"]).size()
            index = pd.MultiIndex.from_product(
                [
                    groups_test.index.get_level_values(0),
                    groups_test.index.get_level_values(1),
                ],
                names=groups_test.index.names,
            )
            groups_test = groups_test.reindex(index, fill_value=0).reset_index().rename(columns={0: "count"})
            patients_test_bad = groups_test.query("count==0 and visit_month==6").patient_id.unique()
            patients_test_good = list(set(test_df.patient_id.unique()) - set(patients_test_bad))
        else:
            patients_test_good = test_df.patient_id.unique()
            patients_test_bad = []

        df_predict = None
        for key, df in sample_submission_df.groupby("model_key"):
            df = pd.merge(df, test_df, on=["updrs_test", "visit_id"])
            if key.rsplit("_", 3)[0] != "updrs_4":
                df_good = df.loc[df.patient_id.isin(patients_test_good)]
                df_bad = df.loc[df.patient_id.isin(patients_test_bad)]
                model_good = models["good"][key]
                model_bad = models["bad"][key]
                if len(df_good) > 0:
                    df_good["rating"] = model_good.predict(df_good[features])
                    mask = df_good["rating"] < 0
                    df_good.loc[mask, "rating"] = 0
                    df_good["rating"] = np.round(df_good["rating"])
                if len(df_bad) > 0:
                    if key.rsplit("_", 3)[0] == "updrs_1":
                        df_bad["rating"] = 3
                    elif key.rsplit("_", 3)[0] == "updrs_2":
                        df_bad["rating"] = 1
                    elif key.rsplit("_", 3)[0] == "updrs_3":
                        df_bad["rating"] = 1
                df = pd.concat([df_good, df_bad])
            else:
                print(f"Just 0 for {key}")
                df["rating"] = 0
            df_predict = pd.concat([df_predict, df[["prediction_id", "rating"]]])

        sample_submission_df = pd.merge(sample_submission_df, df_predict, on="prediction_id", how="left").drop(
            ["updrs_test", "visit_id", "model_key"], axis=1
        )
        sample_submission_df = sample_submission_df.fillna(0)
        env.predict(sample_submission_df)

        i += 1