## Idea:
* Use month trend from [Only Trends](https://www.kaggle.com/code/vitalykudelya/only-trends)
* Divide the NPX values of a protein into several groups and find the best shift to add to the month-trend predictions for each group
* Sum predictions from the month trend and the corresponding NPX group shift

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm.auto import tqdm

import plotly.express as px

# import amp_pd_peptide

from scipy.optimize import minimize

## Generate Train Dataset

In [2]:
# Load the clinical visits and the long-format protein measurements.
train_clinical_all = pd.read_csv('./data/train_clinical_data.csv')
print(train_clinical_all.shape)
proteins = pd.read_csv('./data/train_proteins.csv')
print(proteins.shape)

# Reshape to one row per visit_id and one column per UniProt id, holding NPX.
proteins_features = proteins.pivot_table(
    index='visit_id',
    columns='UniProt',
    values='NPX',
    aggfunc='sum',
)
display(proteins_features)

# Left join on visit_id keeps every clinical visit, including those that have
# no matching row in the protein table.
train_clinical_all = pd.merge(
    train_clinical_all,
    proteins_features,
    how='left',
    left_on='visit_id',
    right_index=True,
)
display(train_clinical_all)

(2615, 8)
(232741, 5)


UniProt,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,O60888,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10053_0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,129048.0,...,,9469.45,94237.6,,23016.0,177983.0,65900.0,15382.0,,19017.40
10053_12,10464.20,435586.0,,,,,197117.0,15099.1,164268.0,108114.0,...,,14408.40,,,28537.0,171733.0,65668.1,,9295.65,25697.80
10053_18,13235.70,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,163776.0,...,317477.0,38667.20,111107.0,,37932.6,245188.0,59986.1,10813.3,,29102.70
10138_12,12600.20,494581.0,9165.06,27193.5,22506.10,6015.90,156313.0,54546.4,204013.0,56725.0,...,557904.0,44556.90,155619.0,14647.90,36927.7,229232.0,106564.0,26077.7,21441.80,7642.42
10138_24,12003.20,522138.0,4498.51,17189.8,29112.40,2665.15,151169.0,52338.1,240892.0,85767.1,...,,47836.70,177619.0,17061.10,25510.4,176722.0,59471.4,12639.2,15091.40,6168.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8699_24,9983.00,400290.0,24240.10,,16943.50,6303.17,77493.6,46435.3,254247.0,138910.0,...,,25690.60,,6859.82,19106.7,121161.0,113872.0,14413.9,28225.50,8062.07
942_12,6757.32,360858.0,18367.60,14760.7,18603.40,1722.77,86847.4,37741.3,212132.0,100519.0,...,45742.3,33518.60,94049.7,13415.70,21324.7,234094.0,82410.4,19183.7,17804.10,12277.00
942_24,,352722.0,22834.90,23393.1,16693.50,1487.91,114772.0,36095.7,185836.0,99183.5,...,180475.0,29770.60,95949.9,11344.40,23637.6,256654.0,76931.9,19168.2,19215.90,14625.60
942_48,11627.80,251820.0,22046.50,26360.5,22440.20,2117.43,82241.9,30146.6,167633.0,84875.1,...,197987.0,29283.80,121696.0,19169.80,16724.9,232301.0,96905.9,21120.9,14089.80,16418.50


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,O00391,O00533,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
0,55_0,55,0,10.0,6.0,15.0,,,11254.3,732430.0,...,365475.0,35528.0,97005.6,23122.5,60912.6,408698.0,,29758.8,23833.7,18953.5
1,55_3,55,3,10.0,7.0,25.0,,,,,...,,,,,,,,,,
2,55_6,55,6,8.0,10.0,34.0,,,13163.6,630465.0,...,405676.0,30332.6,109174.0,23499.8,51655.8,369870.0,,22935.2,17722.5,16642.7
3,55_9,55,9,8.0,9.0,30.0,0.0,On,,,...,,,,,,,,,,
4,55_12,55,12,10.0,10.0,41.0,0.0,On,15257.6,815083.0,...,303953.0,43026.2,114921.0,21860.1,61598.2,318553.0,65762.6,29193.4,28536.1,19290.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0,Off,10589.6,902434.0,...,223136.0,21659.9,116470.0,14073.3,48796.4,320821.0,39046.7,,20198.8,39535.0
2611,65043_54,65043,54,4.0,8.0,11.0,1.0,Off,,,...,,,,,,,,,,
2612,65043_60,65043,60,6.0,6.0,16.0,1.0,Off,,,...,,,,,,,,,,
2613,65043_72,65043,72,3.0,9.0,14.0,1.0,Off,,,...,,,,,,,,,,


In [3]:
# Within each patient, carry the last observed protein values forward along the
# existing row order so that visits without a protein measurement reuse them.
# `GroupBy.ffill()` is the supported spelling; `fillna(method='ffill')` is deprecated.
# (Alternative tried earlier: the mean of the per-patient forward- and backward-fill.)
train_clinical_all[proteins_features.columns] = (
    train_clinical_all.groupby('patient_id')[proteins_features.columns].ffill()
)
display(train_clinical_all)

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,O00391,O00533,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
0,55_0,55,0,10.0,6.0,15.0,,,11254.3,732430.0,...,365475.0,35528.0,97005.6,23122.5,60912.6,408698.0,,29758.8,23833.7,18953.5
1,55_3,55,3,10.0,7.0,25.0,,,11254.3,732430.0,...,365475.0,35528.0,97005.6,23122.5,60912.6,408698.0,,29758.8,23833.7,18953.5
2,55_6,55,6,8.0,10.0,34.0,,,13163.6,630465.0,...,405676.0,30332.6,109174.0,23499.8,51655.8,369870.0,,22935.2,17722.5,16642.7
3,55_9,55,9,8.0,9.0,30.0,0.0,On,13163.6,630465.0,...,405676.0,30332.6,109174.0,23499.8,51655.8,369870.0,,22935.2,17722.5,16642.7
4,55_12,55,12,10.0,10.0,41.0,0.0,On,15257.6,815083.0,...,303953.0,43026.2,114921.0,21860.1,61598.2,318553.0,65762.6,29193.4,28536.1,19290.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0,Off,10589.6,902434.0,...,223136.0,21659.9,116470.0,14073.3,48796.4,320821.0,39046.7,,20198.8,39535.0
2611,65043_54,65043,54,4.0,8.0,11.0,1.0,Off,10589.6,902434.0,...,223136.0,21659.9,116470.0,14073.3,48796.4,320821.0,39046.7,,20198.8,39535.0
2612,65043_60,65043,60,6.0,6.0,16.0,1.0,Off,10589.6,902434.0,...,223136.0,21659.9,116470.0,14073.3,48796.4,320821.0,39046.7,,20198.8,39535.0
2613,65043_72,65043,72,3.0,9.0,14.0,1.0,Off,10589.6,902434.0,...,223136.0,21659.9,116470.0,14073.3,48796.4,320821.0,39046.7,,20198.8,39535.0


In [4]:
# The month being predicted for the current visit itself (horizon 0).
train_clinical_all['pred_month'] = train_clinical_all['visit_month']

# For each horizon, self-join the frame so that every visit row also carries the
# UPDRS scores (and month) of the same patient's visit `plus_month` months later.
for plus_month in [6, 12, 24]:
    future_names = {f'updrs_{i}': f'updrs_{i}_plus_{plus_month}' for i in range(1, 5)}
    future_names['pred_month'] = f'pred_month_plus_{plus_month}'

    train_shift = train_clinical_all[
        ['patient_id', 'visit_month', 'pred_month', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
    ].copy()
    # Moving the join key back by the horizon lines the later visit up with the earlier one.
    train_shift['visit_month'] = train_shift['visit_month'] - plus_month
    train_shift = train_shift.rename(columns=future_names)
    train_clinical_all = pd.merge(
        train_clinical_all,
        train_shift,
        how='left',
        on=['patient_id', 'visit_month'],
    )

# Bring the current-visit columns under the same `_plus_<horizon>` naming scheme.
current_names = {f'updrs_{i}': f'updrs_{i}_plus_0' for i in range(1, 5)}
current_names['pred_month'] = 'pred_month_plus_0'
train_clinical_all = train_clinical_all.rename(columns=current_names)
train_clinical_all

  train_clinical_all['pred_month'] = train_clinical_all['visit_month']


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1_plus_0,updrs_2_plus_0,updrs_3_plus_0,updrs_4_plus_0,upd23b_clinical_state_on_medication,O00391,O00533,...,pred_month_plus_12,updrs_1_plus_12,updrs_2_plus_12,updrs_3_plus_12,updrs_4_plus_12,pred_month_plus_24,updrs_1_plus_24,updrs_2_plus_24,updrs_3_plus_24,updrs_4_plus_24
0,55_0,55,0,10.0,6.0,15.0,,,11254.3,732430.0,...,12.0,10.0,10.0,41.0,0.0,24.0,16.0,9.0,49.0,0.0
1,55_3,55,3,10.0,7.0,25.0,,,11254.3,732430.0,...,,,,,,,,,,
2,55_6,55,6,8.0,10.0,34.0,,,13163.6,630465.0,...,18.0,7.0,13.0,38.0,0.0,30.0,14.0,13.0,49.0,0.0
3,55_9,55,9,8.0,9.0,30.0,0.0,On,13163.6,630465.0,...,,,,,,,,,,
4,55_12,55,12,10.0,10.0,41.0,0.0,On,15257.6,815083.0,...,24.0,16.0,9.0,49.0,0.0,36.0,17.0,18.0,51.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0,Off,10589.6,902434.0,...,60.0,6.0,6.0,16.0,1.0,72.0,3.0,9.0,14.0,1.0
2611,65043_54,65043,54,4.0,8.0,11.0,1.0,Off,10589.6,902434.0,...,,,,,,,,,,
2612,65043_60,65043,60,6.0,6.0,16.0,1.0,Off,10589.6,902434.0,...,72.0,3.0,9.0,14.0,1.0,84.0,7.0,9.0,20.0,3.0
2613,65043_72,65043,72,3.0,9.0,14.0,1.0,Off,10589.6,902434.0,...,84.0,7.0,9.0,20.0,3.0,,,,,


In [5]:
# Linear month-trend coefficients per target, stored as [intercept, slope]:
# the trend prediction is `intercept + pred_month * slope`.
target_to_trend = dict(
    updrs_1=[5.394793062665313, 0.027091086167821344],
    updrs_2=[5.469498130092747, 0.02824188329658148],
    updrs_3=[21.182145576879183, 0.08897763331790556],
    updrs_4=[-4.434453480103724, 0.07531448585334258],
)

In [6]:
def smape_plus_1(y_true, y_pred):
    """Return SMAPE (in percent) computed on ``y_true + 1`` and ``y_pred + 1``.

    Per element the ratio is ``|t - p| / ((|t| + |p|) / 2)`` with ``t = y_true + 1``
    and ``p = y_pred + 1``. Elements where both shifted values are exactly zero
    contribute 0. NaN ratios are ignored by the final ``np.nanmean``.
    """
    shifted_true = y_true + 1
    shifted_pred = y_pred + 1

    abs_error = np.abs(shifted_true - shifted_pred)
    mean_magnitude = (np.abs(shifted_true) + np.abs(shifted_pred)) / 2

    # Only divide where at least one shifted value is non-zero; the remaining
    # positions keep their initial 0 instead of producing 0/0.
    has_signal = ~((shifted_true == 0) & (shifted_pred == 0))
    ratios = np.zeros(len(shifted_true))
    ratios[has_signal] = abs_error[has_signal] / mean_magnitude[has_signal]

    return 100 * np.nanmean(ratios)

def calculate_month_trend_predicitons(pred_month, trend):
    """Evaluate the linear month trend ``trend[0] + pred_month * trend[1]``.

    NOTE: this reads the module-level variable ``target`` (it is not a parameter).
    When ``target == 'updrs_4'`` the months are first clipped to a minimum of 54.
    ``pred_month`` must provide a ``.clip(lower, upper)`` method in that case.
    """
    months = pred_month.clip(54, None) if target == 'updrs_4' else pred_month
    return trend[0] + months * trend[1]

def calculate_predicitons_protein(protein, pred_month, protein_shift):
    """Return the rounded month-trend prediction plus a constant shift.

    The trend coefficients are looked up in ``target_to_trend`` using the
    module-level variable ``target``. ``protein`` is accepted but not used in
    the computation.
    """
    baseline = calculate_month_trend_predicitons(
        pred_month=pred_month,
        trend=target_to_trend[target],
    )
    return np.round(baseline + protein_shift)

def function_to_minimize(x):
    """Objective for the optimiser: SMAPE+1 of the shifted trend predictions.

    ``x[0]`` is the candidate shift. The data comes from the module-level
    arrays ``y_true_array``, ``pred_month_array`` and ``protein_array``.
    """
    shifted_predictions = calculate_predicitons_protein(
        protein=protein_array,
        pred_month=pred_month_array,
        protein_shift=x[0],
    )
    return smape_plus_1(y_true=y_true_array, y_pred=shifted_predictions)

In [7]:
def find_best_const(train_clinical_all_filtered, target):
    """Find the constant shift that minimises SMAPE+1 for ``target`` on the given rows.

    The target values and prediction months of the four horizons (0, 6, 12, 24)
    are flattened and published through the module-level arrays
    ``y_true_array``, ``pred_month_array`` and ``protein_array``, which
    ``function_to_minimize`` reads. The protein values come from the column named
    by the module-level variable ``feature``.

    NOTE: the objective looks up the trend through the module-level variable
    ``target``, not through this function's ``target`` argument; callers must
    keep the two equal (the loops in this notebook do, by naming their loop
    variable ``target``).

    Returns the optimised shift (first component of the Powell solution).
    """
    plus_months = [0, 6, 12, 24]
    columns_with_target = [f'{target}_plus_{plus_month}' for plus_month in plus_months]
    columns_with_pred_month = [f'pred_month_plus_{plus_month}' for plus_month in plus_months]
    global y_true_array
    global pred_month_array
    global protein_array
    # `.values.ravel()` flattens row by row: all horizons of row 0, then row 1, ...
    y_true_array = train_clinical_all_filtered[columns_with_target].values.ravel()
    pred_month_array = train_clinical_all_filtered[columns_with_pred_month].values.ravel()
    # Repeat each row's protein value once per horizon so it stays aligned with
    # the row-major flattening above (concatenating whole copies would not be).
    protein_array = np.repeat(train_clinical_all_filtered[feature].values, len(plus_months))
    result = minimize(
        fun=function_to_minimize,
        x0=[0.0],
        method='Powell'
    ).x[0]
    return result

## Plot shifts

In [8]:
# Protein whose NPX level defines the groups (read as a global by find_best_const).
feature = 'O15240'
# feature = 'O43505'
quantiles = [0, 0.05, 0.95, 1.0]

# Materialise the (low, high) pairs so tqdm knows the total and can show progress.
quantile_pairs = list(zip(quantiles[:-1], quantiles[1:]))

plot_rows = []
for quantile_low, quantile_high in tqdm(quantile_pairs):
    item = {
        'quantile_low': quantile_low,
        'quantile_high': quantile_high,
        'quantile_middle': (quantile_low + quantile_high) / 2
    }
    quantile_low_value = train_clinical_all[feature].quantile(quantile_low)
    quantile_high_value = train_clinical_all[feature].quantile(quantile_high)
    item['quantile_low_value'] = quantile_low_value
    item['quantile_high_value'] = quantile_high_value

    # The upper bound is exclusive below, so nudge it up for the last band to
    # keep the maximum value inside it (the stored bound stays un-nudged).
    if quantile_high == 1:
        quantile_high_value += 0.00001

    train_clinical_all_filtered = train_clinical_all[
        (train_clinical_all[feature] >= quantile_low_value)
        & (train_clinical_all[feature] < quantile_high_value)
    ]
    # The loop variable must be the module-level name `target`: the trend
    # helpers read it as a global while the shift is being optimised.
    for target in ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']:
        item[f'{target}_shift'] = find_best_const(train_clinical_all_filtered, target)
    plot_rows.append(item)

df_plot = pd.DataFrame(plot_rows)

0it [00:00, ?it/s]

In [9]:
# Show the last band's record followed by the full per-band shift table.
display(item, df_plot)

{'quantile_low': 0.95,
 'quantile_high': 1.0,
 'quantile_middle': 0.975,
 'quantile_low_value': 262189.50000000006,
 'quantile_high_value': 527697.0,
 'updrs_1_shift': -1.0263282508723692,
 'updrs_2_shift': -1.1431113490848273,
 'updrs_3_shift': -5.938594166130957,
 'updrs_4_shift': 1.776594685014621e-11}

Unnamed: 0,quantile_low,quantile_high,quantile_middle,quantile_low_value,quantile_high_value,updrs_1_shift,updrs_2_shift,updrs_3_shift,updrs_4_shift
0,0.0,0.05,0.025,10717.4,31902.0,2.884078,4.215438,11.58533,1.776595e-11
1,0.05,0.95,0.5,31902.0,262189.5,0.389562,1.406451e-11,1.68518e-11,1.776595e-11
2,0.95,1.0,0.975,262189.5,527697.0,-1.026328,-1.143111,-5.938594,1.776595e-11


In [10]:
# One line chart per target: fitted shift against the midpoint of each NPX quantile band.
for target in ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']:
    shift_column = f'{target}_shift'
    fig = px.line(
        df_plot,
        x='quantile_middle',
        y=shift_column,
        title=f'{feature} {target}',
    )
    fig.show()

## Find shifts

In [11]:
npx_groups = [
    {'quantile_low': 0.0, 'quantile_high': 0.05},
    {'quantile_low': 0.05, 'quantile_high': 0.95},
    {'quantile_low': 0.95, 'quantile_high': 1.0},
]
target_to_npx_groups_shift = defaultdict(list)

# The bounds and the matching rows depend only on the group, not on the target,
# so resolve them once instead of once per target.
resolved_groups = []
for npx_group in npx_groups:
    group_bounds = npx_group.copy()
    group_bounds['feature'] = feature

    # The outermost bounds are open-ended so that values outside the training
    # range still fall into the first or last group.
    if group_bounds['quantile_low'] == 0:
        group_bounds['quantile_low_value'] = -np.inf
    else:
        group_bounds['quantile_low_value'] = train_clinical_all[feature].quantile(group_bounds['quantile_low'])

    if group_bounds['quantile_high'] == 1:
        group_bounds['quantile_high_value'] = np.inf
    else:
        group_bounds['quantile_high_value'] = train_clinical_all[feature].quantile(group_bounds['quantile_high'])

    group_rows = train_clinical_all[
        (train_clinical_all[feature] >= group_bounds['quantile_low_value'])
        & (train_clinical_all[feature] < group_bounds['quantile_high_value'])
    ]
    resolved_groups.append((group_bounds, group_rows))

# The loop variable must be the module-level name `target`: the trend helpers
# read it as a global while the shift is being optimised.
for target in ['updrs_1', 'updrs_2', 'updrs_3']:
    for group_bounds, train_clinical_all_filtered in resolved_groups:
        item = group_bounds.copy()
        item['shift'] = find_best_const(train_clinical_all_filtered, target)
        target_to_npx_groups_shift[target].append(item)

In [12]:
# Inspect the fitted shift of every NPX group, keyed by target.
target_to_npx_groups_shift

defaultdict(list,
            {'updrs_1': [{'quantile_low': 0.0,
               'quantile_high': 0.05,
               'feature': 'O15240',
               'quantile_low_value': -inf,
               'quantile_high_value': 31902.0,
               'shift': 2.8840777634444907},
              {'quantile_low': 0.05,
               'quantile_high': 0.95,
               'feature': 'O15240',
               'quantile_low_value': 31902.0,
               'quantile_high_value': 262189.50000000006,
               'shift': 0.3895616546928739},
              {'quantile_low': 0.95,
               'quantile_high': 1.0,
               'feature': 'O15240',
               'quantile_low_value': 262189.50000000006,
               'quantile_high_value': inf,
               'shift': -1.0263282508723692}],
             'updrs_2': [{'quantile_low': 0.0,
               'quantile_high': 0.05,
               'feature': 'O15240',
               'quantile_low_value': -inf,
               'quantile_high_value': 31902.0

## Predictions

In [None]:
# NOTE: this cell needs `amp_pd_peptide`, whose import is commented out in the
# imports cell; it raises NameError unless that module has been imported.
amp_pd_peptide.make_env.func_dict['__called__'] = False
env = amp_pd_peptide.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

# Protein features of every test visit seen so far (grows with each batch).
proteins_features_all = pd.DataFrame()
# The API will deliver four dataframes in this specific order:
for test_clinical_data, test_peptides, test_proteins, sample_submission in iter_test:
    # prediction_id is split on '_' and read by position:
    # [0] patient id, [1] visit month, [3] UPDRS part number, [5] months ahead.
    sample_submission['patient_id'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[0]))
    sample_submission['visit_month'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[1]))
    sample_submission['target_name'] = sample_submission['prediction_id'].map(lambda x: 'updrs_' + x.split('_')[3])
    sample_submission['plus_month'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[5]))
    sample_submission['pred_month'] = sample_submission['visit_month'] + sample_submission['plus_month']
    sample_submission['visit_id'] = sample_submission['patient_id'].astype(str) + '_' + sample_submission['visit_month'].astype(str)

    # Pivot this batch's proteins and append them to the running history.
    proteins_features = pd.pivot_table(test_proteins, values='NPX', index='visit_id', columns='UniProt', aggfunc='sum')
    proteins_features['visit_id'] = proteins_features.index
    proteins_features_all = pd.concat([proteins_features_all, proteins_features])
    proteins_features_all['patient_id'] = proteins_features_all.index.map(lambda x: int(x.split('_')[0]))
    # Forward-fill within each patient; `GroupBy.ffill()` replaces the
    # deprecated `fillna(method='ffill')` spelling.
    proteins_features_all[proteins_features.columns] = (
        proteins_features_all.groupby('patient_id')[proteins_features.columns].ffill()
    )
    # Keep one row per patient holding the latest available values.
    proteins_features = proteins_features_all.groupby('patient_id', as_index=False).last()

    sample_submission = sample_submission.merge(
        proteins_features,
        on='patient_id',
        how='left'
    )

    for i in range(1, 5):
        # Must be the module-level name `target`: calculate_month_trend_predicitons
        # reads it as a global to decide on the updrs_4 month clipping.
        target = f'updrs_{i}'
        mask_target = sample_submission['target_name'] == target
        sample_submission.loc[mask_target, 'rating'] = calculate_month_trend_predicitons(
            pred_month=sample_submission.loc[mask_target, 'pred_month'],
            trend=target_to_trend[target]
        )

        # `.get` avoids inserting empty entries into the defaultdict for targets
        # that have no fitted shifts.
        for item in target_to_npx_groups_shift.get(target, []):
            feature = item['feature']
            # The protein may be absent from every test batch seen so far; in
            # that case keep the plain trend prediction instead of raising KeyError.
            if feature not in sample_submission.columns:
                continue
            mask_feature_range = mask_target & (
                (sample_submission[feature] >= item['quantile_low_value'])
                & (sample_submission[feature] < item['quantile_high_value'])
            )
            sample_submission.loc[mask_feature_range, 'rating'] += item['shift']

        sample_submission.loc[mask_target, 'rating'] = np.round(sample_submission.loc[mask_target, 'rating'])

    # call the env.predict for every iteration
    env.predict(sample_submission[['prediction_id', 'rating']])