In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from functools import partial

from tqdm.auto import tqdm
from collections import defaultdict

from scipy.optimize import minimize
from sklearn.model_selection import GroupKFold

# import amp_pd_peptide

In [2]:
train_clinical_all = pd.read_csv('./data/train_clinical_data.csv')
proteins = pd.read_csv('./data/train_proteins.csv')

proteins_features = pd.pivot_table(proteins, values='NPX', index='visit_id', columns='UniProt', aggfunc='sum')

train_clinical_all = train_clinical_all.merge(
    proteins_features,
    left_on='visit_id',
    right_index=True,
    how='left'
)

train_clinical_all[proteins_features.columns] = train_clinical_all.groupby('patient_id')[proteins_features.columns].fillna(method='ffill')
display(train_clinical_all)

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,O00391,O00533,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
0,55_0,55,0,10.0,6.0,15.0,,,11254.3,732430.0,...,365475.0,35528.0,97005.6,23122.5,60912.6,408698.0,,29758.8,23833.7,18953.5
1,55_3,55,3,10.0,7.0,25.0,,,11254.3,732430.0,...,365475.0,35528.0,97005.6,23122.5,60912.6,408698.0,,29758.8,23833.7,18953.5
2,55_6,55,6,8.0,10.0,34.0,,,13163.6,630465.0,...,405676.0,30332.6,109174.0,23499.8,51655.8,369870.0,,22935.2,17722.5,16642.7
3,55_9,55,9,8.0,9.0,30.0,0.0,On,13163.6,630465.0,...,405676.0,30332.6,109174.0,23499.8,51655.8,369870.0,,22935.2,17722.5,16642.7
4,55_12,55,12,10.0,10.0,41.0,0.0,On,15257.6,815083.0,...,303953.0,43026.2,114921.0,21860.1,61598.2,318553.0,65762.6,29193.4,28536.1,19290.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0,Off,10589.6,902434.0,...,223136.0,21659.9,116470.0,14073.3,48796.4,320821.0,39046.7,,20198.8,39535.0
2611,65043_54,65043,54,4.0,8.0,11.0,1.0,Off,10589.6,902434.0,...,223136.0,21659.9,116470.0,14073.3,48796.4,320821.0,39046.7,,20198.8,39535.0
2612,65043_60,65043,60,6.0,6.0,16.0,1.0,Off,10589.6,902434.0,...,223136.0,21659.9,116470.0,14073.3,48796.4,320821.0,39046.7,,20198.8,39535.0
2613,65043_72,65043,72,3.0,9.0,14.0,1.0,Off,10589.6,902434.0,...,223136.0,21659.9,116470.0,14073.3,48796.4,320821.0,39046.7,,20198.8,39535.0


In [3]:
train_clinical_all['pred_month'] = train_clinical_all['visit_month']

for plus_month in [6, 12, 24]:
    train_shift = train_clinical_all[['patient_id', 'visit_month', 'pred_month', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']].copy()
    train_shift['visit_month'] -= plus_month
    train_shift.rename(columns={f'updrs_{i}': f'updrs_{i}_plus_{plus_month}' for i in range(1, 5)}, inplace=True)
    train_shift.rename(columns={'pred_month': f'pred_month_plus_{plus_month}'}, inplace=True)
    train_clinical_all = train_clinical_all.merge(train_shift, how='left', on=['patient_id', 'visit_month'])

train_clinical_all.rename(columns={f'updrs_{i}': f'updrs_{i}_plus_0' for i in range(1, 5)}, inplace=True)
train_clinical_all.rename(columns={'pred_month': f'pred_month_plus_0'}, inplace=True)
train_clinical_all

  train_clinical_all['pred_month'] = train_clinical_all['visit_month']


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1_plus_0,updrs_2_plus_0,updrs_3_plus_0,updrs_4_plus_0,upd23b_clinical_state_on_medication,O00391,O00533,...,pred_month_plus_12,updrs_1_plus_12,updrs_2_plus_12,updrs_3_plus_12,updrs_4_plus_12,pred_month_plus_24,updrs_1_plus_24,updrs_2_plus_24,updrs_3_plus_24,updrs_4_plus_24
0,55_0,55,0,10.0,6.0,15.0,,,11254.3,732430.0,...,12.0,10.0,10.0,41.0,0.0,24.0,16.0,9.0,49.0,0.0
1,55_3,55,3,10.0,7.0,25.0,,,11254.3,732430.0,...,,,,,,,,,,
2,55_6,55,6,8.0,10.0,34.0,,,13163.6,630465.0,...,18.0,7.0,13.0,38.0,0.0,30.0,14.0,13.0,49.0,0.0
3,55_9,55,9,8.0,9.0,30.0,0.0,On,13163.6,630465.0,...,,,,,,,,,,
4,55_12,55,12,10.0,10.0,41.0,0.0,On,15257.6,815083.0,...,24.0,16.0,9.0,49.0,0.0,36.0,17.0,18.0,51.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0,Off,10589.6,902434.0,...,60.0,6.0,6.0,16.0,1.0,72.0,3.0,9.0,14.0,1.0
2611,65043_54,65043,54,4.0,8.0,11.0,1.0,Off,10589.6,902434.0,...,,,,,,,,,,
2612,65043_60,65043,60,6.0,6.0,16.0,1.0,Off,10589.6,902434.0,...,72.0,3.0,9.0,14.0,1.0,84.0,7.0,9.0,20.0,3.0
2613,65043_72,65043,72,3.0,9.0,14.0,1.0,Off,10589.6,902434.0,...,84.0,7.0,9.0,20.0,3.0,,,,,


In [4]:
target_to_trend = {
    'updrs_1': [5.394793062665313, 0.027091086167821344],
    'updrs_2': [5.469498130092747, 0.02824188329658148],
    'updrs_3': [21.182145576879183, 0.08897763331790556],
    'updrs_4': [-4.434453480103724, 0.07531448585334258]
}

In [5]:
target_to_npx_groups_shift = defaultdict(list,
            {'updrs_1': [{'quantile_low': 0.0,
               'quantile_high': 0.05,
               'feature': 'O15240',
               'quantile_low_value': -float('inf'),
               'quantile_high_value': 31902.0,
               'shift': 2.8840777634444907},
              {'quantile_low': 0.05,
               'quantile_high': 0.95,
               'feature': 'O15240',
               'quantile_low_value': 31902.0,
               'quantile_high_value': 262189.50000000006,
               'shift': 0.3895616546928739},
              {'quantile_low': 0.95,
               'quantile_high': 1.0,
               'feature': 'O15240',
               'quantile_low_value': 262189.50000000006,
               'quantile_high_value': float('inf'),
               'shift': -1.0263282508723692}],
             'updrs_2': [{'quantile_low': 0.0,
               'quantile_high': 0.05,
               'feature': 'O15240',
               'quantile_low_value': -float('inf'),
               'quantile_high_value': 31902.0,
               'shift': 4.215438267726712},
              {'quantile_low': 0.05,
               'quantile_high': 0.95,
               'feature': 'O15240',
               'quantile_low_value': 31902.0,
               'quantile_high_value': 262189.50000000006,
               'shift': 1.4064511679714293e-11},
              {'quantile_low': 0.95,
               'quantile_high': 1.0,
               'feature': 'O15240',
               'quantile_low_value': 262189.50000000006,
               'quantile_high_value': float('inf'),
               'shift': -1.1431113490848273}],
             'updrs_3': [{'quantile_low': 0.0,
               'quantile_high': 0.05,
               'feature': 'O15240',
               'quantile_low_value': -float('inf'),
               'quantile_high_value': 31902.0,
               'shift': 11.585334574122896},
              {'quantile_low': 0.05,
               'quantile_high': 0.95,
               'feature': 'O15240',
               'quantile_low_value': 31902.0,
               'quantile_high_value': 262189.50000000006,
               'shift': 1.6851803891920746e-11},
              {'quantile_low': 0.95,
               'quantile_high': 1.0,
               'feature': 'O15240',
               'quantile_low_value': 262189.50000000006,
               'quantile_high_value': float('inf'),
               'shift': -5.938594166130957}]})

In [6]:
def smape_plus_1(y_true, y_pred):
    y_true_plus_1 = y_true + 1
    y_pred_plus_1 = y_pred + 1
    metric = np.zeros(len(y_true_plus_1))
    
    numerator = np.abs(y_true_plus_1 - y_pred_plus_1)
    denominator = ((np.abs(y_true_plus_1) + np.abs(y_pred_plus_1)) / 2)
    
    mask_not_zeros = (y_true_plus_1 != 0) | (y_pred_plus_1 != 0)
    metric[mask_not_zeros] = numerator[mask_not_zeros] / denominator[mask_not_zeros]
    
    return 100 * np.nanmean(metric)

def calculate_month_trend_predicitons(pred_month, trend, target):
    if target == 'updrs_4':
        pred_month = pred_month.clip(54, None)
    if len(trend) == 2:
        return np.round(trend[0] + pred_month * trend[1]) # linear prediction
    return np.round(trend[0] + pred_month * trend[1] + np.square(pred_month) * trend[2]) # quadratic prediction

def calculate_predicitons_protein(pred_month, protein_shift, target):
    trend_pred_month = target_to_trend[target]
    pred_month_trend = calculate_month_trend_predicitons(pred_month=pred_month, trend=trend_pred_month, target=target)
    return np.round(pred_month_trend + protein_shift)

def function_to_minimize_trend(x, y_true_array_tr, pred_month_array_tr, target):
    metric = smape_plus_1(
        y_true=y_true_array_tr,
        y_pred=calculate_month_trend_predicitons(
            pred_month=pred_month_array_tr,
            trend=x,
            target=target
        )
    )
    return metric

def function_to_minimize_shift(x, y_true_array, pred_month_array, protein_array):
    metric = smape_plus_1(
        y_true=y_true_array,
        y_pred=calculate_predicitons_protein(
            protein=protein_array,
            pred_month=pred_month_array,
            protein_shift=x[0]
        )
    )
    return metric

def model(y_true_array_tr, pred_month_array_tr, target):
    """Fits a linear or quadratic model to the given data"""
    return list(minimize(
        fun=partial(function_to_minimize_trend,
                    y_true_array_tr=y_true_array_tr,
                    pred_month_array_tr=pred_month_array_tr,
                    target=target
                   ),
        # if x0 has two elements, the predictions will be linear
        # if x0 has three elements, the predictions will be quadratic
        x0=[0, 0.0048] if target != 'updrs_3' else [0, 0, 0],
        method='Powell'
    ).x)

In [7]:
def find_best_const(train_clinical_all_filtered, target):
    columns_with_target = [f'{target}_plus_{plus_month}' for plus_month in [0, 6, 12, 24]]
    columns_with_pred_month = [f'pred_month_plus_{plus_month}' for plus_month in [0, 6, 12, 24]]
    global y_true_array
    global pred_month_array
    global protein_array
    y_true_array = train_clinical_all_filtered[columns_with_target].values.ravel()
    pred_month_array = train_clinical_all_filtered[columns_with_pred_month].values.ravel()
    protein_array = np.concatenate([train_clinical_all_filtered[feature].values] * 4)
    result = minimize(
        fun=function_to_minimize_shift,
        x0=[0.0],
        method='Powell'
    ).x[0]
    return result

In [9]:
all_score_list = []
for i in range(1, 5):
    target = f'updrs_{i}'
    columns_with_target = [f'{target}_plus_{plus_month}' for plus_month in [0, 6, 12, 24]]
    columns_with_pred_month = [f'pred_month_plus_{plus_month}' for plus_month in [0, 6, 12, 24]]
    kf = GroupKFold(n_splits=5)
    score_list = []
    for f, (idx_tr, idx_va) in enumerate(kf.split(train_clinical_all, groups=train_clinical_all.patient_id)):
        y_true_array_tr = train_clinical_all.iloc[idx_tr][columns_with_target].values.ravel()
        pred_month_array_tr = train_clinical_all.iloc[idx_tr][columns_with_pred_month].values.ravel()
        # trend = model(y_true_array_tr, pred_month_array_tr, target)
        
        y_true_array_va = train_clinical_all.iloc[idx_va][columns_with_target].values.ravel()
        pred_month_array_va = train_clinical_all.iloc[idx_va][columns_with_pred_month].values.ravel()
        for item in target_to_trend[target]:
            feature = item['feature']
            if train_clinical_all.iloc[idx_va][feature] >= item['quantile_low_value'] and train_clinical_all.iloc[idx_va][feature] < item['quantile_high_value']:
                protein_shift = item['shift']
        score = smape_plus_1(y_true_array_va, calculate_predicitons_protein(pred_month_array_va, protein_shift, target))
        print(f"{target} fold {f}: {score:.2f}")
        score_list.append(score)
    score = np.array(score_list).mean()
    print(f"{target}                 {score:.2f}")
    all_score_list.append(score)
    
print(f"cv score                       {np.array(all_score_list).mean():.2f}")

TypeError: 'float' object is not subscriptable

In [None]:
feature = 'O15240'
# feature = 'O43505'
# display(train_clinical_all[feature])
# print(train_clinical_all[feature].quantile(0.25))
quantiles = [0, 0.05, 0.95, 1.0]

df_plot = []
for quantile_low, quantile_high in tqdm(zip(quantiles[:-1], quantiles[1:])):
    item = {
        'quantile_low': quantile_low,
        'quantile_high': quantile_high,
        'quantile_middle': (quantile_low + quantile_high) / 2
    }
    quantile_low_value = train_clinical_all[feature].quantile(quantile_low)
    quantile_high_value = train_clinical_all[feature].quantile(quantile_high)
    item['quantile_low_value'] = quantile_low_value
    item['quantile_high_value'] = quantile_high_value
    
    if quantile_high == 1:
        quantile_high_value += 0.00001
        
    train_clinical_all_filtered = train_clinical_all[
        (train_clinical_all[feature] >= quantile_low_value)
        & (train_clinical_all[feature] < quantile_high_value)
    ]
    for target in ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']:
        item[f'{target}_shift'] = find_best_const(train_clinical_all_filtered, target)
    df_plot.append(item)
    
df_plot = pd.DataFrame(df_plot)

In [None]:
npx_groups = [
    {'quantile_low': 0.0, 'quantile_high': 0.05},
    {'quantile_low': 0.05, 'quantile_high': 0.95},
    {'quantile_low': 0.95, 'quantile_high': 1.0},
]
target_to_npx_groups_shift = defaultdict(list)

for target in ['updrs_1', 'updrs_2', 'updrs_3']:
    for npx_group in npx_groups:
        item = npx_group.copy()
        item['feature'] = feature
        
        if item['quantile_low'] == 0:
            item['quantile_low_value'] = -np.inf
        else:
            item['quantile_low_value'] = train_clinical_all[feature].quantile(item['quantile_low'])
            
        if item['quantile_high'] == 1:
            item['quantile_high_value'] = np.inf
        else: 
            item['quantile_high_value'] = train_clinical_all[feature].quantile(item['quantile_high'])

        train_clinical_all_filtered = train_clinical_all[
            (train_clinical_all[feature] >= item['quantile_low_value'])
            & (train_clinical_all[feature] < item['quantile_high_value'])
        ]
        
        item['shift'] = find_best_const(train_clinical_all_filtered, target)
        target_to_npx_groups_shift[target].append(item)

In [None]:
amp_pd_peptide.make_env.func_dict['__called__'] = False
env = amp_pd_peptide.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

proteins_features_all = pd.DataFrame()
# The API will deliver four dataframes in this specific order:
for test_clinical_data, test_peptides, test_proteins, sample_submission in iter_test:
    sample_submission['patient_id'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[0]))
    sample_submission['visit_month'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[1]))
    sample_submission['target_name'] = sample_submission['prediction_id'].map(lambda x: 'updrs_' + x.split('_')[3])
    sample_submission['plus_month'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[5]))
    sample_submission['pred_month'] = sample_submission['visit_month'] + sample_submission['plus_month']
    sample_submission['visit_id'] = sample_submission['patient_id'].astype(str) + '_' + sample_submission['visit_month'].astype(str)
    
    proteins_features = pd.pivot_table(test_proteins, values='NPX', index='visit_id', columns='UniProt', aggfunc='sum')
    proteins_features['visit_id'] = proteins_features.index
    proteins_features_all = pd.concat([proteins_features_all, proteins_features])
    proteins_features_all['patient_id'] = proteins_features_all.index.map(lambda x: int(x.split('_')[0]))
    proteins_features_all[proteins_features.columns] = proteins_features_all.groupby('patient_id')[proteins_features.columns].\
                                                                                                   fillna(method='ffill')
    proteins_features = proteins_features_all.groupby('patient_id', as_index=False).last()
    
    sample_submission = sample_submission.merge(
        proteins_features,
        on='patient_id',
        how='left'
    )

    for i in range(1, 5):
        target = f'updrs_{i}'
        mask_target = sample_submission['target_name'] == target
        sample_submission.loc[mask_target, 'rating'] = calculate_month_trend_predicitons(
            pred_month=sample_submission.loc[mask_target, 'pred_month'],
            trend=target_to_trend[target]
        )
        
        for item in target_to_npx_groups_shift[target]:
            feature = item['feature']
            mask_feature_range = mask_target & (
                (sample_submission[feature] >= item['quantile_low_value'])
                & (sample_submission[feature] < item['quantile_high_value'])
            )
            sample_submission.loc[mask_feature_range, 'rating'] += item['shift']

        sample_submission.loc[mask_target, 'rating'] = np.round(sample_submission.loc[mask_target, 'rating'])
        
    # call the env.predict for every iteration
    env.predict(sample_submission[['prediction_id', 'rating']])