In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/public_timeseries_testing_util.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/__init__.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test

In [2]:
# Loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import tqdm
import re


from itertools import product
from functools import reduce

import warnings
# Silence the three warning families that would otherwise flood the cell outputs.
for _ignored_category in (UserWarning, DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)


def _format_float(value):
    """Render a float with exactly three decimals for DataFrame display."""
    return '%.3f' % value


pd.set_option('display.float_format', _format_float)

In [3]:
# Reading the datasets
# Load the three training tables in one pass: proteins, peptides, clinical targets.
protein_data, peptides_data, target_data = map(
    pd.read_csv,
    (
        '/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv',
        '/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv',
        '/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv',
    ),
)
# Displayed as the cell output: (rows, columns) of each table.
tuple(frame.shape for frame in (protein_data, peptides_data, target_data))


((232741, 5), (981834, 6), (2615, 8))

In [4]:
# Creating targets
# Columns identifying a visit, and the four UPDRS scores to be predicted.
id_cols = ['visit_id','patient_id','visit_month']
target_cols = ['updrs_1','updrs_2','updrs_3','updrs_4']

# Sorting target_data by patient_id and visit_month.
# (Sorted once; the sort is reassigned instead of mutating in place so that
# re-running the cell is a no-op.)
target_data = target_data.sort_values(['patient_id','visit_month'])


In [5]:
# Filling missing values in updrs_4 by 0
# Keep only id + target columns, treat a missing updrs_4 as 0, then drop every
# row that still has a missing value.
# The fill is expressed as a column-wise `fillna` on the frame rather than an
# in-place call on `target_data1['updrs_4']`: the chained in-place form is
# silently ignored when pandas Copy-on-Write is active.
target_data1 = (
    target_data[id_cols + target_cols]
    .fillna({'updrs_4': 0})
    .dropna()
)
target_data1.shape

(2588, 7)

In [6]:
def create_features(peptides_data,protein_data,target_data1):
    """Build one feature row per visit from peptide and protein measurements.

    The peptide table is left-joined to the protein table on the module-level
    ``id_cols`` plus ``UniProt`` and restricted to a fixed list of peptides.
    Three groups of features are produced and merged together:

    * per-visit values of ``PeptideAbundance``, ``NPX`` and their ratio for
      each retained peptide (missing combinations filled with 0),
    * per-patient summary statistics (columns suffixed ``_patient``),
    * per-visit-month summary statistics (columns suffixed ``_visit_month``).

    Parameters
    ----------
    peptides_data : pandas.DataFrame
        Peptide-level rows; must contain ``id_cols``, ``UniProt``, ``Peptide``
        and ``PeptideAbundance``.
    protein_data : pandas.DataFrame
        Protein-level rows; must contain ``id_cols``, ``UniProt`` and ``NPX``.
    target_data1 : pandas.DataFrame or None
        When given, it is inner-joined on ``id_cols`` so only visits with
        targets remain; when ``None`` the features are returned on their own.

    Returns
    -------
    pandas.DataFrame
        The feature table (plus target columns when ``target_data1`` is given).
    """
    stat_names = ['min','max','mean','median','sum','std']
    measure_cols = ['PeptideAbundance','NPX']
    peptides_list = ['QGVNDNEEGFFSAR','AGLAASLAGPHSIVGR','MELERPGGNEITR','KTSLEDFYLDEER','RYIETDPANRDR','EWVAIESDSVQPVPR','LQDLYSIVR','AIQLTYNPDESSKPNMIDAATLK']

    def summarise_by(frame, key):
        # Aggregate both measures per `key`, flatten the column MultiIndex to
        # "<measure>_<stat>" and append the stat-wise abundance/NPX ratios.
        summary = frame.groupby(key)[measure_cols].agg(stat_names)
        summary.columns = [f'{measure}_{stat}' for measure, stat in summary.columns.tolist()]
        for stat in stat_names:
            summary['pepab_to_npx_ratio_'+stat] = summary[f'PeptideAbundance_{stat}']/summary[f'NPX_{stat}']
        return summary

    # Attach NPX to every peptide row and keep only the peptides of interest.
    joined = peptides_data.merge(protein_data,on = id_cols + ['UniProt'],how = 'left')
    pep_pro_data = joined[joined.Peptide.isin(peptides_list)].copy()

    # Summaries are taken before the row-level ratio column is added.
    visit_month_summary = summarise_by(pep_pro_data,'visit_month')
    patient_summary = summarise_by(pep_pro_data,'patient_id')

    # Row-level PeptideAbundance / NPX
    pep_pro_data['pepab_to_npx_ratio'] = pep_pro_data['PeptideAbundance']/pep_pro_data['NPX']

    # One row per visit, one column per (peptide, measure) pair.
    wide = pep_pro_data.pivot(index= id_cols,columns = 'Peptide',values = ['PeptideAbundance','NPX','pepab_to_npx_ratio'])
    wide.columns = [f'{peptide}_{measure}' for measure, peptide in wide.columns]
    wide = wide.reset_index().fillna(0)

    with_patient = wide.merge(patient_summary.reset_index(),on = 'patient_id')
    pep_pro_features = with_patient.merge(
        visit_month_summary.reset_index(),on = 'visit_month',suffixes = ('_patient','_visit_month'))

    if target_data1 is None:
        return pep_pro_features
    return pep_pro_features.merge(target_data1,on = id_cols,how = 'inner')
                                                                         
                                                                         

In [7]:
from sklearn.feature_selection import SelectKBest, f_regression, RFE, RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
import lightgbm as lgbm
import xgboost as xgbm

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error on values shifted by +1.

    Both inputs are offset by one before the usual SMAPE formula is applied;
    the result is expressed in percent.
    """
    shifted_true = y_true + 1
    shifted_pred = y_pred + 1
    abs_error = np.abs(shifted_pred - shifted_true)
    magnitude = np.abs(shifted_true) + np.abs(shifted_pred)
    return 100/len(shifted_true) * np.sum(2 * abs_error / magnitude)

# scikit-learn scorer wrapping `smape`; flagged as a loss (lower is better).
smape_ = make_scorer(score_func=smape, greater_is_better=False)

def feature_selector(X,y,k,n_splits,groups):
    """Two-stage feature selection: univariate filter, then RFECV.

    Stage 1 keeps the ``k`` columns of ``X`` with the best ``f_regression``
    score. Stage 2 runs recursive feature elimination with group-wise
    cross-validation (LightGBM regressor, scored with ``smape_``) on those
    columns and keeps the ones ranked 1.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : array-like
        Regression target aligned with the rows of ``X``.
    k : int
        Number of features retained by the univariate filter.
    n_splits : int
        Number of ``GroupKFold`` folds used by RFECV.
    groups : array-like
        Group label per row; rows sharing a label never straddle a split.

    Returns
    -------
    list of str
        Names of the features that survive both stages.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having been executed first.
    from sklearn.model_selection import GroupKFold

    # Stage 1: univariate filter.
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X, y)
    feature_names = X.columns.tolist()
    selected_features = [feature_names[i] for i in selector.get_support(indices=True)]

    # Stage 2: recursive elimination. The splitter and the group labels are
    # handed to RFECV separately so it performs the grouped splitting itself.
    estimator = lgb.LGBMRegressor(n_jobs=-1)
    rfecv = RFECV(
        estimator=estimator,
        min_features_to_select=10,
        cv=GroupKFold(n_splits=n_splits),
        step=1,
        scoring=smape_,
    )
    rfecv.fit(X[selected_features], y, groups=groups)

    feature_rank = pd.DataFrame({'features' : selected_features,'rank':rfecv.ranking_}).sort_values('rank')

    # Rank 1 marks the features RFECV decided to keep.
    return feature_rank[feature_rank['rank'] < 2].features.tolist()

In [8]:
# # Selecting features
# model_data = create_features(peptides_data,protein_data,target_data1)
# model_x = model_data.drop(columns = id_cols +  target_cols).fillna(0).copy()
# model_y = model_data[target_cols].sum(axis = 1).values
# groups = model_data['patient_id']
# sel_features = feature_selector(model_x,model_y,40,10,groups)

In [9]:
# Hard-coded list of the 40 feature names fed to the models below.
# NOTE(review): presumably pasted from a previous run of the (now commented-out)
# feature_selector cell — confirm before regenerating.
# print(sel_features )
sel_features = ['AIQLTYNPDESSKPNMIDAATLK_PeptideAbundance', 'EWVAIESDSVQPVPR_PeptideAbundance', 'LQDLYSIVR_PeptideAbundance', 'MELERPGGNEITR_PeptideAbundance', 'QGVNDNEEGFFSAR_PeptideAbundance', 'AGLAASLAGPHSIVGR_NPX', 'AIQLTYNPDESSKPNMIDAATLK_NPX', 'EWVAIESDSVQPVPR_NPX', 'KTSLEDFYLDEER_NPX', 'LQDLYSIVR_NPX', 'MELERPGGNEITR_NPX', 'EWVAIESDSVQPVPR_pepab_to_npx_ratio', 'KTSLEDFYLDEER_pepab_to_npx_ratio', 'LQDLYSIVR_pepab_to_npx_ratio', 'RYIETDPANRDR_pepab_to_npx_ratio', 'PeptideAbundance_min_patient', 'PeptideAbundance_max_patient', 'PeptideAbundance_mean_patient', 'PeptideAbundance_median_patient', 'PeptideAbundance_std_patient', 'NPX_min_patient', 'NPX_max_patient', 'NPX_mean_patient', 'NPX_sum_patient', 'NPX_std_patient', 'pepab_to_npx_ratio_max_patient', 'pepab_to_npx_ratio_mean_patient', 'pepab_to_npx_ratio_sum_patient', 'pepab_to_npx_ratio_std_patient', 'PeptideAbundance_min_visit_month', 'PeptideAbundance_sum_visit_month', 'NPX_min_visit_month', 'NPX_max_visit_month', 'NPX_median_visit_month', 'NPX_sum_visit_month', 'NPX_std_visit_month', 'pepab_to_npx_ratio_min_visit_month', 'pepab_to_npx_ratio_max_visit_month', 'pepab_to_npx_ratio_median_visit_month', 'pepab_to_npx_ratio_std_visit_month']

In [10]:
# Loading libraries
from sklearn.model_selection import GroupKFold
from lightgbm import LGBMRegressor

In [11]:
# Function to create lgb models

def train_models_lgb(model_data,target_cols,sel_features, params, n_splits=10):
    """Train group-wise cross-validated LightGBM regressors, one set per target.

    For every name in ``target_cols`` the rows of ``model_data`` are split with
    ``GroupKFold`` on ``patient_id``; one ``LGBMRegressor`` is fitted per fold
    on the ``sel_features`` columns. SMAPE is printed for the training and
    out-of-fold predictions of each target, together with the averages over
    all targets. For ``updrs_4`` the scores of an all-zero prediction are
    reported as an alternative.

    Parameters
    ----------
    model_data : pandas.DataFrame
        Must contain ``patient_id`` and every column named in
        ``sel_features`` and ``target_cols``.
    target_cols : list of str
        Target columns to model.
    sel_features : list of str
        Feature columns passed to the regressors.
    params : dict
        Keyword arguments for ``LGBMRegressor``. A ``random_state`` entry
        overrides the default seed of 2023.
    n_splits : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    dict
        Maps each target name to the list of its fitted fold models.
    """
    groups = model_data['patient_id']
    kf = GroupKFold(n_splits=n_splits)

    # Default seed first, so a seed supplied by the caller wins instead of
    # raising "got multiple values for keyword argument 'random_state'".
    model_params = {'random_state': 2023, **params}

    # Fitted models and scores, keyed by target name.
    # The "2" variants score an all-zero prediction for updrs_4.
    models = {}
    scores_trn = {} ; scores_val = {}
    scores2_trn = {} ; scores2_val = {}

    # The feature matrix is identical for every target.
    X = model_data[sel_features].copy()
    n_rows = len(X)

    # Loop through each target
    for target in target_cols:

        y = model_data[target].copy()

        predictions_val = np.zeros(n_rows)
        actuals_val = np.zeros(n_rows)
        predictions_trn = np.zeros(n_rows)
        actuals_trn = np.zeros(n_rows)

        fold_models = []
        for train_idx, test_idx in kf.split(X, y, groups):
            X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

            model = LGBMRegressor(**model_params)
            model.fit(X_train, y_train)

            # Every row is in exactly one validation fold.
            predictions_val[test_idx] = model.predict(X_val)
            actuals_val[test_idx] = y_val

            # NOTE: a row belongs to the training part of several folds, so
            # the stored training prediction is the one from the last fold
            # that trained on it.
            predictions_trn[train_idx] = model.predict(X_train)
            actuals_trn[train_idx] = y_train

            fold_models.append(model)

        # Alternative predictions: constant 0 for updrs_4, the model otherwise.
        if target == 'updrs_4':
            predictions2_val = np.zeros(n_rows)
            predictions2_trn = np.zeros(n_rows)
        else:
            predictions2_val = predictions_val
            predictions2_trn = predictions_trn

        score_val = smape(actuals_val,predictions_val)
        score2_val  = smape(actuals_val,predictions2_val)

        score_trn  = smape(actuals_trn,predictions_trn)
        score2_trn  = smape(actuals_trn,predictions2_trn)

        print(f"\nModel for {target} train score: {score_trn:.3f}")
        print(f"Model for {target} validation score: {score_val:.3f}")

        if target == 'updrs_4':
            print(f"\nModel for {target} train score when pred is 0: {score2_trn:.3f}")
            print(f"Model for {target} validation score when pred is 0: {score2_val:.3f}")

        models[target] = fold_models
        scores_trn[target] = score_trn
        scores2_trn[target] = score2_trn
        scores_val[target] = score_val
        scores2_val[target] = score2_val

    # Averages over all targets, printed with three decimals like the
    # per-target scores.
    print(f"\nTrain score for all models : {np.mean(list(scores_trn.values())):.3f}")
    print(f'Train score for all models when updrs_4 pred is 0 : {np.mean(list(scores2_trn.values())):.3f}')

    print(f"\nValidation score for all models : {np.mean(list(scores_val.values())):.3f}")

    print(f'Validation score for all models when updrs_4 pred is 0 : {np.mean(list(scores2_val.values())):.3f}')

    return models

In [12]:
# Variables for model
# Keyword arguments forwarded to every LGBMRegressor.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
# and `max_depth=2` caps a tree at 4 leaves regardless of `num_leaves` —
# confirm these settings are intended.
params = dict(
    learning_rate=0.0005,
    n_estimators=1500,
    reg_alpha=5,
    reg_lambda=5,
    min_child_samples=30,
    colsample_bytree=0.6,
    subsample=0.6,
    num_leaves=10,
    max_depth=2,
)

In [13]:
# Creating features & Building model
# Build the modelling table (features joined to targets), then fit the
# per-target fold models; training prints the cross-validation scores.
model_data = create_features(
    peptides_data=peptides_data,
    protein_data=protein_data,
    target_data1=target_data1,
)
lgb_models = train_models_lgb(
    model_data=model_data,
    target_cols=target_cols,
    sel_features=sel_features,
    params=params,
)


Model for updrs_1 train score: 56.420
Model for updrs_1 validation score: 58.594

Model for updrs_2 train score: 73.844
Model for updrs_2 validation score: 76.563

Model for updrs_3 train score: 78.759
Model for updrs_3 validation score: 80.796

Model for updrs_4 train score: 65.659
Model for updrs_4 validation score: 68.876

Model for updrs_4 train score when pred is 0: 27.489
Model for updrs_4 validation score when pred is 0: 27.489

Train score for all models : 68.670176
Train score for all models when updrs_4 pred is 0 : 59.127742

Validation score for all models : 71.207466
Validation score for all models when updrs_4 pred is 0 : 60.860576


#### Testing the code on the example test data

In [14]:
# Reading the test files
# Load the four example test tables shipped with the competition data.
test_peptides, test_proteins, sample_submission, test = map(
    pd.read_csv,
    (
        '/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv',
        '/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv',
        '/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv',
        '/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv',
    ),
)

In [15]:
def get_predictions(test,peptides_data,protein_data,sample_submission,models,sel_features):
    """Fill ``sample_submission`` with ratings predicted from the fold models.

    Features are built from ``peptides_data`` / ``protein_data`` for the visits
    listed in ``test``. Each visit is expanded to the four forecast horizons
    (0, 6, 12 and 24 months), the predictions of all fold models of a target
    are averaged, and ``updrs_4`` is always set to 0. Rows of
    ``sample_submission`` without a matching prediction fall back to the
    median prediction for the same target and horizon. Ratings are rounded up
    to the next integer.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain ``patient_id`` and ``visit_month``.
    peptides_data, protein_data : pandas.DataFrame
        Measurements passed to ``create_features``.
    sample_submission : pandas.DataFrame
        Must contain ``prediction_id`` and ``rating``; it is not modified.
    models : dict
        Maps a target name to a list of fitted models exposing ``predict``.
    sel_features : list of str
        Feature columns expected by the models; columns that cannot be built
        from the supplied data are added as constant 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``sample_submission`` with ``rating`` filled in. A rating stays
        NaN when neither a prediction nor a fallback median exists for it.
    """
    horizons = [0,6,12,24]
    visit_keys = ['patient_id','visit_month']

    # Use the data handed to the function, not whatever module-level frames
    # happen to exist.
    features = create_features(peptides_data,protein_data,None)

    # One row per visit that has features, then one copy of it per horizon.
    # This does not rely on `test` carrying exactly four rows per visit.
    visits = test[visit_keys].drop_duplicates()
    features = visits.merge(features,on = visit_keys,how = 'inner')
    features = features.loc[features.index.repeat(len(horizons))].reset_index(drop = True)
    features['visit_month_1'] = np.tile(horizons,len(features)//len(horizons))

    # Imputing missing values by 0
    features = features.fillna(0)

    missing_features = [col for col in sel_features if col not in features.columns]
    for col in missing_features:
        features[col] = 0
    print('missing features : \n',missing_features )

    # Average the fold models of every target.
    pred_feats = features[sel_features]
    pred_wide = pd.DataFrame(index = range(features.shape[0]))
    for target,fold_models in models.items():
        pred_wide[target] = np.mean(np.array([np.asarray(mod.predict(pred_feats)) for mod in fold_models]),axis = 0)

    # updrs_4 is always submitted as 0, whatever its models predict.
    pred_wide['updrs_4'] = 0
    target_names = pred_wide.columns.tolist()

    # Long format: one row per (visit, horizon, target).
    pred_wide['_visit'] = (features['patient_id'].astype(str) + '_' + features['visit_month'].astype(str)).to_numpy()
    pred_wide['_horizon'] = features['visit_month_1'].astype(str).to_numpy()
    pred_long = pred_wide.melt(id_vars = ['_visit','_horizon'],value_vars = target_names,
                               var_name = 'target',value_name = 'rating')

    # "<target>_plus_<h>_months" is the median key; the visit prefix in front
    # of it gives the prediction_id, "<patient>_<month>_<target>_plus_<h>_months".
    pred_long['target'] = pred_long['target'] + '_plus_' + pred_long['_horizon'] + '_months'
    pred_long['prediction_id'] = pred_long['_visit'] + '_' + pred_long['target']
    pred_submission = pred_long[['prediction_id','rating']]

    sample_submission = sample_submission.drop(columns = ['rating']).merge(
        pred_submission,on = ['prediction_id'],how = 'left')

    # Fallback for ids without a prediction: median over all visits for the
    # same target and horizon.
    pred_medians = pred_long.groupby('target')['rating'].median()
    fallback = sample_submission['prediction_id'].apply(lambda x: '_'.join(x.split('_')[2:])).map(pred_medians)
    sample_submission['rating'] = np.ceil(sample_submission['rating'].fillna(fallback))

    return sample_submission

In [16]:
# Dry run of the prediction pipeline on the example test files.
temp = get_predictions(
    test=test,
    peptides_data=test_peptides,
    protein_data=test_proteins,
    sample_submission=sample_submission,
    models=lgb_models,
    sel_features=sel_features,
)

missing features : 
 ['RYIETDPANRDR_pepab_to_npx_ratio']


#### Submitting to API

In [17]:
import sys
# Put the competition input directory on the import path so the bundled
# `amp_pd_peptide` package can be imported.
sys.path.append('/kaggle/input/amp-parkinsons-disease-progression-prediction/')

import amp_pd_peptide
# NOTE(review): presumably resets the API's "already called" flag so that
# make_env() can be invoked again when this cell is re-run — confirm against
# the amp_pd_peptide module.
amp_pd_peptide.make_env.func_dict['__called__'] = False
env = amp_pd_peptide.make_env()

# Iterator over the test batches; consumed by the submission loop.
iter_test = env.iter_test() 

In [18]:

# Predict every batch served by the API and hand the result back.
# The loop variables rebind the module-level `test`, `test_peptides`,
# `test_proteins` and `sample_submission` names on each iteration.
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
    submission = get_predictions(test, test_peptides, test_proteins, sample_submission, lgb_models,sel_features)
    # Identical (prediction_id, rating) rows are submitted only once.
    submission = submission.drop_duplicates(subset=['prediction_id', 'rating'])
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
missing features : 
 ['RYIETDPANRDR_pepab_to_npx_ratio']
missing features : 
 ['RYIETDPANRDR_pepab_to_npx_ratio']
