In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
# Load every competition CSV into its own DataFrame.
# Files under example_test_files/ are the example test inputs; the others are training data.
sample_submission = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv')
train_proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv')
test_proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv')
train_peptides = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv')
test_peptides = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv')
train_clinical = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')
test = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv')
# Supplemental clinical records; only displayed later in this notebook, never merged.
sup = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv')

In [None]:
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
import pandas as pd 


def effectif_int(data):
    """Summary statistics for the 'int'/'float' columns of ``data``.

    Returns the transposed ``describe()`` table (one row per selected column)
    without the ``count`` column.
    """
    numeric_part = data.select_dtypes(include=['int', 'float'])
    summary = numeric_part.describe().T
    return summary.drop(columns='count')

def type_of_vars(data):
    """Return a one-column frame ('Types variables') holding the dtype of each column of ``data``."""
    return data.dtypes.to_frame(name='Types variables')

 

# class to plot missing values, convert types of variables 

class Descriptive_analysis():
  """
    Helpers for a first descriptive pass over a DataFrame: a missing-value
    table, a bar plot of the columns with the most missing values, and a
    dtype-conversion utility.

    Every method stores its arguments on the instance; ``self.miss`` holds the
    last missing-value table that was computed.
  """
  def __init__(self,
              objective:str = 'survival') -> None:
    # Only stored; no method of this class reads it.
    self.objective = objective

  @staticmethod
  def _as_list(columns) -> list:
    """Normalise a column selection (None, one name, or an iterable) to a list."""
    if columns is None:
      return []
    if isinstance(columns, str):
      return [columns]
    return list(columns)

  @staticmethod
  def _missing_table(X:pd.DataFrame) -> pd.DataFrame:
    """One row per column of X that has missing values, most missing first."""
    miss = X.isnull().sum().to_frame(name='Nans')
    miss = miss.sort_values(by=['Nans'], ascending=False)
    miss = miss[miss.Nans != 0]
    # rename_axis guarantees the 'Variables' column even when X.columns is named.
    return miss.rename_axis('Variables').reset_index()

  def missing_values(self,
                  X:pd.DataFrame):
    """ Compute missing values df

    Returns a frame with columns 'Variables' and 'Nans' (missing count),
    sorted by decreasing count and restricted to columns that have at least
    one missing value. When nothing is missing a message is printed and
    None is returned; ``self.miss`` then holds the empty table.
    """

    self.X = X
    self.miss = self._missing_table(X)
    if len(self.miss) == 0:
        print('There is no missing values')
        return None
    return self.miss


  def plot_missing_values(self, X:pd.DataFrame,
                              fig_size=(20, 7),
                              size_police: int= 10,
                              threshold:float =None):
    """ Plot missing values bar

    Draws one bar per column whose missing count is strictly greater than
    ``threshold`` and annotates each bar with the share of rows missing.

    Parameters
    ----------
    X : frame to inspect.
    fig_size : (width, height) passed to ``plt.figure``.
    size_police : font size of the percentage annotations.
    threshold : minimum number of missing rows (a count, not a percentage);
        defaults to half the number of rows of X.
    """
    self.X = X
    self.fig_size = fig_size
    self.size_police = size_police
    self.threshold = threshold

    if self.threshold is None:
      self.threshold = np.round(X.shape[0]/2, 2)

    # missing_values refreshes self.miss and prints a message when nothing is missing.
    if self.missing_values(self.X) is None:
      return

    to_plot = self.miss[self.miss.Nans > self.threshold]
    if to_plot.empty:
      print('No variable has more than ' + str(self.threshold) + ' missing values')
      return

    plt.figure(figsize=(self.fig_size[0], self.fig_size[1]))
    g = sns.barplot(x="Variables", y="Nans", data=to_plot)
    total = len(self.X)  # > 0 here, since at least one value is missing
    for p in g.patches:
      percentage = '{:.1f}%'.format(100 * p.get_height()/total)
      # Place the label slightly left of the bar centre, on top of the bar.
      x = p.get_x() + p.get_width() / 2 - 0.05
      y = p.get_y() + p.get_height()
      g.annotate(percentage, (x, y), size = self.size_police)
    plt.title('Variables with more than ' + str(self.threshold) + ' missing values')
    plt.show()


  def define_dtypes(self, X:pd.DataFrame,
                          category_columns:list=None,
                          str_columns:list=None,
                          date_columns:list=None,
                          format_date:str=None,
                          num_columns:list=None):
    """ Convert variables

    Converts columns of X **in place** and returns X.

    Parameters
    ----------
    category_columns, str_columns, date_columns : column name(s) to convert to
        'category', 'str' and datetime respectively; None means "no column".
    format_date : format forwarded to ``pd.to_datetime``; unparsable values
        become NaT (errors='coerce').
    num_columns : column name(s) to convert to float. When None, every column
        not listed in the three selections above is converted to float.
    """
    self.X = X
    self.category_columns = self._as_list(category_columns)
    self.str_columns = self._as_list(str_columns)
    self.date_columns = self._as_list(date_columns)
    self.format_date = format_date
    self.num_columns = num_columns

    if self.category_columns:
      self.X[self.category_columns] = self.X[self.category_columns].astype('category')
    if self.str_columns:
      self.X[self.str_columns] = self.X[self.str_columns].astype('str')
    # date convertion
    for date_c in self.date_columns:
      self.X[date_c] = pd.to_datetime(self.X[date_c], format=self.format_date , errors='coerce')

    if self.num_columns is not None:
      numeric = self._as_list(self.num_columns)
    else:
      # Everything that was not explicitly typed above is treated as numeric.
      typed = set(self.category_columns) | set(self.str_columns) | set(self.date_columns)
      numeric = [c for c in self.X.columns if c not in typed]
    if numeric:
      self.X[numeric] = self.X[numeric].astype('float')

    return self.X

def concat_data(train_proteins, train_peptides, train_clinical):
    """Build one frame with a row per peptide measurement.

    Peptides are left-joined with their protein record (same visit, month,
    patient and UniProt id), then left-joined with the clinical record of the
    same visit.
    """
    visit_keys = ['visit_id', 'visit_month', 'patient_id']
    with_proteins = train_peptides.merge(train_proteins, how='left', on=visit_keys + ['UniProt'])
    return with_proteins.merge(train_clinical, how='left', on=visit_keys)

# metrics 
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes ``|t - p| / ((|t| + |p|) / 2)``; elements where
    both the true and predicted value are 0 contribute 0.

    Inputs are converted to float arrays and compared **by position**, so
    lists, arrays and pandas Series with unrelated indexes all work.

    Parameters
    ----------
    y_true : 1-D sequence of observed values.
    y_pred : 1-D sequence (or scalar) of predictions, broadcastable to y_true.

    Returns
    -------
    float : 100 * mean of the per-element errors.
    """
    true = np.asarray(y_true, dtype=float)
    pred = np.asarray(y_pred, dtype=float)

    smap = np.zeros(len(true))

    num = np.abs(true - pred)
    dem = (np.abs(true) + np.abs(pred)) / 2

    # Skip 0/0: positions where both values are zero keep an error of 0.
    pos_ind = (true != 0) | (pred != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return 100 * np.mean(smap)


def mean_smape(y_true, y_pred):
    """Average of the per-column SMAPE between two 2-D tables.

    Column ``i`` of ``y_true`` is scored against column ``i`` of ``y_pred``
    (by position), and the scores are averaged over the number of columns.

    Parameters
    ----------
    y_true, y_pred : DataFrames or 2-D arrays of identical shape.

    Returns
    -------
    float : mean SMAPE over the columns (0.0 when there are no columns).

    Raises
    ------
    ValueError : if the inputs are not 2-D or their shapes differ.
    """
    true = np.asarray(y_true, dtype=float)
    pred = np.asarray(y_pred, dtype=float)
    if true.ndim != 2 or true.shape != pred.shape:
        raise ValueError(
            'y_true and y_pred must be 2-D with the same shape, got '
            + str(true.shape) + ' and ' + str(pred.shape)
        )

    n_targets = true.shape[1]
    if n_targets == 0:
        return 0.0

    res = 0
    for i in range(n_targets):
        # Plain arrays are passed on so rows are matched by position, not by index label.
        res = res + smape(true[:, i], pred[:, i])
    return res / n_targets


In [None]:
# Merge peptides, proteins and clinical outcomes into a single training frame
# (one row per peptide measurement), then show its size and first rows.
train_r = concat_data(train_proteins, train_peptides, train_clinical)
print(train_r.shape)
train_r.head()



In [None]:
# Display the supplemental clinical data read from supplemental_clinical_data.csv.
sup

In [None]:
# Drop the column noted by the author as more than 70% null.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
train_r = train_r.drop(columns='upd23b_clinical_state_on_medication', errors='ignore')

# Row positions where at least one of the four outcomes is missing (sorted, unique).
outcome_cols = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
outcome_miss = np.flatnonzero(train_r[outcome_cols].isnull().any(axis=1).to_numpy())

# Keep only the rows where every outcome is observed.
train_full = train_r.dropna(subset=outcome_cols).reset_index(drop=True)
# NOTE(review): visit_id values look like '<patient>_<month>' (see how map_test rebuilds
# them); the int cast presumably relies on underscores being accepted between digits
# ('55_0' -> 550), which can map different visits to the same number — confirm intended.
train_full.visit_id = train_full.visit_id.astype('int')
print(train_r.shape, train_full.shape)

In [None]:
# The four outcome columns the model has to predict.
targets = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

# Features are everything except the outcomes.
X = train_full.drop(columns=targets)
y = train_full[targets]

# Names of the non-object (numeric) feature columns.
col_int = X.select_dtypes(exclude='object').columns.tolist()

# The identifier-like text columns are dropped rather than one-hot encoded.
X_ = X.drop(columns=['UniProt', 'Peptide'])
X_.head()

In [None]:
# Cross validation 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
# Split the rows 70/30 into a training set and a hold-out set (fixed seed).
# NOTE(review): X_ appears to hold one row per peptide measurement and still contains
# visit_id / patient_id, so rows of the same visit can land on both sides of the split —
# confirm this is acceptable for the evaluation.
Xtrain, Xtest, ytrain, ytest = train_test_split( X_, y, test_size=0.30, random_state=132)

In [None]:
# Sanity check: number of missing values per target in the training split.
ytrain.isna().sum()

In [None]:
# Fit a multi-output random forest on the training split (fixed seed).
rf = RandomForestRegressor(random_state=15)
rf = rf.fit(Xtrain, ytrain)

In [None]:
# Predict the hold-out rows. The predictions reuse Xtest's index (the same labels as
# ytest), so ypred_rf and ytest line up row by row in any index-aligned comparison.
ypred_rf = pd.DataFrame(rf.predict(Xtest), columns=targets, index=Xtest.index)
ypred_rf

In [None]:
# Display the held-out target values produced by the train/test split.
ytest

# Predictions on the example test files

In [None]:
# Re-read the example test files (the same three paths are already loaded in the
# data-loading cell near the top of the notebook).
test_proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv')
test_peptides = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv')
test = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv')

In [None]:
# Merge the example test files the same way as the training data; the clinical
# table is reduced to one row per visit before the join.
tes = concat_data(
    test_proteins,
    test_peptides,
    test.drop_duplicates(subset='visit_id'),
)
tes

In [None]:
# Remove the test-only bookkeeping columns; they are not model features.
testt = tes.drop(columns=['group_key_x', 'group_key_y', 'updrs_test',
                          'row_id', 'group_key'])
# Only the identifier columns are cast to int. PeptideAbundance and NPX are left
# untouched: the training cell never truncates them, so truncating them here would
# feed the model features on a different scale of precision than it was fitted on.
testt = testt.astype({'visit_id': 'int', 'visit_month': 'int', 'patient_id': 'int'})
# Same columns, same order as the frame the model was fitted on.
testt = testt[Xtrain.columns]
testt

In [None]:
# Predict the targets for every row of the prepared example test frame.
rf.predict(testt)

In [None]:
# Same predictions as the previous cell, wrapped in a DataFrame.
# No column names are given, so the columns are labelled by position.
test_pred = pd.DataFrame(rf.predict(testt))
test_pred

In [None]:
# Display the prepared example test features again.
testt

In [None]:
# Copy of the test features with the three identifier columns turned into strings.
testt_tr = testt.astype({'visit_id': 'str', 'visit_month': 'str', 'patient_id': 'str'})
testt_tr

In [None]:
 testt_tr['visit_id'] + '_' + testt_tr['visit_month']

In [None]:
# Tag each prediction row with the visit id of the feature row it came from.
test_pred['predi_id'] = testt['visit_id']

In [None]:
# Feature columns, in the order the model was fitted on.
col_pred = list(Xtrain.columns)
col_pred

In [None]:
import amp_pd_peptide


In [None]:
# Display the example submission file (its prediction_id column is parsed by map_test below).
sample_submission

In [None]:
def map_test(x):
    """Return the rating to submit for one ``prediction_id`` string.

    ``x`` is split on '_': the first two tokens form the visit id and the next
    two the target name (``updrs_1`` ... ``updrs_4``).

    ``updrs_4`` is always rated 0. For the other targets the rating is read
    from the module-level frame ``df``: the first row whose ``visit_id``
    matches, column ``pred0`` for ``updrs_1``, ``pred2`` for ``updrs_3`` and
    ``pred1`` otherwise.
    """
    parts = x.split('_')
    visit_id = parts[0] + '_' + parts[1]
    updrs = parts[2] + '_' + parts[3]

    # set all predictions 0 where updrs equals 'updrs_4'; no lookup needed
    if updrs == 'updrs_4':
        return 0

    # Same mapping as the original if/elif chain: anything that is not
    # updrs_1 or updrs_3 falls back to 'pred1'.
    column = {'updrs_1': 'pred0', 'updrs_3': 'pred2'}.get(updrs, 'pred1')
    return df[df.visit_id == visit_id][column].values[0]
# One row per visit of the example test file.
# NOTE(review): df only has the visit_id column here, while map_test reads
# 'pred0' / 'pred1' / 'pred2' from it; nothing in this notebook adds those columns,
# so the apply below fails for updrs_1..3 ids until per-visit predictions are joined in.
df = test[['visit_id']].drop_duplicates('visit_id')
sample_submission['prediction_id'].apply(map_test)

In [None]:
# NOTE(review): `pred` is not assigned anywhere above this cell; it is first set inside
# the iter_test loop further down, so this cell fails on a fresh top-to-bottom run.
pred

In [None]:
# Display the example clinical test file.
test

In [None]:
# Display the example submission file again (unchanged since it was loaded).
sample_submission

In [None]:
# Test computation: run the fitted model on every batch served by the competition API.
env = amp_pd_peptide.make_env()   # initialize the environment
# Presumably resets the API's "already called" guard so this cell can be re-run
# in the same session — TODO confirm against the amp_pd_peptide module.
amp_pd_peptide.make_env.func_dict['__called__'] = False
iter_test = env.iter_test()    # an iterator which loops over the test files

# The API will deliver four dataframes in this specific order:
for (xtest_clinical, xtest_peptides, xtest_proteins, xtest_sample_submission) in iter_test:
    
    # Merge the batch exactly like the training data (one clinical row per visit).
    xt_= concat_data(xtest_proteins, xtest_peptides, xtest_clinical.drop_duplicates('visit_id'))
    # Keep the model's feature columns, in training order.
    # NOTE(review): visit_id is not cast to int here, unlike in the training cell — confirm.
    xt = xt_[col_pred]
    
    # One prediction row per merged input row, one column per target.
    pred = rf.predict(xt)
    pred = pd.DataFrame(pred, columns = targets)
    print(pred)
    
    # The template this loop was adapted from fills a 'rating' column of the sample
    # submission (e.g. "rating all the values as 5") before calling env.predict.
    # NOTE(review): here the raw per-row model output is passed instead of
    # xtest_sample_submission — confirm which frame env.predict expects.
    
    #call the env.predict for every iteration
    env.predict(pred)


In [None]:
# Display the example clinical test file once more.
test