In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Useful functions

In [None]:
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
import pandas as pd 


def effectif_int(data):
    """Return summary statistics for the 'int'/'float' columns of ``data``.

    The result has one row per selected column and the ``describe()``
    statistics as columns, with the ``count`` statistic removed.
    """
    numeric = data.select_dtypes(include=['int', 'float'])
    return numeric.describe().T.drop(columns='count')

def type_of_vars(data):
    """Return a one-column frame ('Types variables') holding the dtype of each column of ``data``."""
    return data.dtypes.to_frame(name='Types variables')

 

# Class to tabulate and plot missing values, and to convert the types of variables

class Descriptive_analysis():
  """
    Helpers for the descriptive-analysis steps of a tabular dataset:
    tabulating missing values, plotting them, and converting column dtypes.

    Each method stores its inputs (and some intermediate results) as
    attributes on the instance, e.g. ``self.X`` and ``self.miss``.
  """
  def __init__(self,
              objective:str = 'survival') -> None:
    # Stored only; no method of this class reads it.
    self.objective = objective

  @staticmethod
  def _as_list(columns):
    """ Normalise a column selection (None, a single name, or an iterable) to a list """
    if columns is None:
      return []
    if isinstance(columns, str):
      return [columns]
    return list(columns)

  def _cast_columns(self, columns, dtype):
    """ Cast each listed column of self.X to ``dtype``, in place, one column at a time """
    for col in columns:
      self.X[col] = self.X[col].astype(dtype)

  def missing_values(self,
                  X:pd.DataFrame):
    """ Compute the table of missing values.

    Returns a DataFrame with columns 'Variables' and 'Nans' (number of
    missing values), restricted to variables with at least one missing value
    and sorted by decreasing count. When nothing is missing, a message is
    printed and None is returned. The table (possibly empty) is also kept in
    ``self.miss``.
    """

    self.X = X

    # count NaNs per column, most affected variables first
    self.miss = pd.DataFrame(X.isnull().sum())
    self.miss.columns = ['Nans']
    self.miss = self.miss.sort_values(by=['Nans'], ascending=False)
    self.miss = self.miss[self.miss.Nans != 0]
    self.miss = self.miss.reset_index()
    # Name the columns positionally: the label produced by reset_index is
    # 'index' only when the columns index of X is unnamed.
    self.miss.columns = ['Variables', 'Nans']
    if len(self.miss) == 0:
      print('There is no missing values')
      return None
    return self.miss


  def plot_missing_values(self, X:pd.DataFrame,
                              fig_size:list=[20,7],
                              size_police: int= 10,
                              threshold:float =None):
    """ Bar plot of the number of missing values per variable.

    Parameters
    ----------
    X : DataFrame to inspect.
    fig_size : [width, height] of the figure.
    size_police : font size of the percentage label drawn above each bar.
    threshold : only variables with strictly more than this NUMBER of
        missing values are plotted. Defaults to half the number of rows.

    Nothing is plotted (a message is printed instead) when X has no missing
    values or when no variable exceeds the threshold.
    """
    self.X = X
    self.fig_size = fig_size
    self.size_police = size_police
    self.threshold = threshold

    if self.threshold is None:
      self.threshold = np.round(X.shape[0]/2, 2)

    # missing_values returns None (after printing a message) when there is
    # nothing missing; stop here instead of indexing into None.
    table = self.missing_values(self.X)
    if table is None:
      return

    to_plot = table[table.Nans > self.threshold]
    if to_plot.empty:
      print('No variable has more than ' + str(self.threshold) + ' missing values')
      return

    # total > 0 here: a frame without rows has no missing values
    total = len(self.X)

    plt.figure(figsize=(self.fig_size[0], self.fig_size[1]))
    g = sns.barplot(x="Variables", y="Nans", data=to_plot)
    for p in g.patches:
      # label each bar with the share of rows that are missing
      percentage = '{:.1f}%'.format(100 * p.get_height()/total)
      x = p.get_x() + p.get_width() / 2 - 0.05
      y = p.get_y() + p.get_height()
      g.annotate(percentage, (x, y), size = self.size_police)
    # The threshold is a count, so report it as such (with its share of rows).
    share = '{:.1f}'.format(100 * self.threshold / total)
    plt.title('Variables with more than ' + str(self.threshold)
              + ' missing values (' + share + '% of rows)')
    plt.show()


  def define_dtypes(self, X:pd.DataFrame,
                          category_columns:list=None,
                          str_columns:list=None,
                          date_columns:list=None,
                          format_date:str=None,
                          num_columns:str=None):
    """ Convert variables to the requested dtypes.

    Parameters
    ----------
    X : DataFrame to convert. It is modified IN PLACE and also returned.
    category_columns : columns cast to 'category'.
    str_columns : columns cast to 'str'.
    date_columns : columns parsed with ``pd.to_datetime``; values that
        cannot be parsed become NaT (errors='coerce').
    format_date : format string passed to ``pd.to_datetime``.
    num_columns : column name or list of columns cast to 'float'. When None,
        every column not listed in the three groups above is cast to 'float'.

    Any of the column groups may be left as None (treated as empty) or given
    as a single column name.
    """
    self.X = X
    self.category_columns = self._as_list(category_columns)
    self.str_columns = self._as_list(str_columns)
    self.date_columns = self._as_list(date_columns)
    self.format_date = format_date
    self.num_columns = num_columns

    self._cast_columns(self.category_columns, 'category')
    self._cast_columns(self.str_columns, 'str')
    # date conversion
    for date_c in self.date_columns:
      self.X[date_c] = pd.to_datetime(self.X[date_c], format=self.format_date , errors='coerce')
    if self.num_columns is not None:
      self._cast_columns(self._as_list(self.num_columns), 'float')
    else:
      # everything that was not explicitly typed above is treated as numeric
      typed = set(self.category_columns) | set(self.str_columns) | set(self.date_columns)
      list_other_variables = [col for col in self.X.columns if col not in typed]
      self._cast_columns(list_other_variables, 'float')

    return self.X

def concat_data(train_proteins, train_peptides, train_clinical):
    """Join the peptide, protein and clinical tables into a single frame.

    Peptides are the left table: each peptide row receives its protein columns
    (matched on visit, month, patient and UniProt), then the clinical columns
    of the same visit (matched on visit, month and patient).
    """
    visit_keys = ['visit_id', 'visit_month', 'patient_id']
    return (
        train_peptides
        .merge(train_proteins, how='left', on=visit_keys + ['UniProt'])
        .merge(train_clinical, how='left', on=visit_keys)
    )

# metrics 
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each term is |y_true - y_pred| / ((|y_true| + |y_pred|) / 2); positions
    where both values are zero contribute 0 instead of dividing by zero.
    """
    abs_error = np.abs(y_true - y_pred)
    half_scale = (np.abs(y_true) + np.abs(y_pred)) / 2

    # only positions with a non-zero denominator are divided
    has_signal = (y_true != 0) | (y_pred != 0)

    ratios = np.zeros(len(y_true))
    ratios[has_signal] = abs_error[has_signal] / half_scale[has_signal]

    return 100 * np.mean(ratios)


def mean_smape(y_true, y_pred):
    """Average the SMAPE over the target columns of two DataFrames.

    Column ``i`` of ``y_true`` is scored against column ``i`` of ``y_pred``
    (columns are paired by position), and the per-column scores returned by
    ``smape`` are averaged over the number of columns of ``y_true``.

    Returns 0.0 when ``y_true`` has no columns.
    """
    n_targets = y_true.shape[1]
    if n_targets == 0:
        # nothing to score; avoids dividing by zero
        return 0.0

    total = 0
    for i in range(n_targets):
        # compare matching columns (the prediction column used to be fixed to 1)
        total = total + smape(y_true.iloc[:, i], y_pred.iloc[:, i])
    # average over the actual number of targets rather than a hard-coded 4
    return total / n_targets

# Introduction 

In this part:
- we will see how to apply non-parametric statistical tests to the data
- create custom scikit-learn functions for imputation (miceforest)
- build baseline models (with and without proteins/peptides)

In [None]:
# importing datasets
import numpy as np
import pandas as pd
# All competition files live under one directory; the example test files sit in a sub-folder.
COMPETITION_DIR = '/kaggle/input/amp-parkinsons-disease-progression-prediction'
EXAMPLE_TEST_DIR = COMPETITION_DIR + '/example_test_files'

sample_submission = pd.read_csv(EXAMPLE_TEST_DIR + '/sample_submission.csv')
train_proteins = pd.read_csv(COMPETITION_DIR + '/train_proteins.csv')
test_proteins = pd.read_csv(EXAMPLE_TEST_DIR + '/test_proteins.csv')
train_peptides = pd.read_csv(COMPETITION_DIR + '/train_peptides.csv')
test_peptides = pd.read_csv(EXAMPLE_TEST_DIR + '/test_peptides.csv')
train_clinical = pd.read_csv(COMPETITION_DIR + '/train_clinical_data.csv')
test = pd.read_csv(EXAMPLE_TEST_DIR + '/test.csv')
sup = pd.read_csv(COMPETITION_DIR + '/supplemental_clinical_data.csv')

In [None]:
# Training tables and their display names, kept in matching order.
data_name = ['train_proteins', 'train_peptides', 'train_clinical', 'sup']
data = [train_proteins, train_peptides, train_clinical, sup]
for name, frame in zip(data_name, data):
    print('Shape of ', name, frame.shape)

In [None]:
# Show the first rows of every table, separated by a dashed rule.
for name, frame in zip(data_name, data):
    print(name, '', frame.head(), '---------------------------', '', sep='\n')

In [None]:
# Merge the peptide, protein and clinical tables into one training frame
train = concat_data(
    train_proteins=train_proteins,
    train_peptides=train_peptides,
    train_clinical=train_clinical,
)
print(train.shape)
train.head()

In [None]:
# Table of missing values per variable in the merged training frame
descrip = Descriptive_analysis()
descrip.missing_values(X=train)

In [None]:
# threshold=0 keeps every variable that has at least one missing value
descrip.plot_missing_values(X=train, fig_size=[12, 6], threshold=0)

In [None]:
t