In [None]:
!pip install fancyimpute

Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting knnimpute>=0.1.0 (from fancyimpute)
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose (from fancyimpute)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29880 sha256=bb3959943a8acb6c391aeb8d8a39500d2f7da54db5a54310e65059d4df89a037
  Stored in directory: /root/.cache/pip/wheels/7b/0c/d3/ee82d1fbdcc0858d96434af108608d01703505d453720c84ed
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  Created wheel for knnimpute: filename=knnimpute-0.1.0-py3-none-

In [None]:
# Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import ttest_ind
from fancyimpute import SoftImpute

In [None]:
# Access path
# Single root for the competition CSVs; change it here to relocate the data.
DATA_DIR = '/Kaggle/data'

# Input files: protein expression, peptide abundance and clinical records.
path_train_proteins = f'{DATA_DIR}/train_proteins.csv'
path_train_peptides = f'{DATA_DIR}/train_peptides.csv'
path_train_clinical_data = f'{DATA_DIR}/train_clinical_data.csv'

In [None]:
# Load the three training CSVs into DataFrames, each indexed by visit_id.
protein_data = pd.read_csv(path_train_proteins).set_index('visit_id')
peptide_data = pd.read_csv(path_train_peptides).set_index('visit_id')
clinical_data = pd.read_csv(path_train_clinical_data).set_index('visit_id')


In [None]:
# Distinct identifiers found in each table.
petient_list = list(clinical_data["patient_id"].unique())
uniprot_list = list(protein_data["UniProt"].unique())
peptide_list = list(peptide_data["Peptide"].unique())
# Distinct values of the on-medication clinical state column.
clinical_list = list(clinical_data["upd23b_clinical_state_on_medication"].unique())

# Report how many distinct ids each table contains.
for label, values in (
    ("# of unique patient Ids:", petient_list),
    ("# of unique Uniprot (protein) Ids:", uniprot_list),
    ("# of unique Peptide Ids:", peptide_list),
):
    print(label, len(values))


# of unique patient Ids: 248
# of unique Uniprot (protein) Ids: 227
# of unique Peptide Ids: 968


In [None]:
def normalize_column(df, column_name, method='minmax'):
    """
    Scale one column of a dataframe with either Min-Max or Z-score scaling.

    The dataframe is modified in place and the same object is returned.

    Parameters:
    - df: the input dataframe (written to directly)
    - column_name: the name of the column to be scaled
    - method: 'minmax' overwrites `column_name` with the scaled values;
      'zscore' leaves `column_name` untouched and stores the result in a new
      column named `column_name + '_standardized'`

    Returns:
    - df: the same dataframe, now carrying the scaled column

    Raises:
    - ValueError: if `method` is neither 'minmax' nor 'zscore'
    """
    # Reject unknown methods up front, before anything is written to df.
    if method not in ('minmax', 'zscore'):
        raise ValueError("Method should be either 'minmax' or 'zscore'")

    if method == 'minmax':
        target_column = column_name
        scaler = MinMaxScaler()
    else:
        target_column = column_name + '_standardized'
        scaler = StandardScaler()

    # The scalers expect a 2-D input, hence the double-bracket selection.
    df[target_column] = scaler.fit_transform(df[[column_name]])
    return df

In [None]:
# Min-Max scale the measurement column of each table.
# normalize_column writes into the frame it receives and returns that same
# object, so protein_data / peptide_data themselves hold the scaled values.
normalized_protein = normalize_column(df=protein_data, column_name='NPX', method='minmax')
normalized_peptide = normalize_column(
    df=peptide_data,
    column_name='PeptideAbundance',
    method='minmax',
)

# Merging protein data with peptide data and the medication column of the clinical data

In [None]:
# Function to prepare dataset with all the steps mentioned above:
def prepare_dataset(normalized_protein, normalized_peptide):
    """
    Build one wide feature table per visit from the long protein and peptide tables.

    Parameters:
    - normalized_protein: long-format protein dataframe with 'UniProt' and 'NPX'
      columns, keyed by 'visit_id' (index name or column)
    - normalized_peptide: long-format peptide dataframe with 'Peptide' and
      'PeptideAbundance' columns, keyed by 'visit_id' in the same way

    Returns:
    - pro_pep_df: dataframe with a 'visit_id' column, then one column per
      UniProt id, then one column per Peptide; combinations that were never
      measured are NaN
    """
    # Step 1: Grouping - average repeated measurements of the same
    # (visit, protein) or (visit, peptide) pair. The function's own arguments
    # are used here rather than any notebook-level variables.
    df_protein_grouped = (
        normalized_protein.groupby(['visit_id', 'UniProt'])['NPX'].mean().reset_index()
    )
    df_peptide_grouped = (
        normalized_peptide.groupby(['visit_id', 'Peptide'])['PeptideAbundance'].mean().reset_index()
    )

    # Step 2: Pivoting - long to wide, one row per visit.
    df_protein = (
        df_protein_grouped
        .pivot(index='visit_id', columns='UniProt', values='NPX')
        .rename_axis(columns=None)
        .reset_index()
    )
    df_peptide = (
        df_peptide_grouped
        .pivot(index='visit_id', columns='Peptide', values='PeptideAbundance')
        .rename_axis(columns=None)
        .reset_index()
    )

    # Step 3: Merging - left join, so only visits present in the protein table
    # are kept; their missing peptide values become NaN.
    pro_pep_df = df_protein.merge(df_peptide, on=['visit_id'], how='left')

    return pro_pep_df



In [None]:
# Wide table: one row per visit, one column per protein / peptide.
clean_data = prepare_dataset(
    normalized_protein=normalized_protein,
    normalized_peptide=normalized_peptide,
)

In [None]:
# Preview the merged table (pandas truncates the display of large frames).
clean_data

Unnamed: 0,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,10053_0,0.000015,0.000655,,,0.000012,0.000004,0.000135,0.000024,0.000272,...,0.001132,,0.024625,0.000433,0.003262,0.000429,0.000583,0.002966,,0.000040
1,10053_12,0.000017,0.000709,,,,,0.000321,0.000024,0.000267,...,0.001124,,0.027981,0.000206,0.001990,0.000515,0.000689,0.002539,0.000276,0.000142
2,10053_18,0.000021,0.000826,0.000011,0.000040,,0.000004,0.000206,0.000026,0.000274,...,0.001235,,0.030346,0.000218,0.002775,0.000354,0.000718,0.002503,0.000293,0.000119
3,10138_12,0.000020,0.000806,0.000015,0.000044,0.000037,0.000010,0.000255,0.000089,0.000332,...,0.001054,0.000053,0.021819,0.000270,0.001838,0.000502,0.000727,0.003089,0.000367,0.000055
4,10138_24,0.000019,0.000850,0.000007,0.000028,0.000047,0.000004,0.000246,0.000085,0.000392,...,0.001153,0.000036,0.019702,0.000391,0.002779,0.000453,0.000625,,0.000319,0.000027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,8699_24,0.000016,0.000652,0.000039,,0.000027,0.000010,0.000126,0.000076,0.000414,...,0.001622,0.000048,0.049065,0.000188,0.005181,0.000665,0.000748,0.003199,0.000449,0.000307
1109,942_12,0.000011,0.000588,0.000030,0.000024,0.000030,0.000003,0.000141,0.000061,0.000345,...,0.000969,0.000027,0.002094,0.000200,0.001401,0.000369,0.000436,0.002720,0.000252,
1110,942_24,,0.000574,0.000037,0.000038,0.000027,0.000002,0.000187,0.000059,0.000303,...,0.001037,0.000031,,0.000358,0.002682,0.000383,0.000417,0.003141,0.000296,0.000122
1111,942_48,0.000019,0.000410,0.000036,0.000043,0.000036,0.000003,0.000134,0.000049,0.000273,...,0.000770,0.000035,,0.000157,0.001294,0.000354,0.000361,0.003540,0.000286,0.000116


# Checking type of missing data and imputing

In [None]:
def determine_missingness_type(df, column, alpha=0.05, min_missing=30):
    """
    Heuristically label the missingness of `column` as 'MCAR' or 'MAR or MNAR'.

    Rows are split by whether `column` is missing. Every other numeric column
    is then compared between the two groups with a two-sample t-test
    (scipy `ttest_ind`, NaNs omitted). If any comparison is significant, the
    missingness is taken to depend on observed data.

    Parameters:
    - df: the input dataframe
    - column: the name of the column whose missing values are examined
    - alpha: significance threshold applied to each t-test (default 0.05)
    - min_missing: if fewer rows than this are missing, no test is run and
      'MCAR' is returned (default 30)

    Returns:
    - 'MAR or MNAR' if at least one t-test has p < alpha, otherwise 'MCAR'

    NOTE(review): no multiple-comparison correction is applied, so with many
    columns a single p < alpha is likely to occur by chance.
    """
    is_missing = df[column].isnull()
    missing = df[is_missing]
    not_missing = df[~is_missing]

    # Too few missing rows to test; treat as completely at random.
    if len(missing) < min_missing:
        return 'MCAR'

    for col in df.columns:
        # Only numeric columns can be t-tested; skip the examined column itself.
        if col == column or not pd.api.types.is_numeric_dtype(df[col]):
            continue
        _, p_val = ttest_ind(missing[col], not_missing[col], nan_policy='omit')
        # One significant difference decides the outcome, so stop early.
        # A NaN p-value (too few observations) compares False and is ignored.
        if p_val < alpha:
            return 'MAR or MNAR'

    return 'MCAR'

def impute_mcar(df, column):
    """
    Fill the missing values of `column` with that column's mean, in place.

    Parameters:
    - df: the dataframe to modify
    - column: the name of the column to impute

    Returns:
    - None; `df` itself is updated
    """
    avg = df[column].mean()
    # Assign the filled column back to df. Calling fillna(inplace=True) on
    # df[column] is chained assignment: it acts on an intermediate object,
    # which pandas 2.x warns about and which leaves df unchanged under
    # Copy-on-Write.
    df[column] = df[column].fillna(avg)

def impute_mar_mnar(df):
    """
    Impute every missing value in `df` with SoftImpute, leaving 'visit_id' as is.

    Parameters:
    - df: dataframe with a 'visit_id' column; all remaining columns are passed
      to SoftImpute

    Returns:
    - df_filled: a new dataframe holding the imputed columns followed by
      'visit_id' as the last column, with the same row index as `df`
    """
    # Temporarily store and remove the visit_id column
    visit_ids = df['visit_id']
    df_temp = df.drop(columns=['visit_id'])

    # Perform imputation on the remaining columns
    imputer = SoftImpute()
    filled_values = imputer.fit_transform(df_temp)

    # Rebuild the DataFrame on the original row index. Column assignment
    # aligns by index label, so without this visit_id would only line up for
    # a default 0..n-1 index.
    df_filled = pd.DataFrame(filled_values, columns=df_temp.columns, index=df_temp.index)
    df_filled['visit_id'] = visit_ids
    return df_filled

def handle_nans(df):
    """
    Impute all missing values in `df`, choosing the method per column.

    Columns are visited in order. A column judged 'MCAR' is mean-imputed; the
    first column judged otherwise triggers SoftImpute on the whole frame.

    Parameters:
    - df: dataframe with a 'visit_id' column; it is not modified

    Returns:
    - a new dataframe with missing values imputed. If SoftImpute ran,
      'visit_id' is the last column.

    NOTE(review): the result depends on column order. MCAR columns seen
    before the first non-MCAR column get the mean; columns seen after it have
    already been filled by SoftImpute.
    """
    # Work on a copy: otherwise the caller's frame is mean-imputed in place
    # even when the returned object is a different, SoftImpute-built frame.
    df = df.copy()

    # Snapshot the column names, because df may be rebound inside the loop.
    for col in list(df.columns):
        if df[col].isnull().any():
            missing_type = determine_missingness_type(df, col)
            if missing_type == 'MCAR':
                impute_mcar(df, col)
            else:
                df = impute_mar_mnar(df)
    return df


In [None]:
# Impute every missing value in the merged feature table.
features_df = handle_nans(df=clean_data)

[SoftImpute] Max Singular Value of X_init = 25.597482
[SoftImpute] Iter 1: observed MAE=0.000506 rank=13
[SoftImpute] Iter 2: observed MAE=0.000503 rank=11
[SoftImpute] Iter 3: observed MAE=0.000505 rank=11
[SoftImpute] Iter 4: observed MAE=0.000505 rank=11
[SoftImpute] Iter 5: observed MAE=0.000506 rank=11
[SoftImpute] Iter 6: observed MAE=0.000506 rank=11
[SoftImpute] Iter 7: observed MAE=0.000506 rank=11
[SoftImpute] Iter 8: observed MAE=0.000506 rank=11
[SoftImpute] Iter 9: observed MAE=0.000506 rank=11
[SoftImpute] Iter 10: observed MAE=0.000506 rank=11
[SoftImpute] Iter 11: observed MAE=0.000506 rank=11
[SoftImpute] Iter 12: observed MAE=0.000506 rank=11
[SoftImpute] Iter 13: observed MAE=0.000506 rank=11
[SoftImpute] Iter 14: observed MAE=0.000506 rank=11
[SoftImpute] Iter 15: observed MAE=0.000506 rank=11
[SoftImpute] Iter 16: observed MAE=0.000506 rank=11
[SoftImpute] Iter 17: observed MAE=0.000506 rank=11
[SoftImpute] Iter 18: observed MAE=0.000506 rank=11
[SoftImpute] Iter 1

In [None]:
# Move 'visit_id' to the front, keeping the other columns in their current order
cols = ['visit_id']
cols.extend(name for name in features_df.columns if name != 'visit_id')
features_df = features_df.loc[:, cols]

# Normalize the target data

In [None]:
# Create a new dataframe with selected columns.
# .copy() makes it an independent frame, so assigning scaled columns to it
# later does not raise SettingWithCopyWarning.
updrs_df = clinical_data[['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']].copy()

In [None]:
# Min-Max scale each of the four UPDRS target columns
for column in ('updrs_1', 'updrs_2', 'updrs_3', 'updrs_4'):
    updrs_df = normalize_column(updrs_df, column, method='minmax')

# Turn the visit_id index back into a regular column
updrs_df = updrs_df.reset_index()
print(updrs_df.head())


  visit_id   updrs_1  updrs_2   updrs_3  updrs_4
0     55_0  0.303030    0.150  0.174419      NaN
1     55_3  0.303030    0.175  0.290698      NaN
2     55_6  0.242424    0.250  0.395349      NaN
3     55_9  0.242424    0.225  0.348837      0.0
4    55_12  0.303030    0.250  0.476744      0.0


In [None]:
# Performing an inner join based on 'visit_id': keep only the visits that
# have both features and clinical scores
features_data = features_df.merge(updrs_df[['visit_id']], on='visit_id', how='inner')

# Extracting the targets for these common visit_ids, in the same row order as
# features_data. Filtering updrs_df with isin() would keep the clinical
# table's order, so row i of the two saved files would describe different
# visits.
targets_df = features_data[['visit_id']].merge(updrs_df, on='visit_id', how='left')

# Guard the row alignment that any positional use of the two tables relies on
assert features_data['visit_id'].tolist() == targets_df['visit_id'].tolist(), \
    "features_data and targets_df are not aligned on visit_id"

# Printing the shapes of the resulting DataFrames
print(features_data.shape, targets_df.shape)


(1068, 1196) (1068, 5)


In [None]:
# Directory the processed feature and target CSVs are written to.
SAVING_PATH = '/Kaggle/data'

In [None]:
# Write the feature table to disk without the row index.
features_data.to_csv(SAVING_PATH + '/features_data.csv', index=False)

In [None]:
# Write the target table to disk without the row index.
targets_df.to_csv(SAVING_PATH + '/targets_data.csv', index=False)