# Task 3. Splitting database into training and testing sets.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from utils import *

In [2]:
master_df = pd.read_csv(parameters['master_augmented_2'], sep=',')
master_df.head()

Unnamed: 0,patient,5:195139,5:336952,5:389603,5:851582,5:1144802,5:1167618,5:1175892,5:1398007,5:1447860,...,22:49363742,22:49578486,22:49651708,22:49666841,22:49797810,22:49855674,22:50021013,22:50258751,22:50458664,superpopulation_code
0,NA19625,0,0,1,1,1,0,0,1,1,...,0,0,1,1,0,1,0,1,0,AFR
1,NA19835,0,0,1,1,1,0,0,0,0,...,0,0,0,1,1,1,0,1,0,AFR
2,NA19900,0,0,1,0,1,0,0,0,0,...,1,0,0,1,1,1,0,0,0,AFR
3,NA19917,0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,1,0,1,0,AFR
4,NA19703,0,1,1,0,1,0,0,1,1,...,0,0,1,1,0,1,0,0,0,AFR


In [3]:
tsv = pd.read_csv(parameters['tsv_file'], sep='\t')
tsv['Superpopulation name'].unique()

array(['African Ancestry', 'South Asian Ancestry',
       'South Asia (SGDP),South Asian Ancestry', 'European Ancestry',
       'European Ancestry,West Eurasia (SGDP)', 'American Ancestry',
       'East Asian Ancestry', 'African Ancestry,Africa (SGDP)',
       'East Asia (SGDP),East Asian Ancestry'], dtype=object)

In [4]:
tsv['Superpopulation code'].unique()

array(['AFR', 'SAS', 'EUR', 'AMR', 'EAS'], dtype=object)

In [5]:
# Drop 'patient' column 
master_df.drop(columns=['patient'], inplace=True)
master_df.head()

Unnamed: 0,5:195139,5:336952,5:389603,5:851582,5:1144802,5:1167618,5:1175892,5:1398007,5:1447860,5:1721485,...,22:49363742,22:49578486,22:49651708,22:49666841,22:49797810,22:49855674,22:50021013,22:50258751,22:50458664,superpopulation_code
0,0,0,1,1,1,0,0,1,1,1,...,0,0,1,1,0,1,0,1,0,AFR
1,0,0,1,1,1,0,0,0,0,1,...,0,0,0,1,1,1,0,1,0,AFR
2,0,0,1,0,1,0,0,0,0,0,...,1,0,0,1,1,1,0,0,0,AFR
3,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,AFR
4,0,1,1,0,1,0,0,1,1,1,...,0,0,1,1,0,1,0,0,0,AFR


In [6]:
# Drop the columns that have the value of AMR in the superpopulation_code column
master_clean = master_df[master_df['superpopulation_code'] != 'AMR']
print(f"Length of master_df: {len(master_df)}\nLength of master_clean: {len(master_clean)}")

Length of master_df: 2504
Length of master_clean: 2157


In [7]:
# Dataframe only with data with AFR
afr_df = master_clean[master_clean['superpopulation_code'] == 'AFR']
# Dataframe only with data with EUR
eur_df = master_clean[master_clean['superpopulation_code'] == 'EUR']
# Dataframe only with data with SAS and EAS
sas_eas_df = master_clean[master_clean['superpopulation_code'] == 'SAS']
sas_eas_df = sas_eas_df.append(master_clean[master_clean['superpopulation_code'] == 'EAS'])

print(f"Length of afr_df: {len(afr_df)}\nLength of eur_df: {len(eur_df)}\nLength of sas_eas_df: {len(sas_eas_df)}")

Length of afr_df: 661
Length of eur_df: 503
Length of sas_eas_df: 993


  sas_eas_df = sas_eas_df.append(master_clean[master_clean['superpopulation_code'] == 'EAS'])


### AFR model splitting

In [8]:
# Function that receives a dataframe and returns a dataframe with the target variable removed and train and test split
def split_dataframe(df, target_variable):
    # Get X and y from the dataframe
    X = df.copy(deep=True)
    y = X[target_variable]
    X.drop(target_variable, axis=1, inplace=True)

    # Split the dataframe into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    return X_train, X_test, y_train, y_test

In [9]:
X_afr_train, X_afr_test, y_afr_train, y_afr_test = split_dataframe(afr_df, 'superpopulation_code')
print(f"""
    X_afr_train: {X_afr_train.shape}, X_afr_test: {X_afr_test.shape}\n
    y_afr_train: {y_afr_train.shape}, y_afr_test: {y_afr_test.shape}
    """)


    X_afr_train: (528, 10028), X_afr_test: (133, 10028)

    y_afr_train: (528,), y_afr_test: (133,)
    


### EUR model splitting

In [10]:
X_eur_train, X_eur_test, y_eur_train, y_eur_test = split_dataframe(eur_df, 'superpopulation_code')
print(f"""
    X_eur_train: {X_eur_train.shape}, X_eur_test: {X_eur_test.shape}\n
    y_eur_train: {y_eur_train.shape}, y_eur_test: {y_eur_test.shape}
    """)


    X_eur_train: (402, 10028), X_eur_test: (101, 10028)

    y_eur_train: (402,), y_eur_test: (101,)
    


### SAS and EAS model splitting

In [11]:
X_sas_eas_train, X_sas_eas_test, y_sas_eas_train, y_sas_eas_test = split_dataframe(sas_eas_df, 'superpopulation_code')
print(f"""
    X_sas_eas_train: {X_sas_eas_train.shape}, X_sas_eas_test: {X_sas_eas_test.shape}\n
    y_sas_eas_train: {y_sas_eas_train.shape}, y_sas_eas_test: {y_sas_eas_test.shape}
    """)


    X_sas_eas_train: (794, 10028), X_sas_eas_test: (199, 10028)

    y_sas_eas_train: (794,), y_sas_eas_test: (199,)
    


---

# Task 4. Training a model for each ancestry

In [12]:
# Function that generates a model and returns the model trained
def generate_model(X_train, y_train):
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    return model

### AFR model training

In [13]:
afr_model = generate_model(X_afr_train, y_afr_train)
# Print the accuracy of the model
print(f"Accuracy of the AFR model: {accuracy_score(y_afr_test, afr_model.predict(X_afr_test))}")

Accuracy of the AFR model: 1.0


### EUR model training

In [14]:
eur_model = generate_model(X_eur_train, y_eur_train)
# Print the accuracy of the model
print(f"Accuracy of the EUR model: {accuracy_score(y_eur_test, eur_model.predict(X_eur_test))}")

Accuracy of the EUR model: 1.0


### SAS and EAS model training

In [15]:
sas_eas_model = generate_model(X_sas_eas_train, y_sas_eas_train)
# Print the accuracy of the model
print(f"Accuracy of the SAS/EAS model: {accuracy_score(y_sas_eas_test, sas_eas_model.predict(X_sas_eas_test))}")

Accuracy of the SAS/EAS model: 0.9447236180904522


---

In [16]:
# Evaluate the models with a 10-fold cross validation
from sklearn.model_selection import cross_val_score

In [17]:
afr_accuracy = cross_val_score(afr_model, X_afr_train, y_afr_train, cv=10).mean()
print(f"Accuracy of the AFR model: {afr_accuracy}")

Accuracy of the AFR model: 1.0


In [18]:
eur_accuracy = cross_val_score(eur_model, X_eur_train, y_eur_train, cv=10).mean()
print(f"Accuracy of the EUR model: {eur_accuracy}")

Accuracy of the EUR model: 1.0


In [19]:
sas_eas_accuracy = cross_val_score(sas_eas_model, X_sas_eas_train, y_sas_eas_train, cv=10).mean()
print(f"Accuracy of the SAS/EAS model: {sas_eas_accuracy}")

Accuracy of the SAS/EAS model: 0.9470411392405064
