In [5]:
# Build a ML model from real or simulated genetic data (we have simulated data you can start with)
# Systematically change the genotypes of the genetic variants (i.e. the features) one at a time (genotypes are coded 0,1,2) for each subject in the data set
# Plot the change in risk for each genotype/variant and subject

# This would require you writing a script to run the ML model after changing each genotype.

# The only wrinkle is that we should use a probability machine approach that uses a regression function with 
# the ML method such that the output is a probability instead of a 1 (disease) and 0 (healthy) outcome.


# Import required libraries
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random

# Load breast cancer data
bcdata = pd.read_csv('breastCancer29.csv')

# store in two dfs the cases and controls
dataset_all_cases = bcdata.loc[bcdata.phenotype == 0, :]
dataset_all_controls = bcdata.loc[bcdata.phenotype == 1, :]
all_cases = dataset_all_cases.reset_index(drop=True)
all_controls = dataset_all_controls.reset_index(drop=True)

# Select 50 random numbers in both case and controls
random.seed(12345)
random_cases_indicies = random.sample(range(len(all_cases)), 50)            
random_controls_indicies = random.sample(range(len(all_controls)), 50)

fifty_cases = all_cases.loc[random_cases_indicies, :]            # Get 50 random samples from case array
rest_of_cases = all_cases.drop(random_cases_indicies)

fifty_controls = all_controls.loc[random_controls_indicies, :]   # Get 50 random samples from controls array
rest_of_controls = all_controls.drop(random_controls_indicies)

# Resulting 100 balanced random rows (50 case/50 control) as precision medicine test dataset
pre_med_data = pd.concat([fifty_cases, fifty_controls]).reset_index(drop=True)

test_validate_data = bcdata.drop(pre_med_data.index).reset_index(drop=True)

# Now we have two datasets from original 'breastCancer29.csv' 
# --> pre_med_data (100 participants) & test_validate_data (2405 participants)

In [6]:
pre_med_data.to_csv('premed.csv', index = False)
test_validate_data.to_csv('test_validate_data.csv', index = False) # Split into X/Y train and X/Y test

In [7]:
# Train & validate on the test_validate_data = (bcdata - precision medicine dataset) 
Xdata = test_validate_data.loc[:, bcdata.columns != 'phenotype']
Ydata = test_validate_data['phenotype']
X_train, X_test, Y_train, Y_test = train_test_split(Xdata, Ydata, random_state=42,
                                                    train_size=0.75, test_size=0.25)

In [8]:
tpot = TPOTClassifier(generations=100, population_size=100, verbosity=2, template = 'Classifier', max_time_mins=15, early_stop=5)
tpot.fit(X_train, Y_train)
print(tpot.score(X_test, Y_test))

HBox(children=(IntProgress(value=0, description='Optimization Progress'), HTML(value='')))

Generation 1 - Current best internal CV score: 0.5568467220683286
Generation 2 - Current best internal CV score: 0.5590704832256079
Generation 3 - Current best internal CV score: 0.5590735610957218
Generation 4 - Current best internal CV score: 0.5590735610957218
Generation 5 - Current best internal CV score: 0.5596198830409357
Generation 6 - Current best internal CV score: 0.5596198830409357
Generation 7 - Current best internal CV score: 0.5596198830409357

15.073534266666666 minutes have elapsed. TPOT will close down.
TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.1, min_samples_leaf=18, min_samples_split=20, n_estimators=100)
0.5524126455906821


In [9]:
tpot.export('bcdata_pipeline.py')


True