In [15]:
# Build a ML model from real or simulated genetic data (we have simulated data you can start with)
# Systematically change the genotypes of the genetic variants (i.e. the features) one at a time (genotypes are coded 0,1,2) for each subject in the data set
# Plot the change in risk for each genotype/variant and subject

# This requires writing a script that re-runs the ML model after changing each genotype.

# The only wrinkle is that we should use a probability machine approach, i.e. pair the ML method with a
# regression function so that the output is a probability rather than a hard 1 (disease) / 0 (healthy) label.


# Import required libraries
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random

# Load breast cancer data
# NOTE(review): the recorded output of this cell lists an 'Unnamed: 0' column, which looks like a
# saved row index being read back as a feature -- confirm, and pass index_col=0 if that is the case.
bcdata = pd.read_csv('breastCancer29.csv')

# Fixed seed and hold-out size so the precision-medicine subset is the same on every run
HOLDOUT_SEED = 42
HOLDOUT_PER_CLASS = 50
holdout_rng = random.Random(HOLDOUT_SEED)   # private RNG: does not disturb the global `random` state

# Store the cases and controls in two dfs, keeping bcdata's own row labels.
# Phenotype coding per the task description: 1 = disease (case), 0 = healthy (control)
# -- TODO confirm against the data dictionary for breastCancer29.csv.
dataset_all_cases = bcdata.loc[bcdata.phenotype == 1, :]
dataset_all_controls = bcdata.loc[bcdata.phenotype == 0, :]

# Positionally re-indexed copies (0..n-1 within each class)
all_cases = dataset_all_cases.reset_index(drop=True)
all_controls = dataset_all_controls.reset_index(drop=True)

# Select 50 random row positions within the cases and within the controls
random_cases_indicies = holdout_rng.sample(range(len(all_cases)), HOLDOUT_PER_CLASS)
random_controls_indicies = holdout_rng.sample(range(len(all_controls)), HOLDOUT_PER_CLASS)

# Pick the sampled rows by position from the frames that still carry bcdata's labels.
# Using the re-indexed copies here would make case and control labels collide and would
# no longer identify rows of bcdata.
fifty_cases = dataset_all_cases.iloc[random_cases_indicies]
rest_of_cases = dataset_all_cases.drop(index=fifty_cases.index)

fifty_controls = dataset_all_controls.iloc[random_controls_indicies]
rest_of_controls = dataset_all_controls.drop(index=fifty_controls.index)

# Resulting 100 balanced random rows (50 case/50 control) as precision medicine test dataset
pre_med_data = pd.concat([fifty_cases, fifty_controls])

# Everything that is NOT held out; the labels in pre_med_data are bcdata labels, so this
# removes exactly the held-out subjects.
test_validate_data = bcdata.drop(index=pre_med_data.index).reset_index(drop=True)

# Guard against leakage between the hold-out set and the train/validate set
assert pre_med_data.index.is_unique, "hold-out rows must be distinct subjects"
assert len(test_validate_data) == len(bcdata) - len(pre_med_data), "hold-out rows were not all removed"

# Now we have two datasets from original 'breastCancer29.csv' --> 100 aside & rest

Unnamed: 0,rs616488,rs11249433,rs4849887,rs2016394,rs1550623,rs6762644,rs4973768,rs1053338,rs1353747,rs1432679,...,rs3817198,rs17356907,rs1292011,rs999737,rs11627032,rs13329835,rs1436904,rs3760982,rs2823093,phenotype
265,0,2,0,1,0,0,1,0,0,1,...,0,0,1,0,0,1,1,1,0,0
1180,1,2,0,1,1,1,2,1,0,1,...,1,0,0,0,0,1,1,0,1,0
842,0,1,0,1,0,1,1,0,1,1,...,0,2,1,0,1,0,1,2,2,0
930,1,1,1,2,1,1,1,0,0,2,...,0,1,0,0,0,0,0,0,0,0
813,0,0,0,0,1,1,1,1,0,0,...,1,0,0,0,0,1,0,2,0,0
162,0,0,1,0,2,2,0,1,0,0,...,1,0,0,2,1,0,2,1,0,0
793,1,2,0,1,0,1,0,1,0,0,...,1,0,2,0,1,0,0,1,2,0
381,1,1,1,1,0,1,2,0,1,1,...,0,0,0,0,0,0,1,1,0,0
1240,1,1,0,1,0,1,2,0,0,0,...,0,1,0,2,1,0,1,1,1,0
323,1,2,0,1,1,1,2,2,0,0,...,1,0,1,0,0,0,0,0,1,0


In [11]:
# Train & validate on the test_validate_data = (bcdata - precision medicine dataset)
SPLIT_SEED = 42   # fixed so the train/test partition is identical on every run

# Features are every column except the label; select them from the frame being split
# rather than through a boolean mask built from another frame's columns.
Xdata = test_validate_data.drop(columns='phenotype')
Ydata = test_validate_data['phenotype']

X_train, X_test, Y_train, Y_test = train_test_split(
    Xdata, Ydata,
    train_size=0.75, test_size=0.25,
    random_state=SPLIT_SEED,
    stratify=Ydata,   # keep the phenotype ratio the same in both partitions
)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((1804, 29), (602, 29), (1804,), (602,))

In [7]:
# Search for a single-classifier pipeline with TPOT and report its accuracy on the held-back split.
# In the recorded run the 15-minute budget (max_time_mins), not `generations`, ended the search.
# NOTE(review): the task description asks for a probability output; check that the selected
# pipeline can produce one (the recorded best pipeline was LinearSVC -- verify it offers predict_proba).
TPOT_SEED = 42   # seeds TPOT's stochastic search
# NOTE(review): with a wall-clock limit the number of evaluated pipelines can still vary between
# runs, so the seed alone may not make the result fully reproducible -- confirm in the TPOT docs.
tpot = TPOTClassifier(
    generations=100,
    population_size=100,
    verbosity=2,
    template='Classifier',
    max_time_mins=15,
    random_state=TPOT_SEED,
)
tpot.fit(X_train, Y_train)
print(tpot.score(X_test, Y_test))

HBox(children=(IntProgress(value=0, description='Optimization Progress'), HTML(value='')))

Generation 1 - Current best internal CV score: 0.5653990178023328
Generation 2 - Current best internal CV score: 0.5653990178023328
Generation 3 - Current best internal CV score: 0.5653990178023328
Generation 4 - Current best internal CV score: 0.5653990178023328
Generation 5 - Current best internal CV score: 0.5698434622467772
Generation 6 - Current best internal CV score: 0.5698434622467772
Generation 7 - Current best internal CV score: 0.5698434622467772
Generation 8 - Current best internal CV score: 0.5698434622467772
Generation 9 - Current best internal CV score: 0.5698434622467772
Generation 10 - Current best internal CV score: 0.5698434622467772
Generation 11 - Current best internal CV score: 0.5698434622467772

15.022356283333332 minutes have elapsed. TPOT will close down.
TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: LinearSVC(input_matrix, C=20.0, dual=True, loss=hinge, penalty=l2, tol=0.0001)
0.5083056478405316


In [8]:
# Persist the best pipeline found by TPOT as a standalone script
tpot.export('bcdata_pipeline.py')

# Write both data subsets to disk so later steps can reload them.
# NOTE(review): to_csv writes the row index as an unnamed first column by default; confirm
# that whatever reads these files expects it.
for frame, csv_path in (
    (pre_med_data, 'premed.csv'),
    (test_validate_data, 'test_validate_data.csv'),
):
    frame.to_csv(csv_path)

True