In [128]:
import pandas as pd
file = 'I-SPY_1_All_Patient_Clinical_and_Outcome_Data.xlsx'

# Load the predictors sheet and index it by subject.
# The sheet is passed positionally because the keyword was renamed from
# `sheetname` to `sheet_name` across pandas releases; the positional form
# works with both.
ISPY = pd.read_excel(file, 'predictors').set_index('SUBJECTID')

# Drop columns that are not used as predictors
ISPY = ISPY.drop(
    ['DataExtractDt', 'Her2MostPos', 'HR_HER2_CATEGORY', 'HR_HER2_STATUS'],
    axis=1,
)

# One-hot encode race and drop the original variable.
# `prefix` must be a plain string for Series input: a list is formatted
# verbatim into the column names (e.g. "['Race']_1" instead of "Race_1").
race_dummies = pd.get_dummies(ISPY['race_id'], prefix='Race')
ISPY = ISPY.join(race_dummies).drop(['race_id'], axis=1)

# Load the outcomes sheet and index it by subject
outcomes_df = pd.read_excel(file, 'outcomes').set_index('SUBJECTID')

# Attach PCR to the predictors using the shared SUBJECTID index; PCR ends up
# as the last column of df, which later cells rely on via iloc[:, -1].
# Then drop every row that has a missing value in any column.
df = ISPY.join(outcomes_df['PCR']).dropna()

In [129]:
# Target vector: the final column of df, extracted as a NumPy array
y = df.iloc[:, -1].values
# Echo the array as this cell's output
y

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,
        0.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,
        1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,
        0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,
        1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0

In [130]:
# Logistic Regression
# Modules
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
try:
    # scikit-learn >= 0.18: the splitting / cross-validation helpers live here
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:
    # Older scikit-learn: fall back to the module that was removed in 0.20
    from sklearn.cross_validation import train_test_split, cross_val_score

# define X and y
# NOTE(review): the slice assumes the first 15 columns are exactly the
# predictors. The number of race dummy columns depends on the data, so
# confirm that 0:15 neither omits a predictor nor includes the target.
X = df.iloc[:, 0:15].values
y = df.iloc[:, -1].values

# instantiate a logistic regression model, and fit it on the full data set
model = LogisticRegression()
model = model.fit(X, y)

# Accuracy on the same data the model was trained on (no cross-validation),
# so this is an optimistic estimate
model.score(X, y)

0.79005524861878451

In [170]:
# Evaluate the model on a held-out test set: 10% of the samples are set aside,
# and random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Allocate a fresh model and train it on the training portion only
model2 = LogisticRegression()
model2.fit(X_train, y_train)

# predict outcomes for the test set
predicted = model2.predict(X_test)

# calculate class probabilities; column 1 (the second class in
# model2.classes_) is used as the score for the AUC below
probs = model2.predict_proba(X_test)

# generate evaluation metrics
# accuracy
print(20*"--")
print('The accuracy is: ')
print(metrics.accuracy_score(y_test, predicted)*100)

# AUC
print(20*"--")
print('The AUC is: ')
print(metrics.roc_auc_score(y_test, probs[:, 1]))

# confusion matrix
print(20*"--")
print('The confusion matrix is: ')
print(metrics.confusion_matrix(y_test, predicted))

# Evaluate the model using 10-fold cross-validation on the full data set.
# cv=10 requests 10 folds, not leave-one-out, so the label says so.
scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)
print(20*"--")
print('The 10-fold cross-validation accuracy is: ')
print(scores.mean()*100)

----------------------------------------
The accuracy is: 
78.9473684211
----------------------------------------
The AUC is: 
0.914285714286
----------------------------------------
The confusion matrix is: 
[[13  1]
 [ 3  2]]
----------------------------------------
The leave-one-out  accuracy is: 
77.6694186447
