In [None]:
%matplotlib inline
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Load Data

In [None]:
import os

# Directory holding the competition CSV files. Set the HEART_DATA_DIR
# environment variable to point at your own copy; the fallback is the path
# originally hard-coded here. Joining with '' guarantees a trailing separator,
# which the plain string concatenation in the loading cell relies on.
data_dir = os.path.join(
    os.environ.get('HEART_DATA_DIR', '/Users/nora/Box/dssg/DrivenData/Heart/data/'),
    ''
)

In [None]:
# Read the three input tables: test features, training features, training labels.
test = pd.read_csv('{}{}'.format(data_dir, 'test_values.csv'))
train = pd.read_csv('{}{}'.format(data_dir, 'train_values.csv'))
labels = pd.read_csv('{}{}'.format(data_dir, 'train_labels.csv'))

In [None]:
# Number of rows in the raw training table.
npatients = train.shape[0]

# Prepare Data

In [None]:
def convert_columns(df, cols=None):
    """Return a copy of ``df`` with the listed columns cast to strings.

    Casting a numerically coded column to ``str`` makes ``pd.get_dummies``
    treat it as categorical later in the pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.
    cols : iterable of column labels, optional
        Columns to cast. ``None`` (the default) converts nothing.

    Returns
    -------
    pandas.DataFrame
        A new frame with the requested columns converted.
    """
    if cols is None:
        cols = ()

    # Work on a copy so re-running a cell never sees a half-converted input.
    converted = df.copy()
    for col in cols:
        converted[col] = converted[col].astype(str)
    return converted
        
def standardize(df, numeric_only=True):
    """Return a copy of ``df`` with its int64/float64 columns z-scored.

    Each selected column has its own mean subtracted and is divided by its own
    standard deviation as computed by ``DataFrame.std()``. All other columns
    are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.
    numeric_only : bool, optional
        Kept for backward compatibility. It is not consulted: only the
        int64/float64 columns are ever scaled.

    Returns
    -------
    pandas.DataFrame
        A new frame with the numeric columns standardized.
    """
    numeric = df.select_dtypes(include=['int64', 'float64'])
    scaled = df.copy()
    if numeric.shape[1] == 0:
        # nothing to scale
        return scaled

    # subtract mean and divide by std; a zero std (constant column) is swapped
    # for 1 so the column becomes 0 rather than NaN from 0/0
    spread = numeric.std().replace(0, 1)
    scaled[numeric.columns] = (numeric - numeric.mean()) / spread

    return scaled

def pre_process_data(df, enforce_cols=None):
    """Standardize numeric columns, one-hot encode the rest, and align columns.

    Steps, in order: ``standardize`` the frame, expand categorical columns
    with ``pd.get_dummies``, optionally align the columns to ``enforce_cols``,
    and replace any remaining NaN with 0. The shape is printed after the
    first three stages.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature table. A copy is processed; the argument is not modified.
    enforce_cols : iterable of column labels, optional
        Reference column set (e.g. the processed training columns). When
        given, the result has exactly these columns in exactly this order:
        extra columns are dropped and missing ones are added filled with 0.

    Returns
    -------
    pandas.DataFrame
        The processed feature table.
    """
    print("Input shape:\t{}".format(df.shape))

    # copy first so the caller's frame is untouched whatever standardize does
    df = standardize(df.copy())
    print("After standardization {}".format(df.shape))

    # create dummy variables for categoricals
    df = pd.get_dummies(df)
    print("After converting categoricals:\t{}".format(df.shape))

    # match test set and training set columns; reindex also fixes the column
    # ORDER, which matters because the estimators consume features by position
    if enforce_cols is not None:
        df = df.reindex(columns=list(enforce_cols), fill_value=0)

    return df.fillna(0)

## Prepare Train

In [None]:
# Set the patient_id column aside and remove it from the feature table, mark
# the two coded columns as categorical, then run the preprocessing pipeline.
# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
patient_id = train['patient_id'].values
train = train.drop('patient_id', axis=1)
train = convert_columns(train, cols=['chest_pain_type', 'resting_ekg_results'])
train = pre_process_data(train)

In [None]:
# Preview the first rows of the processed training features.
train.head()

## Prepare Labels

In [None]:
# Drop the id column and flatten what is left into a 1-D label array.
labels = labels.drop('patient_id', axis=1).values.ravel()

## Prepare Test

In [None]:
# Same preparation as the training table. `.values` replaces the removed
# Series.as_matrix(). Passing the processed training columns as enforce_cols
# keeps the test matrix aligned with what the models are fitted on, even if a
# category level is absent from (or only present in) the test file.
test_patient_id = test['patient_id'].values
test = test.drop('patient_id', axis=1)
test = convert_columns(test, cols=['chest_pain_type', 'resting_ekg_results'])
test = pre_process_data(test, enforce_cols=train.columns)

## Split Data

In [None]:
# cutoff = int(npatients*0.75)
# train1 = train.iloc[:cutoff]
# train2 = train.iloc[cutoff:]
# labels1 = labels[:cutoff]
# labels2 = labels[cutoff:]
# print train1.shape, train2.shape

In [None]:
# 75/25 shuffled hold-out split. random_state is fixed so the validation
# metrics below are reproducible across kernel restarts.
train1, train2, labels1, labels2 = train_test_split(
    train, labels, train_size=0.75, test_size=0.25, shuffle=True, random_state=0
)

# Prepare Submission

In [None]:
def prepare_submission(model, test, fname='submission.csv', patient_ids=None):
    """Write a two-column submission CSV from a fitted classifier.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``
    test : feature table passed straight to ``model.predict_proba``
    fname : str, optional
        Output CSV path.
    patient_ids : sequence, optional
        One id per row of ``test``. Defaults to the notebook-level
        ``test_patient_id`` captured when the test set was prepared.

    Notes
    -----
    The second ``predict_proba`` column is written as
    ``heart_disease_present``; this assumes it is the positive class
    (TODO confirm against ``model.classes_``).
    """
    if patient_ids is None:
        patient_ids = test_patient_id

    probs = model.predict_proba(test)

    # Build the frame column by column so ids and probabilities keep their own
    # dtypes; stacking them into one array would coerce both to a common type.
    sub = pd.DataFrame(
        {'patient_id': patient_ids, 'heart_disease_present': probs[:, 1]},
        columns=['patient_id', 'heart_disease_present']
    )
    sub.to_csv(fname, sep=',', index=False)

In [None]:
def get_score(model, test, labels, printing=True):
    """Return the log loss of ``model`` on ``(test, labels)``.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``score``
    test : feature table to evaluate on
    labels : true labels for ``test``
    printing : bool, optional
        When true, also print ``model.score`` (labelled "Accuracy") and the
        log loss, both rounded to two decimals.

    Returns
    -------
    float
        ``log_loss(labels, model.predict_proba(test))``.
    """
    probs = model.predict_proba(test)
    loss = log_loss(labels, probs)
    if printing:
        # single-argument print(...) behaves the same on Python 2 and 3
        print('Accuracy: %.2f' % (model.score(test, labels)))
        print('Log loss: %.2f' % (loss))
    return loss

# Extra Trees Classifier

In [None]:
# tree ensembles are often a good model to try first, especially when we have numeric and categorical variables in our feature space.
def train_etc(features, labels, **kwargs):
    """Fit and return an ``ExtraTreesClassifier``.

    Parameters
    ----------
    features : training feature table
    labels : training labels
    **kwargs
        Forwarded to ``ExtraTreesClassifier``. They override the defaults used
        here (``n_estimators=50``, ``random_state=0``).

    Returns
    -------
    ExtraTreesClassifier
        The fitted model.
    """
    # instantiate model
    params = {'n_estimators': 50, 'random_state': 0}
    params.update(kwargs)
    model = ExtraTreesClassifier(**params)

    # train model
    model.fit(features, labels)

    return model

In [None]:
# Fit the extra-trees model on the training portion of the split.
model_a = train_etc(train1, labels1)

In [None]:
# Hard class predictions and class probabilities for the held-out portion.
preds = model_a.predict(train2)
probs = model_a.predict_proba(train2)

In [None]:
# Fraction of held-out rows predicted correctly. np.where(...) returns a tuple,
# so taking its len() would always give 1; averaging the boolean match array
# gives the real accuracy.
print('Accuracy: %.2f' % (np.mean(preds == labels2)))
print('Log loss: %.2f' % (log_loss(labels2, probs)))

In [None]:
# Refit the extra-trees model on every training row.
model_b = train_etc(train, labels)

In [None]:
# Class probabilities and hard predictions for the test set.
# Note: this rebinds `probs` and `preds` from the validation cells above.
probs = model_b.predict_proba(test)
preds = model_b.predict(test)

In [None]:
# Build the submission table column by column so the ids and the probabilities
# keep their own dtypes (stacking them into one array coerces both).
sub = pd.DataFrame(
    {'patient_id': test_patient_id, 'heart_disease_present': probs[:, 1]},
    columns=['patient_id', 'heart_disease_present']
)

In [None]:
# Write the table to 'submission1.csv' without the index (comma is the default separator).
sub.to_csv(
    'submission1.csv',
    header=['patient_id', 'heart_disease_present'],
    index=False
)

In [None]:
# Write model_b's test-set probabilities to 'submission.csv' via the helper.
prepare_submission(model_b, test, 'submission.csv')

# Random Forest Classifier

In [None]:
# random forests are often a good model to try first, especially when we have numeric and categorical variables in our feature space.
def train_rfc(features, labels, **kwargs):
    """Fit and return a ``RandomForestClassifier``.

    Parameters
    ----------
    features : training feature table
    labels : training labels
    **kwargs
        Forwarded to ``RandomForestClassifier``. They override the defaults
        used here (``n_estimators=50``, ``random_state=0``).

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    # instantiate model
    params = {'n_estimators': 50, 'random_state': 0}
    params.update(kwargs)
    model = RandomForestClassifier(**params)

    # train model
    model.fit(features, labels)

    return model

In [None]:
# Fit the random forest on the training portion of the split.
model = train_rfc(train1, labels1)

In [None]:
# Print held-out metrics; binding the returned log loss to `_` keeps it out of the cell output.
_ = get_score(model, train2, labels2)

# Recursive Feature Elimination

In [None]:
# Recursive feature elimination around a random forest, keeping 13 features.
rfc = RandomForestClassifier(n_estimators=50, random_state=12)
rfe = RFE(estimator=rfc, n_features_to_select=13).fit(train, labels)

In [None]:
# summarize the selection of the attributes:
# support_ is the boolean keep-mask, ranking_ the per-feature rank
print(rfe.support_)
print(rfe.ranking_)
# column positions sorted by ascending rank, used below to list features in rank order
idx_rank = np.argsort(rfe.ranking_)

In [None]:
# Names of the columns that RFE did not keep.
cols = train.keys()
idx = np.flatnonzero(~rfe.support_)
cols[idx]

In [None]:
# List every feature with its RFE rank, best rank first.
# A single formatted string keeps the output identical on Python 2 and 3.
for i, c in enumerate(cols[idx_rank]):
    print('%s %s' % (rfe.ranking_[idx_rank][i], c))

In [None]:
# Preview the full processed feature table, for comparison with the reduced one below.
train.head()

In [None]:
# Same preview restricted to the columns kept by RFE.
train.loc[:, rfe.support_].head()

In [None]:
# Refit the random forest on the training split using only the RFE-selected columns.
model = train_rfc(train1.loc[:, rfe.support_], labels1)

In [None]:
# Held-out metrics for the reduced feature set; the returned log loss is the cell's output.
get_score(model, train2.loc[:, rfe.support_], labels2)

In [None]:
# Refit on every training row with the selected columns.
model = train_rfc(train.loc[:, rfe.support_], labels)

In [None]:
# Write the reduced-feature model's test-set probabilities to 'submission4.csv'.
prepare_submission(model, test.loc[:, rfe.support_], 'submission4.csv')

In [None]:
# Sweep the number of features kept by RFE and report each new best held-out log loss.
# NOTE(review): RFE is fitted on the full training set, which includes the
# held-out rows (train2) used for scoring, so these scores are likely
# optimistic. The repeated-split cell below fits RFE on train1 only.
llmin = np.inf
rfc = RandomForestClassifier(n_estimators=50, random_state=12)
for n_feat in range(1, train.shape[1] + 1):
    # blank separator line; print('') works on both Python 2 and 3
    print('')
    rfe = RFE(rfc, n_features_to_select=n_feat)
    rfe = rfe.fit(train, labels)
    model = train_rfc(train1.loc[:, rfe.support_], labels1)
    logloss = get_score(model, train2.loc[:, rfe.support_], labels2)
    if logloss < llmin:
        llmin = logloss
        print('New best score with n_feat = %i' % n_feat)

In [None]:
# Same sweep, but each n_feat is scored as the mean log loss over several
# random 75/25 splits, with RFE fitted on the training part of each split only.
llmin = np.inf
n_repeats = 10
rfc = RandomForestClassifier(n_estimators=50, random_state=12)
for n_feat in range(1, train.shape[1] + 1):
    print('n_feat = %i' % n_feat)
    score = 0.
    for i in range(n_repeats):
        train1, train2, labels1, labels2 = train_test_split(train, labels, train_size=0.75, test_size=0.25, shuffle=True)
        rfe = RFE(rfc, n_features_to_select=n_feat)
        rfe = rfe.fit(train1, labels1)
        model = train_rfc(train1.loc[:, rfe.support_], labels1)
        logloss = get_score(model, train2.loc[:, rfe.support_], labels2, printing=False)
        score += logloss
    # average over the number of splits actually run (previously divided by 11)
    mean_score = score / n_repeats
    print(mean_score)
    if mean_score < llmin:
        llmin = mean_score
        print('New best score with n_feat = %i' % n_feat)

# TSNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
# 2-D t-SNE embedding of the processed training features. The seed is fixed
# because t-SNE is stochastic and the plot would otherwise change on every run.
X = TSNE(n_components=2, random_state=0).fit_transform(train)

In [None]:
# Scatter the two embedding coordinates, coloured by label.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', edgecolor='none', vmax=1.5, s=30)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# The grid below includes penalty='l1', which needs a solver that supports it.
# liblinear handles both 'l1' and 'l2'; naming it explicitly keeps the search
# working on scikit-learn versions whose default solver is lbfgs (l2 only).
logreg = LogisticRegression(solver='liblinear')
params = {'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]}

In [None]:
from sklearn.model_selection import GridSearchCV
# 10-fold cross-validated search over the penalty / C grid.
logreg_gs = GridSearchCV(
    estimator=logreg,
    param_grid=params,
    cv=10,
    return_train_score=True
)
logreg_gs.fit(train, labels)

In [None]:
# Report the winning hyper-parameters and how many coefficients are non-zero.
# Single formatted strings reproduce the Python 2 `print a, b` output on Python 3.
print('Best params:  %s' % (logreg_gs.best_params_,))
best_lr = logreg_gs.best_estimator_
coefs = best_lr.coef_
# NOTE(review): coefs.size is the total number of fitted coefficients, not a tuned quantity.
print('Best number of features:  %s' % (coefs.size,))
print('Number of selected features: %i' % np.count_nonzero(coefs))

In [None]:
# best_score_ is the mean cross-validated score of the best parameter setting, shown as a percentage.
print('Best accuracy: %.2f' % (logreg_gs.best_score_ * 100))
print('Best parameters:  %s' % (logreg_gs.best_params_,))

In [None]:
# Write the grid-searched logistic regression's test-set probabilities to 'submission_logreg_gs.csv'.
prepare_submission(logreg_gs, test, fname='submission_logreg_gs.csv')