In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
# Load the training features and labels from CSV (the file names suggest they
# were produced by a SMOTE-ENN resampling step — not verifiable from here).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, i.e. the saved
# row index is read back as a regular feature and is later scaled and fed to
# the models. Consider pd.read_csv(..., index_col=0) — but first confirm that
# processed_test.csv ends up with the same column layout.
X_train_resampled = pd.read_csv('X_train_smoteenn.csv')
y_train_resampled = pd.read_csv('y_train_smoteenn.csv')

X_train_resampled.head(5)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,3.0,0.0,0.0,95.12,18.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1,58.0,1.0,0.0,87.96,39.2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,8.0,0.0,0.0,110.89,17.6,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,70.0,0.0,0.0,69.04,35.9,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,14.0,0.0,0.0,161.28,19.1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [3]:
# Preview the first five label rows (the 'stroke' column is the target).
y_train_resampled.head(5)

Unnamed: 0,stroke
0,0
1,0
2,0
3,0
4,0


# Scaling Variables

In [4]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Standardize every feature column to zero mean / unit variance.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# ndarray on old and new pandas. Fitting and transforming the same array keeps
# the scaler's input type consistent.
# NOTE(review): the scaler is fitted before the train/validation split, so
# validation rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train_resampled.values)

# Building Model

In [5]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from keras.callbacks import EarlyStopping

# baseline model
def create_baseline(n_features):
    """Build and compile the baseline feed-forward binary classifier.

    The network has one hidden sigmoid layer with ``n_features`` units (which
    also declares the input width) followed by a single sigmoid output unit.
    It is compiled with binary cross-entropy loss and the Adam optimizer, and
    reports accuracy as a metric.
    """
    hidden_layer = Dense(
        n_features,
        input_dim=n_features,
        kernel_initializer='normal',
        activation='sigmoid',
    )
    output_layer = Dense(1, kernel_initializer='normal', activation='sigmoid')

    network = Sequential([hidden_layer, output_layer])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

def _positive_class_scores(name, model, x):
    """Return a 1-D array of scores where a larger value means "more likely class 1"."""
    if name == 'svm':
        # predict_proba columns are ordered like model.classes_, so look up the
        # column of label 1 instead of assuming a position.
        positive_column = list(model.classes_).index(1)
        return model.predict_proba(x)[:, positive_column]
    # 'ann': the single sigmoid unit makes predict() return an (n, 1) array of
    # probabilities. 'rf': the regressor's predict() is already a continuous
    # score (a regressor has no predict_proba).
    return np.asarray(model.predict(x)).ravel()


def fit_and_roc(name,x_train, y_train, x_valid, y_valid):
    """Fit the requested model, plot its train/validation ROC curves, and return it.

    Parameters
    ----------
    name : str
        Which model to train: "ann" (Keras network from ``create_baseline``),
        "svm" (``svm.SVC`` with probability estimates) or "rf"
        (``RandomForestRegressor``).
    x_train, y_train
        Training features and binary labels; label 1 is the positive class.
    x_valid, y_valid
        Held-out features and labels. For "ann" they also drive early stopping.

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported model names.
    """
    # fit model
    if name == "ann":
        n_features = x_train.shape[1]
        model = create_baseline(n_features)
        callback = EarlyStopping("val_loss", patience=1, verbose=0, mode='auto')
        # Use the function arguments (not notebook globals) and the Keras 2
        # keyword `epochs` (the Keras 1 spelling `nb_epoch` is no longer accepted).
        model.fit(
                x_train, y_train,
                epochs=20, batch_size=25,
                validation_data=(x_valid, y_valid),
                callbacks=[callback], verbose = 1)
    elif name == 'svm':
        model = svm.SVC(probability=True)
        model.fit(x_train, y_train)
    elif name == 'rf':
        model = RandomForestRegressor(n_jobs=2, n_estimators=150)
        model.fit(x_train, y_train)
    else:
        raise ValueError(
            "Unknown model name {0!r}; expected 'ann', 'svm' or 'rf'".format(name))

    # evaluate training error
    y_train_prob = _positive_class_scores(name, model, x_train)
    fpr, tpr, thresholds = roc_curve(y_train, y_train_prob, pos_label=1)
    roc_auc = auc(fpr, tpr)

    # evaluate testing error
    y_test_prob = _positive_class_scores(name, model, x_valid)
    fpr_test, tpr_test, threshold_test = roc_curve(y_valid, y_test_prob, pos_label=1)
    roc_auc_test = auc(fpr_test, tpr_test)

    # plot roc
    plt.figure(figsize=(5, 5))
    plt.plot(fpr, tpr,
             color='darkorange', lw=2, linestyle='-',
             label='Training ROC Curve (area = {0:.2f})'.format(roc_auc))
    plt.plot(fpr_test, tpr_test,
             color='deeppink', lw=2, linestyle='-',
             label='Testing ROC Curve (area = {0:.2f})'.format(roc_auc_test))
    plt.plot([0,1], [0,1], 'k--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Title names the model actually fitted instead of a hard-coded one.
    plt.title('Receiver Operating Characteristic - {0}'.format(name))
    plt.legend(loc='lower right')
    plt.show()

    return model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
# Train on 80% of the scaled rows; the held-out part is used for validation.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_scaled,
    y_train_resampled['stroke'].values,
    train_size=0.80,
    random_state=666,
)



In [7]:
# Train the neural network. NN_model is consumed by the test-set prediction
# cell further down, so this call must run for "Restart & Run All" to succeed.
NN_model = fit_and_roc('ann',X_train, y_train, X_valid, y_valid)

# XGBoost

In [8]:
import xgboost as xgb
# GridSearchCV lives in sklearn.model_selection (already used in this notebook);
# the legacy sklearn.grid_search module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV

# Hyper-parameters explored by the grid search (3 x 3 = 9 combinations).
cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}
# Parameters held fixed for every candidate.
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic'}
# 5-fold cross-validated search scored by ROC AUC, using all available cores.
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'roc_auc', cv = 5, n_jobs = -1) 




In [None]:
# Run the cross-validated grid search on the training split.
optimized_GBM.fit(X_train, y_train)

In [None]:
# `cv_results_` replaced `grid_scores_` (removed in scikit-learn 0.20). Fall
# back to the legacy attribute only when the old GridSearchCV class is in use.
optimized_GBM.cv_results_ if hasattr(optimized_GBM, 'cv_results_') else optimized_GBM.grid_scores_

# Generate Test Result

In [None]:
# Load the test set that the final predictions are generated for.
df_test = pd.read_csv('processed_test.csv')

In [None]:
# Drop the identifier column and reuse the scaler fitted on the training data.
# `axis` is passed by keyword (the positional form is gone in pandas 2.0) and
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
X_test = df_test.drop('id', axis=1).values
X_test_scaled = scaler.transform(X_test)

In [None]:
# Sequential.predict_classes() was removed from Keras; for a single sigmoid
# output it was equivalent to thresholding predict() at 0.5. ravel() flattens
# the (n, 1) output so it can be assigned as a single DataFrame column.
result = (NN_model.predict(X_test_scaled) > 0.5).astype('int32').ravel()

In [None]:
# Attach the predicted class labels to the test frame (adds/overwrites 'stroke' in place).
df_test['stroke'] = result

In [None]:
# Preview the first ten test rows, including the new 'stroke' column.
df_test.head(10)

In [None]:
# Write only the id and predicted stroke columns to result.csv, without the row index.
df_test[['id','stroke']].to_csv('result.csv',index = False)