In [123]:
import numpy as np
import pandas as pd
import pandas_profiling
import math
import re
import sklearn
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.ensemble import *
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *
from spellchecker import SpellChecker
from word2number import w2n
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
from datetime import datetime

In [2]:
def is_number(s):
    """ Returns True is string is a number. """
    try:
        float(s)
        return True
    except ValueError:
        return False

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train["stroke_in_2018"]=train["stroke_in_2018"].apply(lambda x: x if x in ["0","1"] else None)

In [4]:
test.shape

(8718, 13)

### Merge Duplicate data in train

In [5]:
def merge(grp):
    df = pd.DataFrame()
    if(grp.shape[0] > 1):
        for c in grp.columns:
            value_counts = grp[c].value_counts().index.astype(grp[c].dtypes)
            if value_counts.size > 1:
                print(grp, value_counts) #Error
            elif value_counts.size == 1:
                df[c] = value_counts[0]
            else:
                df[c] = None
    else:
        df = grp.head(1)
    return df
train = train.groupby("id").apply(merge)

In [6]:
train = train[pd.notnull(train['stroke_in_2018'])] # Remove if target is NULL
y_train = train["stroke_in_2018"]
print("Training ",train.shape)
spell = SpellChecker()

Training  (34595, 14)


In [7]:
ntrain = y_train.shape[0]
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(['stroke_in_2018'], axis=1, inplace=True)
print("all_data size is : {}".format(all_data.shape))

all_data size is : (43313, 13)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [9]:
print(ntrain)

34595


### Data Cleaning

In [10]:
def is_job_status_and_living_area_reversed(x):
    return ((x["job_status"] != None and x["job_status"] in ("r", "c", "city", "remote", "remotee"))
             or (x["living_area"] != None and x["living_area"] in ("private_sector", "business_owner")))
def process_job_status(x):
    if x == None or x in ("nan", 'null', "", 'n.a'):
        return None
    elif x in ("private sector", "privattte", "private", "private_sector"):
        return "private_sector"
    elif x in ("government", "govt."):
        return "government"
    elif x in ("business_owner", "business owner", "biz"):
        return "business_owner"
    elif x in ("parental_leave", "parental leave"):
        return "parental_leave"
    else:
        return x
    
def process_living_area(x):
    if x == None or x in ("nan", 'null', "", 'n.a'):
        return None
    elif x == 'c':
        return 'city'
    elif x in ('r', 'remotee'):
        return 'remote'
    else:
        return x

def split_job_status_and_living_area(x):
    pair = x.lower().split("?") if x != None else [x, x]
    if len(pair) < 2:
        pair = [pair[0], None]
    return pair

def process_job_status_and_living_area(df):
    df["job_status"] = df["job_status and living_area"].astype(str).apply(split_job_status_and_living_area).apply(lambda x: x[0])
    df["living_area"] = df["job_status and living_area"].astype(str).apply(split_job_status_and_living_area).apply(lambda x: x[1])
    job_status = df.apply(lambda x: x["living_area"] if is_job_status_and_living_area_reversed(x) else x["job_status"], 1)
    living_area = df.apply(lambda x: x["job_status"] if is_job_status_and_living_area_reversed(x) else x["living_area"], 1)
    df["job_status"] = job_status.apply(lambda x: process_job_status(x))
    df["living_area"] = living_area.apply(lambda x: process_living_area(x))
    df.drop(columns='job_status and living_area',inplace=True)

In [11]:
process_job_status_and_living_area(all_data)
#process_job_status_and_living_area(test)

In [12]:
def process_smoker_status(x):
    if x == None:
        return None
    elif x.startswith("non"):
        return "non-smoker"
    elif x.startswith("quit"):
        return "quit"
    elif x.startswith("active"):
        return "active_smoker"
    else:
        return None
all_data["smoker_status"] = all_data["smoker_status"].astype(str).apply(process_smoker_status)
#test["smoker_status"] = test["smoker_status"].astype(str).apply(process_smoker_status)

In [13]:
def process_binary_col(df,columns):
    for col in columns:
        df[col] = pd.to_numeric(df[col],errors="coerce")
        df[col] = df[col].astype(int,errors='ignore')
        df[col] = df[col].apply(lambda x: x if x in [0,1] else None)

In [14]:
process_binary_col(all_data,["heart_condition_detected_2017","high_BP","married"])

Convert BMI to numeric

In [15]:
all_data["BMI"] = pd.to_numeric(all_data["BMI"],errors="coerce")

In [16]:
# Process gender into oneof "F", "M" and "OTHER"
def genderSpellingRewrite(gender_str):
    if not isinstance(gender_str, str):
        return None;
    uppered = gender_str.upper()
    # Repeated single occurence should be truncate.
    patternM = re.compile('[M]+$')
    if (patternM.match(uppered)):
        return "M"
    patternF = re.compile('[F]+$')
    if (patternF.match(uppered)):
        return "F"
    # Misspelling should be corrected and replaced.
    # TODO: Malle is not going to be corrected as Male.Need to update spell's known list.
    corrected = spell.correction(uppered).upper()
    if (corrected == "FEMALE"):
        return "F"
    if (corrected == "MALE"):
        return "M"
    if (corrected == "OTHER"):
        return "OTHER"
    return None

In [17]:
# Process human number word into number
def numberConversion(potential_number_word):
    # Correct any possible miss spelled number_word
    corrected_potential_word = spell.correction(potential_number_word)
    # check it it means number
    try:
      potential_num = w2n.word_to_num(corrected_potential_word)
    except ValueError:
        return potential_number_word
    return potential_num

In [18]:
def formatSexAge(origin_str):
    if not isinstance(origin_str, str):
        return [None,None]
    # Preprocess 
    # Entry with missing column.
    if (',' not in origin_str):
        origin_str = origin_str + ',';
    origin_list = origin_str.replace(" ", "").upper().split(",")
    if(origin_list[0].upper() == "NAN"):
        origin_list[0] = ""
    if(origin_list[1].upper() == "NAN"):
        origin_list[1] = ""
    # Convert possible number in entry.
    if((not is_number(origin_list[0])) and (not is_number(origin_list[1]))):
        origin_list[0] = numberConversion(origin_list[0])
        origin_list[1] = numberConversion(origin_list[1])
    genderSet = set(['F', 'M', 'OTHER'])
    if (is_number(origin_list[0])):
        # wrong entry (num, num)
        if (is_number(origin_list[1])):
            if (origin_list[0] == origin_list[1]):
                return [None, origin_list[0]]
            return [None, None]
        else: # first number, second '' or gender (NOT num for sure)
          # swap back number
          origin_list = origin_list[::-1]
          origin_list[0] = genderSpellingRewrite(origin_list[0])
          return origin_list
    else: 
        origin_list[0] = genderSpellingRewrite(origin_list[0])
         # first '' or str, second is number
        if (is_number(origin_list[1])):
            return origin_list
        else:
            origin_list[1] = genderSpellingRewrite(origin_list[1])
            if(origin_list[0] == origin_list[1]):
               origin_list[1] = None
            return origin_list

In [19]:
def process_sex_age_(df):
    df["sex_age_list"] = df["sex and age"].astype(str).apply(lambda x: formatSexAge(x))
    df[['sex','age']] = pd.DataFrame(df["sex_age_list"].values.tolist(), index= df.index)
    df[df["sex"] == "None"]["sex"] = "OTHER"
process_sex_age_(all_data)

Clean Sex and Age

In [20]:
all_data["age"] = pd.to_numeric(all_data["age"],errors="coerce")

In [21]:
def split_train_test(df):
    print("Train shape ",df[:ntrain].shape)
    print("Test shape ",df[ntrain:].shape)
    return df[:ntrain], df[ntrain:]

In [22]:
preprocessed_train, preprocessed_test = split_train_test(all_data)

Train shape  (34595, 17)
Test shape  (8718, 17)


In [23]:
def convert_to_categorical(df, columns):
    for col in columns:
        df[col] = df[col].astype(str)

### Missing Values

In [24]:
def impute_by_mode(df,columns):
    for col in columns:
        df[col] = df[col].fillna(df[col].mode().iloc[0])
def impute_by_median(df,columns):
    for col in columns:
        df[col] = df[col].fillna(df[col].median())

In [25]:
impute_by_mode(all_data,["heart_condition_detected_2017","high_BP","married","job_status"])
impute_by_median(all_data,["average_blood_sugar"])

In [26]:
all_data["sex"] = all_data["sex"].fillna("OTHER")

Impute Age by Median group by sex

In [27]:
df_tmp = preprocessed_train.groupby("sex")["age"].median().reset_index(name="MedianAge")
df_merge = all_data.merge(df_tmp,on="sex",how="left")
cond = df_merge['age'].isnull()
df_merge['age'] = df_merge['age'].fillna(df_merge["MedianAge"])
all_data = df_merge.drop(columns="MedianAge")

Check missing value of smoker_status

In [28]:
convert_to_categorical(all_data,["heart_condition_detected_2017","married","high_BP","job_status",
                                 "sex","living_area"])
preprocessed_train, preprocessed_test = split_train_test(all_data)

Train shape  (34595, 17)
Test shape  (8718, 17)


In [29]:
def predict_missing(df,missing_var,independent_var,model):
    selected_cols = independent_var+[missing_var]
    non_missing_data = df[df[missing_var].notnull()][selected_cols]
    # Remove missing values
    non_missing_data = non_missing_data.dropna()
    print("Training data for missing value ",non_missing_data.shape)
    
    # Build Random Forest classifier
    clf = make_pipeline(OneHotEncoder(handle_unknown="ignore"),model)
    clf.fit(non_missing_data[independent_var],non_missing_data[missing_var])
    acc = np.mean(cross_val_score(clf,non_missing_data[independent_var],non_missing_data[missing_var],cv=5))
    print("Random Forest Mean Accuracy for 5 runs of cross validation ", acc)
    
    # Predict missing values
    cond = df[missing_var].isnull()
    df[cond][missing_var] = clf.predict(df[cond][independent_var])
    return clf

In [30]:
clf = predict_missing(all_data,"smoker_status",["age","sex","married","high_BP","living_area","average_blood_sugar"],
                        RandomForestClassifier(n_estimators=600, max_depth=7))
# clf = rf_predict_missing(all_data,"",["BMI","age","sex","high_BP","married"])

Training data for missing value  (30045, 7)
Random Forest Mean Accuracy for 5 runs of cross validation  0.5331336398924217


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [104]:
#impute_by_median(all_data,["BMI"])

In [31]:
def bmi_category(bmi):
    if pd.isnull(bmi):
        return np.nan
    if bmi > 35:
        return "SO"
    elif bmi > 30:
        return "MO"
    elif bmi > 25:
        return "O"
    elif bmi > 18.5:
        return "N"
    else:
        return "U"

all_data["bmi_category"] = all_data["BMI"].apply(bmi_category)
predict_missing(all_data,"bmi_category", ["job_status", "high_BP", "heart_condition_detected_2017",
                                             "married", "age", "sex", "average_blood_sugar"],AdaBoostClassifier())

Training data for missing value  (41847, 8)
Random Forest Mean Accuracy for 5 runs of cross validation  0.3804090223181906


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Pipeline(memory=None,
         steps=[('onehotencoder',
                 OneHotEncoder(categorical_features=None, categories=None,
                               drop=None, dtype=<class 'numpy.float64'>,
                               handle_unknown='ignore', n_values=None,
                               sparse=True)),
                ('adaboostclassifier',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1.0, n_estimators=50,
                                    random_state=None))],
         verbose=False)

In [32]:
selected_vars = ["sex","age","high_BP","heart_condition_detected_2017",
                "married","job_status","living_area","average_blood_sugar",
                "bmi_category","smoker_status"]
selected_data = all_data[selected_vars]
selected_data = pd.get_dummies(selected_data).reset_index(drop=True)

In [33]:
X_train = selected_data[:ntrain]
X_test = selected_data[ntrain:]
print("Training data ",X_train.shape, " target ",y_train.shape)
print("Testing data", X_test.shape)

Training data  (34595, 27)  target  (34595,)
Testing data (8718, 27)


Explore Data Complexity

In [34]:
def plot2D(X,y):
    df_tsne = pd.DataFrame()
    df_tsne['stroke'] = y
    tsne = TSNE(n_components=2, verbose=1, perplexity=50, n_iter=3000)
    tsne_results = tsne.fit_transform(X)
    df_tsne['tsne_v1'] = tsne_results[:,0]
    df_tsne['tsne_v2'] = tsne_results[:,1]
    # Make the plot
    sns.scatterplot(
        x="tsne_v1", y="tsne_v2",
        hue="stroke",
        palette=sns.hls_palette(2, l=.3, s=.9),
        data=df_tsne,
        legend="full",
        alpha=0.3
    )

In [37]:
profile = all_data.profile_report(title='Medical Record Profiling Report')
profile.to_file(output_file="train_data_summary.html")

In [38]:
y_train.value_counts()

0    33947
1      648
Name: stroke_in_2018, dtype: int64

# MODEL

In [167]:
from sklearn.svm import SVC
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import ClusterCentroids
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier
from sklearn.metrics import roc_auc_score
#plot_2d_space(X_cc, y_cc, 'Cluster Centroids under-sampling')

In [40]:
model = BalancedRandomForestClassifier()

In [42]:
model.fit(X_train, y_train)

BalancedRandomForestClassifier(bootstrap=True, class_weight=None,
                               criterion='gini', max_depth=None,
                               max_features='auto', max_leaf_nodes=None,
                               min_impurity_decrease=0.0, min_samples_leaf=2,
                               min_samples_split=2,
                               min_weight_fraction_leaf=0.0, n_estimators=100,
                               n_jobs=1, oob_score=False, random_state=None,
                               replacement=False, sampling_strategy='auto',
                               verbose=0, warm_start=False)

In [61]:
roc_auc_score(y_train, model.predict_proba(X_train)[:,0])

0.0722314686619338

In [60]:
model.predict_proba(X_train)

array([0.99144791, 0.42049333, 0.65002494, ..., 0.43112784, 0.57973399,
       0.34024432])

In [111]:
cc = ClusterCentroids(ratio={"0": 4000})
X_cc, y_cc = cc.fit_sample(X_train, y_train)

In [115]:
y_cc = y_cc.astype(int)

In [168]:
print('START ML', datetime.now(), )
kfolds = KFold(n_splits=5, shuffle=True, random_state=42)

def grid_search(X_train, y_train, model, param_grid, scoring=["roc_auc"], refit="roc_auc"):
    gs = GridSearchCV(model,param_grid=param_grid,scoring=scoring,refit=refit,cv=kfolds, n_jobs = -1)
    gs.fit(X_train,y_train)
    results = gs.cv_results_
    print(refit," =",gs.best_score_," achieved by configuration : ",gs.best_params_)
    best_idx = np.argwhere(results['rank_test_%s' % refit] == 1)[0,0]
    for scorer in scoring:
        for sample in ['test']:
            sample_score_mean = results['mean_%s_%s' % (sample, scorer)][best_idx]
            print(sample,"_",scorer,":",sample_score_mean)
    return gs

START ML 2019-07-21 07:56:20.072139


In [169]:
model = RUSBoostClassifier()
best_rf = grid_search(X_train, y_train, model, param_grid = {"n_estimators": [100, 200, 500], "replacement": [True, False], "sampling_strategy": ["not majority", "not minority"]})

roc_auc  = 0.8560527105082113  achieved by configuration :  {'n_estimators': 100, 'replacement': False, 'sampling_strategy': 'not majority'}
test _ roc_auc : 0.8560527105082113


In [176]:
y_rf = best_rf.predict_proba(X_test)[:,1]

In [177]:
pd.Series(y_rf).astype(float).value_counts()

0.395174    100
0.394781     75
0.394796     70
0.394404     63
0.394518     42
0.395390     40
0.394526     34
0.394140     34
0.486059     32
0.394904     31
0.397287     31
0.395153     26
0.395012     26
0.485649     26
0.486748     25
0.395917     25
0.486338     25
0.397666     23
0.485971     23
0.487143     23
0.395172     23
0.396295     22
0.394775     22
0.396894     21
0.397273     21
0.486381     20
0.486608     20
0.395550     20
0.486776     19
0.472650     19
           ... 
0.494157      1
0.495334      1
0.473622      1
0.491945      1
0.490656      1
0.481216      1
0.491187      1
0.483377      1
0.492880      1
0.492861      1
0.489489      1
0.394791      1
0.472413      1
0.491138      1
0.483071      1
0.492622      1
0.492027      1
0.486011      1
0.486874      1
0.492177      1
0.488449      1
0.480642      1
0.493410      1
0.495465      1
0.491383      1
0.488434      1
0.489630      1
0.472735      1
0.490461      1
0.497856      1
Length: 4718, dtype: int

In [178]:
df_submit = pd.DataFrame()
df_submit["id"] = test["id"]
df_submit["stroke_in_2018"] = pd.Series(y_rf).astype(float)
df_submit.to_csv("submission.csv",index=False)

In [82]:
model = EasyEnsembleClassifier(n_estimators = 50)
best_ensemble = grid_search(X_train, y_train, model, param_grid = {"base_estimator" : [BaggingClassifier(), AdaBoostClassifier(), ExtraTreesClassifier(), GradientBoostingClassifier(), GradientBoostingClassifier(loss = "exponential"), RandomForestClassifier()] })

roc_auc  = 0.8565332515066911  achieved by configuration :  {'base_estimator': GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)}
test _ roc_auc : 0.8565332515066911


In [87]:
y_ensemble = best_ensemble.predict(X_test)
df_submit = pd.DataFrame()
df_submit["id"] = test["id"]
df_submit["stroke_in_2018"] = pd.Series(y_ensemble)
df_submit.to_csv("submission.csv",index=False)