In [2]:
import pandas as pd
thedata = pd.read_csv("healthdata.csv")

In [8]:
reqdata = thedata.drop('Unnamed: 0', axis=1)

In [24]:
reqdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3278 entries, 0 to 3277
Data columns (total 12 columns):
gender              3278 non-null int64
income              3278 non-null object
education           3278 non-null object
smoker              3278 non-null int64
height              3278 non-null float64
weight              3278 non-null float64
diabetes            3278 non-null int64
heartrate           3278 non-null int64
parents.divorced    3278 non-null int64
cholesterol         3278 non-null float64
high.bp             3278 non-null int64
mile.time           3278 non-null float64
dtypes: float64(4), int64(6), object(2)
memory usage: 307.4+ KB


In [29]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# numeric transformer = standard scaler
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# categorical transformer = one hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [33]:
# differentiate features between numeric and categorical
numeric_features = reqdata.select_dtypes(include=['int64', 'float64']).columns
categorical_features = reqdata.select_dtypes(include=['object']).columns

from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

numeric_features, categorical_features

(Index(['gender', 'smoker', 'height', 'weight', 'diabetes', 'heartrate',
        'parents.divorced', 'cholesterol', 'high.bp', 'mile.time'],
       dtype='object'), Index(['income', 'education'], dtype='object'))

In [31]:
# create the pipeline that combines the preprocessor with a classifier
from sklearn.ensemble import RandomForestClassifier
rf = Pipeline(steps=[('preprocessor', preprocessor),
                    ('classifier', RandomForestClassifier())])

In [41]:
reqsdata = preprocessor.fit_transform(reqdata)
reqsdata.shape, reqdata.shape

((3278, 18), (3278, 12))

In [45]:
reqdata.iloc[:5, :]

Unnamed: 0,gender,income,education,smoker,height,weight,diabetes,heartrate,parents.divorced,cholesterol,high.bp,mile.time
0,0,">$115,000",undergraduate,0,67.874013,172.491886,0,78,1,141.894491,0,748.935877
1,1,"$90,000-$114,999",undergraduate,0,68.818911,203.118004,0,78,1,143.347814,0,644.69054
2,1,"<$40,000",high school,1,63.273148,189.851102,1,77,1,239.342819,1,651.189118
3,1,"$40,000-$64,999",undergraduate,0,66.665357,184.494927,0,71,0,164.545092,0,621.97864
4,1,"$40,000-$64,999",undergraduate,0,70.236517,200.678982,0,73,1,188.914927,0,640.046249


In [46]:
reqdata['income'].value_counts()

$40,000-$64,999     706
$90,000-$114,999    669
>$115,000           645
<$40,000            635
$65,000-$89,999     623
Name: income, dtype: int64

In [47]:
reqdata['education'].value_counts()

undergraduate    1141
graduate         1121
high school      1016
Name: education, dtype: int64

In [70]:
import numpy as np
X = reqsdata[:, 1:]
y = np.ceil(reqsdata[:, 0])

X.shape, y.shape

((3278, 17), (3278,))

In [71]:
# split preporcessed data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2622, 17), (656, 17), (2622,), (656,))

In [72]:
# train the random forest classifier
rfx = RandomForestClassifier()
rfx.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [84]:
# test the random forest classifier
rfx_test = rfx.predict(X_test)

In [85]:
# custom built metrics
sse = 0
ec = 0

for pred, expect in zip(rfx_test, y_test):
    ce = np.sqrt((pred - expect)**2)
    sse += ce
    if ce > 0:
        ec += 1
        print("Mis prediction: ", pred, expect)
print("Total error: ", sse)
print("No. of errors: ", ec)
print("Accuracy: ", ((y_test.shape[0] - ec) / y_test.shape[0]))

Mis prediction:  -1.0 1.0
Mis prediction:  -1.0 1.0
Mis prediction:  -1.0 1.0
Mis prediction:  -1.0 1.0
Mis prediction:  -1.0 1.0
Mis prediction:  -1.0 1.0
Mis prediction:  1.0 -1.0
Mis prediction:  1.0 -1.0
Mis prediction:  1.0 -1.0
Mis prediction:  -1.0 1.0
Mis prediction:  -1.0 1.0
Total error:  22.0
No. of errors:  11
Accuracy:  98.32317073170732


In [96]:
# using scikit learn metrics to measure model accuracy
from sklearn.metrics import accuracy_score, log_loss
import math

sci_accur = accuracy_score(rfx_test, y_test)
sci_logl = log_loss(y_test, rfx_test)
sci_accur, sci_logl, math.exp(-sci_logl), y_test.shape

(0.9832317073170732, 0.5791599681955926, 0.5603688966029817, (656,))

In [111]:
# testing log loss
rander = np.arange(-1.0, 1.0, 1/y_test.shape[0]*2).reshape(y_test.shape)
rander_logl = log_loss(y_test, rander)
rander.shape, rander_logl, y_test.shape, rfx_test.shape

((656,), 9.343655774847901, (656,), (656,))

In [114]:
dotpro = np.dot(y_test, rfx_test)
mody = np.linalg.norm(y_test)
modrfx = np.linalg.norm(rfx_test)
cosi = dotpro / (mody * modrfx)
cosi

0.9664634146341463

In [129]:
# testing model parameters with grid search
param_grid = { 
    'n_estimators': [20, 50],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [2,3,4,5,6],
    'criterion' :['gini', 'entropy']}

from sklearn.model_selection import GridSearchCV
CV = GridSearchCV(rfx, param_grid, n_jobs=4)
                  
CV.fit(X_train, y_train)  
print(CV.best_params_)    
print(CV.best_score_)



{'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 20}
0.9916094584286804


In [119]:
RandomForestClassifier.get_params(rfx).keys()

dict_keys(['bootstrap', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [120]:
RandomForestClassifier.get_params(rfx).values()

dict_values([True, None, 'gini', None, 'auto', None, 0.0, None, 1, 2, 0.0, 10, None, False, None, 0, False])

In [126]:
# creating the "best model" and testing it
rfx2 = RandomForestClassifier(criterion='entropy', max_depth=6, max_features='auto', n_estimators=50)
rfx2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [128]:
b_pred = rfx2.predict(X_test)
b_accu = accuracy_score(b_pred, y_test)
b_logl = log_loss(b_pred, y_test)
b_accu, b_logl

(0.9878048780487805, 0.4212119035729684)