# AnalyticsToolkit example for binary classification

In [1]:
import os
# Show the current working directory: the relative paths used further down
# ('data.csv', 'codebook.xlsx') are resolved against it.
os.getcwd()

'C:\\Users\\jb2428\\Desktop\\python\\AnalyticsToolkit\\heart_disease'

In [7]:
import pandas as pd
import pickle as pk
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Fall back to
# the old module only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

In [13]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before each cell execution.
%load_ext autoreload
%autoreload 2

In [14]:
import os
import sys

# Locate the toolkit relative to the notebook's working directory
# (<repo>/heart_disease -> <repo>/analyticstoolkit) instead of hard-coding one
# user's absolute Windows path, so the notebook runs from any checkout.
toolkit_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'analyticstoolkit'))
if toolkit_dir not in sys.path:  # do not pile up duplicates when the cell is re-run
    sys.path.append(toolkit_dir)
import analytic_toolkit as atk

### Read main data set

In [15]:
# This analysis reads one input file only;
# aggregate_tables_ex.py covers the multi-file case and more data preprocessing.
df = pd.read_csv(filepath_or_buffer=r'data.csv')
df.head(3)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,OUTCOME
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1


#### Print codebook

In [25]:
# Excel codebook: one row per variable with its type mapping
# (u=continuous, b=binary, g=categorical) and a predictor flag.
f_codebook = r'codebook.xlsx'
pd.read_excel(io=f_codebook)

Unnamed: 0,Variable Name,File,"Mapping (u=continuous, b=binary, g=categorical)",Predictor,Description,Data Dictionary Document
0,age,data.csv,u,1.0,,
1,sex,data.csv,b,,,
2,cp,data.csv,g,1.0,chest pain type,
3,trestbps,data.csv,u,1.0,resting blood pressure,
4,chol,data.csv,u,1.0,,
5,fbs,data.csv,b,,,
6,restecg,data.csv,g,,,
7,thalach,data.csv,u,1.0,,
8,exang,data.csv,b,1.0,,
9,oldpeak,data.csv,u,1.0,,


#### Create reference standard

In [30]:
     
# Ask the toolkit for the columns to keep and the transformation rules,
# both derived from the codebook file.
include_cols, tr = atk.get_transformation_rules(f_codebook)

# Feature matrix: only the columns selected via the codebook.
X = df.loc[:, include_cols]
# Binary target: any OUTCOME value above 0 counts as the positive class.
y = df.loc[:, 'OUTCOME'].gt(0)

In [32]:
# Plain-text preview of the first three rows of the features and of the target.
print(X.head(n=3), y.head(n=3), sep='\n')

    age   cp  trestbps   chol  thalach  exang  oldpeak  slope
0  63.0  1.0     145.0  233.0    150.0    0.0      2.3    3.0
1  67.0  4.0     160.0  286.0    108.0    1.0      1.5    2.0
2  67.0  4.0     120.0  229.0    129.0    1.0      2.6    2.0
0    False
1     True
2     True
Name: OUTCOME, dtype: bool


In [None]:
# Optional step (currently disabled): remove frequently missing columns
# X.dropna(thresh=len(X) * 0.6, inplace=True, axis=1)  # filter out columns with more than 40% missing


## Train/Test Split

In [34]:
# Split data (stratified: both partitions keep the class balance of y)
train_size = 0.25 # proportion of training samples; the remaining samples form the test set
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    train_size=train_size, 
                                                    stratify=y, 
                                                    random_state=1)  # fixed seed -> reproducible split

# Persist the split together with the transformation rules.
project_reference_standard_file = 'project_train_test_split.data'
# Context manager guarantees the pickle file is flushed and closed even if
# dumping fails (the previous bare open() leaked the file handle).
with open(project_reference_standard_file, 'wb') as f_out:
    pk.dump((X_train, X_test, y_train, y_test, tr), f_out)

## Data summary
print('Train count:\n', y_train.value_counts())
print('Test count:\n', y_test.value_counts())

# generate_file_summary(X_train.reset_index(),'project_train_set.xlsx')   # see aggregate_tables.py

Train count:
 False    41
True     34
Name: OUTCOME, dtype: int64
Test count:
 False    123
True     105
Name: OUTCOME, dtype: int64


## Define Meta Models

In [38]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Not configured yet: XGBoost, DeepNets

# Each meta model is described by three keys:
#   'id'              - label for the model
#   'model'           - estimator class (not an instance)
#   'hyperparameters' - candidate values per hyperparameter
metamodels = [
    {
        'id': 'SGDClassifier',
        'model': SGDClassifier,
        'hyperparameters': {
            'loss': ['log'],
            'penalty': ['elasticnet'],
            'alpha': [1e-2, 1e-1, 1, 1e1, 1e2],
            'l1_ratio': [0.25, 0.5, 0.75],
            'class_weight': ['balanced'],
        },
    },
    {
        'id': 'Random Forest',
        'model': RandomForestClassifier,
        'hyperparameters': {
            'n_estimators': [10, 100],
            'criterion': ['gini', 'entropy'],
            'class_weight': [None, 'balanced'],
            'max_depth': [None, 5],
        },
    },
]

## Optimize Hyperparameters

In [39]:
# Cross-validate the meta models on the training data only, using 5 folds.
# NOTE(review): results are presumably written to the f_validate file, which
# the later cells pass back to the toolkit - confirm in analytic_toolkit.
atk.metamodels_cross_validate(
    X_train,
    y_train,
    transformation_rules=tr,
    metamodels=metamodels,
    kfolds=5,
    f_validate='metamodels_cross_validate_results.data',
    verbose=True,
)

Training SGDClassifier --------------------
Fold  0...............1...............2...............3...............4...............
    auc_mean  auc_percentile_50   auc_std
6   0.500000           0.500000  0.000000
7   0.500000           0.500000  0.000000
8   0.500000           0.500000  0.000000
9   0.500000           0.500000  0.000000
10  0.500000           0.500000  0.000000
11  0.500000           0.500000  0.000000
12  0.500000           0.500000  0.000000
13  0.500000           0.500000  0.000000
14  0.500000           0.500000  0.000000
0   0.803968           0.803571  0.115362
1   0.809722           0.839286  0.114899
2   0.809921           0.821429  0.112017
5   0.819643           0.821429  0.120374
4   0.821825           0.821429  0.113567
3   0.829365           0.821429  0.097501
Best: {'alpha': 0.1, 'class_weight': 'balanced', 'loss': 'log', 'penalty': 'elasticnet', 'l1_ratio': 0.25}
      auc_mean             0.829365
auc_percentile_50    0.821429
auc_std              0.0

## Evaluate Optimal Models on Test Set

In [40]:
# Hand the train and test partitions plus both result files to the toolkit;
# the captured output reports the chosen hyperparameters and the AUC per model.
# NOTE(review): f_validate looks like an input and f_fit_models like an output
# of this step - confirm in analytic_toolkit.
atk.fit_optimal_model_to_training_data(
    X_train,
    y_train,
    X_test,
    y_test,
    f_validate='metamodels_cross_validate_results.data',
    f_fit_models='fit_models.data',
)

Testing SGDClassifier --------------------
Hyperparameters:
    alpha : 0.1
    class_weight : balanced
    loss : log
    penalty : elasticnet
    l1_ratio : 0.25
>> AUC: 0.847
Testing Random Forest --------------------
Hyperparameters:
    class_weight : None
    max_depth : None
    n_estimators : 10
    criterion : entropy
>> AUC: 0.807


## Write evaluation summaries

In [41]:
# Summarise test-set performance for the models referenced by the two result files.
# NOTE(review): both files are presumably read back here rather than rewritten -
# confirm in analytic_toolkit.
atk.summarize_test_results(
    X_test,
    y_test,
    f_validate='metamodels_cross_validate_results.data',
    f_fit_models='fit_models.data',
)

Testing SGDClassifier --------------------
Hyperparameters:
    alpha : 0.1
    class_weight : balanced
    loss : log
    penalty : elasticnet
    l1_ratio : 0.25
>> AUC: 0.847
Testing Random Forest --------------------
Hyperparameters:
    class_weight : None
    max_depth : None
    n_estimators : 10
    criterion : entropy
>> AUC: 0.807
