# DATA MODELING ML PIPELINE

#### Import Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

#### Read & Store Data

In [2]:
# Load the cleaned dataset; the first CSV column is used as the row index.
df_clean = pd.read_csv('DiabetesCleaned_2018.csv', index_col=0) # Read csv file

#### Copy Data

In [3]:
# Work on a copy so the loaded frame `df_clean` is never modified by the modelling steps.
df_model = df_clean.copy()

#### Sample Data

In [4]:
# Split the cleaned data by outcome class.
# The boolean mask is built from the same frame that is indexed (`df_clean`), so this
# cell stays valid when re-run after `df_model` has been rebound to the 60,000-row
# sample further down; a mask taken from a different-length frame cannot be aligned.
all_ones  = df_clean.loc[df_clean['DIABETES'] == 1]
all_zeros = df_clean.loc[df_clean['DIABETES'] == 0]

print("{} is the length of all_ones".format(len(all_ones)))
print("{} is the length of all_zeros".format(len(all_zeros)))

52851 is the length of all_ones
336717 is the length of all_zeros


In [5]:
# Draw the same number of rows from each class so the modelling sample is balanced.
# A fixed random_state makes the draw (and every result derived from it) reproducible
# under Restart Kernel -> Run All.
sampled_ones = all_ones.sample(n=30000, random_state=1234)
samples_zeros = all_zeros.sample(n=30000, random_state=1234)

print("{} is the length of sampled_ones".format(len(sampled_ones)))
print("{} is the length of samples_zeros".format(len(samples_zeros)))

30000 is the length of sampled_ones
30000 is the length of samples_zeros


In [6]:
# Stack the two class samples, then shuffle the rows so the classes are interleaved.
# The shuffle is seeded so the row order is identical on every run.
df_model = pd.concat((sampled_ones, samples_zeros), axis=0)
df_model = df_model.sample(frac=1, random_state=1234)

In [7]:
# Summarise the sampled frame: row count, column dtypes and non-null counts.
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60000 entries, 283235 to 133961
Data columns (total 19 columns):
EXERCISE                60000 non-null float64
HEARTATTACK             60000 non-null float64
CORONARYHEARTDISEASE    60000 non-null float64
STROKE                  60000 non-null float64
ASTHMA                  60000 non-null float64
SKINCANCER              60000 non-null float64
OTHERCANCER             60000 non-null float64
CHRONICBRONCHITIS       60000 non-null float64
ARTHRITIS               60000 non-null float64
DEPRESSIVEDISORDER      60000 non-null float64
KIDNEYDISEASE           60000 non-null float64
DIABETES                60000 non-null float64
SLEEPTIME_GROUP         60000 non-null object
SEX_GROUP               60000 non-null object
WEIGHT_KILOGRAM         60000 non-null float64
HEIGHT_METER            60000 non-null float64
BMI_GROUP               60000 non-null object
RACE_GROUP              60000 non-null object
AGE_GROUP               60000 non-null obje

#### Split Data (Independent Variables & Dependent Variable)

In [8]:
# Name of the target column; every other column is treated as a feature.
dependentVar = 'DIABETES'

# Features: the frame without the target column. Target: a plain NumPy array.
X = df_model.drop(columns=[dependentVar])
y = df_model[dependentVar].to_numpy()

for label, shape in (
    ("Number of observations and dimensions in 'X':", X.shape),
    ("Number of observations in 'y':", y.shape),
):
    print(label, shape)

Number of observations and dimensions in 'X': (60000, 18)
Number of observations in 'y': (60000,)


In [9]:
# Print a text preview of the feature frame followed by the target array.
print(X, '\n')
print(y)

        EXERCISE  HEARTATTACK  CORONARYHEARTDISEASE  STROKE  ASTHMA  \
283235       0.0          0.0                   0.0     0.0     0.0   
265051       0.0          1.0                   0.0     0.0     1.0   
374463       0.0          0.0                   0.0     0.0     0.0   
156849       1.0          0.0                   0.0     0.0     0.0   
32830        1.0          0.0                   1.0     0.0     0.0   
...          ...          ...                   ...     ...     ...   
143113       1.0          0.0                   0.0     0.0     0.0   
399750       1.0          0.0                   0.0     0.0     0.0   
250087       1.0          0.0                   0.0     0.0     0.0   
414675       1.0          0.0                   0.0     0.0     0.0   
133961       1.0          1.0                   1.0     0.0     0.0   

        SKINCANCER  OTHERCANCER  CHRONICBRONCHITIS  ARTHRITIS  \
283235         0.0          1.0                0.0        0.0   
265051         0.

#### Split Data (Train & Test)

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the split itself is seeded (random_state=0).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Report the shape of each of the four resulting pieces.
for label, piece in (
    ("Number of observations and dimensions in training set:", X_train),
    ("Number of observations and dimensions in test set:", X_test),
    ("Number of observations in training set:", y_train),
    ("Number of observations in test set:", y_test),
):
    print(label, piece.shape)

Number of observations and dimensions in training set: (45000, 18)
Number of observations and dimensions in test set: (15000, 18)
Number of observations in training set: (45000,)
Number of observations in test set: (15000,)


#### Transform Data

In [11]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

In [12]:
# Columns holding string categories; every remaining feature column is treated as numeric.
categorical_features = ['SLEEPTIME_GROUP', 'SEX_GROUP', 'BMI_GROUP', 'RACE_GROUP', 'AGE_GROUP']
numerical_features   = [col for col in X.columns if col not in categorical_features]

# One-hot encode the categorical columns and standardise the numeric ones.
# - handle_unknown='ignore': a category that is absent from a fitting fold is encoded as
#   all zeros at transform time instead of raising during cross-validation or scoring.
# - sparse_threshold=0: always return a dense array, so the PCA step that follows in
#   every pipeline never receives a sparse matrix.
preprocessor         = ColumnTransformer(
                        transformers=[('onehot-encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
                                      ('scalar', StandardScaler(), numerical_features)],
                        sparse_threshold=0)

#### Pipeline Creation

In [13]:
from sklearn.linear_model import LogisticRegression

# Preprocess -> project onto 2 principal components -> class-weighted logistic regression.
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=2)),
    ('lr_classifier', LogisticRegression(class_weight='balanced', random_state=1234)),
])

In [14]:
from sklearn.neighbors import KNeighborsClassifier

# Preprocess -> project onto 2 principal components -> k-nearest neighbours (library defaults).
pipeline_knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=2)),
    ('knn_classifier', KNeighborsClassifier()),
])

In [15]:
from sklearn.svm import SVC

# Preprocess -> project onto 2 principal components -> class-weighted support vector classifier.
pipeline_svc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=2)),
    ('svc_classifier', SVC(class_weight='balanced', random_state=1234)),
])

In [16]:
from sklearn.naive_bayes import GaussianNB

# Preprocess -> project onto 2 principal components -> Gaussian naive Bayes (library defaults).
pipeline_nb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=2)),
    ('nb_classifier', GaussianNB()),
])

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Preprocess -> project onto 2 principal components -> class-weighted decision tree.
pipeline_dt = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=2)),
    ('dt_classifier', DecisionTreeClassifier(class_weight='balanced', random_state=1234)),
])

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Preprocess -> project onto 2 principal components -> class-weighted random forest.
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=2)),
    ('rf_classifier', RandomForestClassifier(class_weight='balanced', random_state=1234)),
])

In [19]:
from xgboost import XGBClassifier

# Preprocess -> project onto 2 principal components -> gradient-boosted trees.
# XGBClassifier has no `class_weight` parameter (class imbalance is handled with
# `scale_pos_weight`), so the former `class_weight='balanced'` argument had no effect
# on training. The modelling sample is drawn 30,000 / 30,000 per class, so no
# re-weighting is needed and the argument is dropped.
pipeline_xgb = Pipeline([('preprocessor', preprocessor),
                         ('pca', PCA(n_components=2)),
                         ('xgb_classifier', XGBClassifier(random_state=1234))])

In [20]:
from sklearn.ensemble import AdaBoostClassifier

# Preprocess -> project onto 2 principal components -> AdaBoost ensemble.
pipeline_adb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=2)),
    ('adb_classifier', AdaBoostClassifier(random_state=1234)),
])

#### Build Pipeline on Training Data

In [21]:
# Baseline pipelines; the position in this list is the key used to look up the
# display name in `pipe_dict`.
pipelines = [
    pipeline_lr,
    pipeline_knn,
    pipeline_svc,
    pipeline_nb,
    pipeline_dt,
    pipeline_rf,
    pipeline_xgb,
    pipeline_adb,
]

In [22]:
# Display name for each baseline pipeline, keyed by its position in `pipelines`.
pipe_dict = dict(enumerate([
    'Logistic Regression',
    'K-Nearest Neighbors',
    'Kernal Support Vector Machine',
    'Naive Bayes',
    'Decision Tree',
    'Random Forest',
    'XGBoost',
    'AdaBoost',
]))

In [23]:
# Fit every baseline pipeline, in list order, on the training split.
for pipe in pipelines:
    pipe.fit(X_train, y_train)



#### Predict Pipeline on Testing Data

In [24]:
# Print the test-split score of each fitted baseline pipeline next to its display name.
for i, model in enumerate(pipelines):
    test_accuracy = model.score(X_test, y_test)
    print("{0:s} Test Accuracy: {1:.3f}".format(pipe_dict[i], test_accuracy))

Logistic Regression Test Accuracy: 0.666
K-Nearest Neighbors Test Accuracy: 0.632
Kernal Support Vector Machine Test Accuracy: 0.667
Naive Bayes Test Accuracy: 0.644
Decision Tree Test Accuracy: 0.600
Random Forest Test Accuracy: 0.623
XGBoost Test Accuracy: 0.676
AdaBoost Test Accuracy: 0.673


In [25]:
# Pick the baseline pipeline with the highest test-split score.
# On ties the earliest pipeline in `pipelines` wins (strict `>` comparison).
best_accuracy   = 0
best_classifier = 0
best_pipeline   = None   # becomes the winning Pipeline object

for i, model in enumerate(pipelines):
    # Score once per model: predicting on the whole test split is the expensive part,
    # so the value is reused rather than recomputed for the winner.
    accuracy = model.score(X_test, y_test)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pipeline = model
        best_classifier = i
print('Classifier with best accuracy: {}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy: XGBoost


#### Hyperparameter Tuning

In [26]:
# Search space for the logistic-regression pipeline; keys are '<step name>__<parameter>'.
param_grid_lr = {
    # Number of iterations
    'lr_classifier__max_iter': [100, 200, 500, 1000],
    # Algorithm to use for optimization
    'lr_classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    # Troubleshoot unbalanced data sampling
    'lr_classifier__class_weight': ['balanced'],
}

# Exhaustive search with 3-fold cross-validation.
gdLR = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=param_grid_lr,
    cv=3,
    n_jobs=-1,
    verbose=True,
)

In [27]:
# Search space for the k-nearest-neighbours pipeline; keys are '<step name>__<parameter>'.
param_grid_knn = {
    # Number of K
    'knn_classifier__n_neighbors': [5, 10, 15],
    # Algorithm to compute nearest neighbors (currently not searched)
    #'knn_classifier__algorithm': ['ball_tree', 'kd_tree', 'brute', 'auto'],
    # Algorithm to find the distance
    'knn_classifier__metric': ['minkowski', 'euclidean', 'manhattan', 'chebyshev'],
}

# Exhaustive search with 3-fold cross-validation.
gdKNN = GridSearchCV(
    estimator=pipeline_knn,
    param_grid=param_grid_knn,
    cv=3,
    n_jobs=-1,
    verbose=True,
)

In [28]:
# Search space for the support-vector pipeline; keys are '<step name>__<parameter>'.
param_grid_svc = {
    # Regularization parameter
    'svc_classifier__C': [0.1, 1, 10, 100],
    # Kernel coef for ‘rbf’, ‘poly’ and ‘sigmoid’
    'svc_classifier__gamma': [1, 0.1, 0.01, 0.001],
    # Kernel type to be used in the algorithm (currently not searched)
    #'svc_classifier__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    # Degree for ‘poly’ (currently not searched)
    #'svc_classifier__degree': [1, 2, 3, 4, 5, 6],
    # Troubleshoot unbalanced data sampling
    'svc_classifier__class_weight': ['balanced'],
}

# Exhaustive search; note this one uses 2 folds where the other searches use 3
# (presumably to limit SVC run time -- confirm).
gdSVC = GridSearchCV(
    estimator=pipeline_svc,
    param_grid=param_grid_svc,
    cv=2,
    n_jobs=-1,
    verbose=True,
)

In [29]:
# Search space for the decision-tree pipeline; keys are '<step name>__<parameter>'.
param_grid_dt = {
    # Measure quality of split
    'dt_classifier__criterion': ['gini', 'entropy'],
    # Strategy used to choose split at each node
    'dt_classifier__splitter': ['best', 'random'],
    # Maximum depth of the tree
    'dt_classifier__max_depth': [2, 4, 6, 8, 10],
    # Troubleshoot unbalanced data sampling
    'dt_classifier__class_weight': ['balanced'],
}

# Exhaustive search with 3-fold cross-validation.
gdDT = GridSearchCV(
    estimator=pipeline_dt,
    param_grid=param_grid_dt,
    cv=3,
    n_jobs=-1,
    verbose=True,
)

In [30]:
# Search space for the random-forest pipeline; keys are '<step name>__<parameter>'.
param_grid_rf = {
    # Measure quality of split
    'rf_classifier__criterion': ['gini', 'entropy'],
    # Maximum depth of the tree
    'rf_classifier__max_depth': [2, 3, 5, 9],
    # Number of trees in the forest
    'rf_classifier__n_estimators': [100, 200, 300, 1000],
    # Troubleshoot unbalanced data sampling
    'rf_classifier__class_weight': ['balanced'],
}

# Exhaustive search with 3-fold cross-validation.
gdRF = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    cv=3,
    n_jobs=-1,
    verbose=True,
)

In [31]:
# Search space for the XGBoost pipeline; keys are '<step name>__<parameter>'.
# `class_weight` is intentionally absent: XGBClassifier has no such parameter (class
# imbalance is handled with `scale_pos_weight`), so listing it here tuned nothing.
# The grid still has 4 * 5 * 4 = 80 candidates.
param_grid_xgb = {
    'xgb_classifier__max_depth': [2, 3, 5, 9],                   # Maximum depth of a tree
    'xgb_classifier__n_estimators': [50, 100, 200, 300, 1000],   # Maximum number of estimators at which boosting is terminated
    'xgb_classifier__learning_rate': [1, 0.1, 0.01, 0.001]       # Step size shrinkage used in update to prevents overfitting range: [0,1]
}

# Exhaustive search with 3-fold cross-validation.
gdXGB = GridSearchCV(estimator=pipeline_xgb, param_grid=param_grid_xgb, cv=3, n_jobs=-1, verbose=True)

In [32]:
# Search space for the AdaBoost pipeline; keys are '<step name>__<parameter>'.
param_grid_adb = {
    # Maximum number of estimators at which boosting is terminated
    'adb_classifier__n_estimators': [50, 100, 200, 300, 1000],
    # Step size shrinkage used in update to prevents overfitting range: [0,1]
    'adb_classifier__learning_rate': [1, 0.1, 0.01, 0.001],
}

# Exhaustive search with 3-fold cross-validation.
gdADB = GridSearchCV(
    estimator=pipeline_adb,
    param_grid=param_grid_adb,
    cv=3,
    n_jobs=-1,
    verbose=True,
)

#### Build Pipeline Grid Search on Training Data

In [33]:
# Grid searches to run; the position in this list is the key used to look up the
# display name in `pipe_grid_dict`. There is no Naive Bayes entry here.
pipelines_grid = [
    gdLR,
    gdKNN,
    gdSVC,
    gdDT,
    gdRF,
    gdXGB,
    gdADB,
]

In [34]:
# Display name for each grid search, keyed by its position in `pipelines_grid`.
pipe_grid_dict = dict(enumerate([
    'Logistic Regression',
    'K-Nearest Neighbors',
    'Kernal Support Vector Machine',
    'Decision Tree',
    'Random Forest',
    'XGBoost',
    'AdaBoost',
]))

In [35]:
# Run every grid search, in list order, on the training split.
for pipe in pipelines_grid:
    pipe.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    5.0s finished


Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    5.7s finished


Fitting 2 folds for each of 16 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:  5.1min finished


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.5s finished


Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   31.8s
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:  2.4min finished


Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  5.4min finished


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.2min finished


#### Predict Pipeline Grid Search on Testing Data

In [36]:
# Print the test-split score of each fitted grid search next to its display name.
for i, model in enumerate(pipelines_grid):
    test_accuracy = model.score(X_test, y_test)
    print("{0:s} Test Accuracy: {1:.3f}".format(pipe_grid_dict[i], test_accuracy))

Logistic Regression Test Accuracy: 0.666
K-Nearest Neighbors Test Accuracy: 0.657
Kernal Support Vector Machine Test Accuracy: 0.670
Decision Tree Test Accuracy: 0.674
Random Forest Test Accuracy: 0.676
XGBoost Test Accuracy: 0.676
AdaBoost Test Accuracy: 0.674


In [37]:
# Pick the tuned model with the highest test-split score.
# This must iterate the grid searches (`pipelines_grid`) and use their name table
# (`pipe_grid_dict`); iterating the untuned `pipelines` / `pipe_dict` would simply
# repeat the pre-tuning result. On ties the earliest entry wins (strict `>`).
best_accuracy   = 0
best_classifier = 0
best_pipeline   = None   # becomes the winning fitted grid search

for i, model in enumerate(pipelines_grid):
    # Score once per model and reuse the value instead of recomputing it for the winner.
    accuracy = model.score(X_test, y_test)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pipeline = model
        best_classifier = i
print('Classifier with best accuracy: {}'.format(pipe_grid_dict[best_classifier]))

Classifier with best accuracy: XGBoost


##### End of document.