In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import matplotlib.patches as mpatches
import sys
from impyute.imputation.cs import fast_knn
from impyute.imputation.cs import mice
from itertools import product
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelBinarizer
from mlxtend.plotting import plot_decision_regions
from sklearn.feature_selection import f_classif
from sklearn.multiclass import OneVsRestClassifier as ovr

# Tree Visualization
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

# Scores
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Regressors
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Ensemble models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import seaborn as sns

# Ignoring Errors
import warnings
warnings.simplefilter('ignore')

In [51]:
# def find_val_score(classifier, X, y):
#     scores = cross_val_score(estimator=classifier,
#                          X=X,
#                          y=y,
#                          cv=10,
#                          scoring='f1_sample')
#     return (scores.mean(), scores.std())
# Micro-averaged F1 wrapped as a scorer; it is the default `scoring` argument of find_best.
scorer = make_scorer(f1_score, pos_label = None, average = 'micro')
# One candidate value per decade (1e-3 .. 1e2); used as the `C` grid in the LR, SVM and SVR searches.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
def find_best(classifier, X_train, y_train, param_grid, scoring=scorer):
    """Grid-search ``classifier`` over ``param_grid`` using 5-fold cross-validation.

    Parameters
    ----------
    classifier : estimator handed to ``GridSearchCV`` as the model to tune.
    X_train, y_train : data the search is fitted on.
    param_grid : parameter grid forwarded to ``GridSearchCV``.
    scoring : scorer object or scoring name; defaults to the module-level ``scorer``.

    Returns
    -------
    tuple
        ``(best_score_, best_params_, best_estimator_)`` of the fitted search.
    """
    search = GridSearchCV(classifier, param_grid, scoring=scoring, cv=5, n_jobs=-1)
    search.fit(X_train, y_train)
    return search.best_score_, search.best_params_, search.best_estimator_

# Finding roc_curve and Auc score for each classifier
def find_roccurve(classifier,X_train,X_test,y_train,y_test):
    """Fit ``classifier`` on the training split and build its ROC curve on the test split.

    The score fed to the ROC metrics is column 1 of ``predict_proba(X_test)``.

    Returns
    -------
    tuple
        ``(auc, fpr, tpr, thresholds)``.
    """
    classifier.fit(X_train, y_train)
    column_one_scores = classifier.predict_proba(X_test)[:, 1]
    area = roc_auc_score(y_test, column_one_scores)
    false_pos, true_pos, cutoffs = roc_curve(y_test, column_one_scores)
    return (area, false_pos, true_pos, cutoffs)

In [52]:
def find_val_score(classifier, X, y):
    """Estimate the micro-averaged F1 of ``classifier`` with stratified 10-fold CV.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict``. It is fitted in
        place on every fold, so on return it holds the fit from the last fold.
    X, y : arrays indexable by integer index arrays (e.g. NumPy arrays).

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-fold micro-averaged F1 scores.
    """
    # No ``random_state`` is passed: the folds are not shuffled, so a seed has
    # no effect, and recent scikit-learn releases raise ValueError when a seed
    # is combined with shuffle=False. The resulting folds are unchanged.
    kf = StratifiedKFold(n_splits=10)

    fold_scores = []
    for train_index, test_index in kf.split(X, y):
        classifier.fit(X[train_index], y[train_index])
        y_pred = classifier.predict(X[test_index])
        fold_scores.append(
            f1_score(y[test_index], y_pred, pos_label=None, average='micro')
        )

    fold_scores = np.ravel(fold_scores)
    return (fold_scores.mean(), fold_scores.std())

In [47]:
def return_f1_scores(X, y, X_train, y_train):
    """Tune four one-vs-rest classifiers and cross-validate the tuned models.

    Logistic regression, SVM, k-NN and a decision tree are each wrapped in a
    ``StandardScaler`` -> one-vs-rest pipeline, tuned with ``find_best`` on
    ``(X_train, y_train)``, and then scored with ``find_val_score`` on
    ``(X, y)``.

    Returns
    -------
    tuple
        ``(lr_score, svm_score, knn_score, dt_score, estimators)`` where each
        score is the ``(mean, std)`` pair returned by ``find_val_score`` and
        ``estimators`` is the 4-tuple of tuned pipelines in LR, SVM, KNN, DT
        order.
    """
    def _scaled_ovr(base_estimator):
        # Standardise the features, then wrap the estimator in one-vs-rest.
        return Pipeline([['sc', StandardScaler()], ['clf', ovr(base_estimator)]])

    # The solver is pinned to liblinear because the grid searches both 'l1'
    # and 'l2' penalties; the current default solver (lbfgs) rejects 'l1', so
    # those grid points would fail instead of being evaluated.
    pipe_lr = _scaled_ovr(LogisticRegression(random_state=0, solver='liblinear'))
    pipe_svm = _scaled_ovr(SVC(probability=True))
    pipe_knn = _scaled_ovr(KNeighborsClassifier(n_jobs=-1))
    pipe_dt = _scaled_ovr(DecisionTreeClassifier(random_state=0))

    # Grid keys address the estimator wrapped inside the one-vs-rest step.
    searches = [
        (pipe_lr, [{'clf__estimator__C': param_range,
                    'clf__estimator__penalty': ['l1', 'l2']}]),
        (pipe_svm, [{'clf__estimator__C': param_range,
                     'clf__estimator__kernel': ['rbf', 'sigmoid']}]),
        (pipe_knn, [{'clf__estimator__n_neighbors': [5, 10, 30, 50]}]),
        (pipe_dt, [{'clf__estimator__max_depth': [None, 1, 2, 3]}]),
    ]

    # Element [2] of find_best's result is the refitted best estimator.
    best_estimators = tuple(
        find_best(pipe, X_train, y_train, grid)[2] for pipe, grid in searches
    )

    lr_score, svm_score, knn_score, dt_score = [
        find_val_score(estimator, X, y) for estimator in best_estimators
    ]

    return (lr_score, svm_score, knn_score, dt_score, best_estimators)

In [48]:
def plot_baseline_f1_scores(num, save=False):
    """Draw a bar chart of the four classifiers' F1 scores for one entry.

    Reads ``baseline_f1_scores[num]`` from the module namespace; the entry
    must provide ``'score'`` (four items whose first element is the F1 value)
    and ``'name'`` (used in the figure title).

    Parameters
    ----------
    num : key/index into ``baseline_f1_scores``.
    save : falsy to skip saving, otherwise the path passed to ``fig.savefig``.
    """
    per_classifier = baseline_f1_scores[num]['score']
    name = baseline_f1_scores[num]['name']
    per_classifier = [per_classifier[0], per_classifier[1],
                      per_classifier[2], per_classifier[3]]

    classifier_names = ['LR', 'SVM', 'KNN', 'DT']
    palette = ('orange','blue', 'green', 'purple', 'red', 'purple')

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5,5))

    # Scores are shown as percentages rounded to two decimals.
    percentages = [round(item[0]*100,2) for item in per_classifier]
    ax.bar(classifier_names, percentages, align='center', alpha=1.0, color=palette)
    ax.set_yticks(list(range(0,110,10)))

    # One legend patch per classifier, labelled with its score.
    handles = [
        mpatches.Patch(color=shade, label='{}: {}'.format(label, pct))
        for label, shade, pct in zip(classifier_names, palette, percentages)
    ]
    ax.legend(handles=handles,loc='best')
    ax.set_xlabel('Classifiers')
    ax.set_ylabel('F1 Scores')
    ax.title.set_text('Baseline Approach F1-Scores')

    fig.suptitle('Baseline Approach F1-Scores on\n\"{}\" column'.format(name),
                 y=1.03, fontsize=14)
    if save:
        fig.savefig(save, dpi=300, bbox_inches='tight')

In [None]:
def return_roc_curves(classifiers, X_train, X_test, y_train, y_test):
    """Run ``find_roccurve`` for the first four entries of ``classifiers``.

    Returns
    -------
    tuple
        Four ``find_roccurve`` results, in the order of ``classifiers[0..3]``
        (named LR, SVM, KNN, DT in the original code).
    """
    return tuple(
        find_roccurve(classifiers[position], X_train, X_test, y_train, y_test)
        for position in range(4)
    )

In [None]:
def plot_roc_curves(roc_scores, num, save=False):
    """Plot the ROC curves of the four classifiers for one entry of ``roc_scores``.

    ``roc_scores[num]`` must provide ``'score'`` (four items indexed as
    ``[auc, fpr, tpr, ...]``) and ``'name'`` (used in the figure title).

    Parameters
    ----------
    roc_scores : container indexed by ``num``.
    num : key/index of the entry to plot.
    save : falsy to skip saving, otherwise the path passed to ``fig.savefig``.
    """
    entry_scores = roc_scores[num]['score']
    name_ = roc_scores[num]['name']

    curves = [entry_scores[0], entry_scores[1], entry_scores[2], entry_scores[3]]

    classifier_names = ['LR', 'SVM', 'KNN', 'DT']
    line_colors = ('blue', 'green', 'purple', 'red')

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

    for label, shade, curve in zip(classifier_names, line_colors, curves):
        # curve[0] is the AUC, curve[1] / curve[2] are the x / y values of the curve.
        ax.plot(curve[1], curve[2], color=shade,
                label='{}: {}'.format(label, round(curve[0],2)))

    # Dashed diagonal from (0, 0) to (1, 1) for reference.
    ax.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    ax.legend()
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')

    fig.suptitle('Roc Curves of Baseline (15 Features) Approach on\n\"{}\" column'.format(name_),
                 y=1.02, fontsize=14)
    if save:
        fig.savefig(save, dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
def calculate_error_scores(X_train, X_test, y_train, y_test, regressor):
    """Fit ``regressor`` on the training split and score it on the test split.

    Returns
    -------
    tuple
        ``(mae, mse, r2)`` computed from ``y_test`` and the test predictions.
    """
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    return (
        mean_absolute_error(y_test, predictions),
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

In [None]:
def return_error_scores(X_train, X_test, y_train, y_test):
    """Tune and evaluate four scaled regressors on a train/test split.

    Linear regression is used as-is; SVR, k-NN and the decision tree are tuned
    with ``find_best`` (scoring ``'neg_mean_absolute_error'``). Every model is
    then scored with ``calculate_error_scores``.

    Returns
    -------
    tuple
        ``(lr_errors, svr_errors, knn_errors, dt_errors, regressors)`` where
        each ``*_errors`` is the result of ``calculate_error_scores`` and
        ``regressors`` is the 4-tuple of pipelines in the same order.
    """
    def _with_scaler(regressor):
        # The final step keeps the name 'clf' so the grid keys below resolve.
        return Pipeline([['sc', StandardScaler()], ['clf', regressor]])

    pipe_lr_reg = _with_scaler(LinearRegression(n_jobs=-1))

    tunable = [
        (_with_scaler(SVR()),
         [{'clf__C': param_range, 'clf__kernel': ['rbf','sigmoid']}]),
        (_with_scaler(KNeighborsRegressor(n_jobs=-1)),
         [{'clf__n_neighbors': [5, 10, 30, 50]}]),
        (_with_scaler(DecisionTreeRegressor(random_state=0)),
         [{'clf__max_depth': [None, 1, 2, 3]}]),
    ]

    # Element [2] of find_best's result is the refitted best estimator.
    tuned = [
        find_best(pipe, X_train, y_train, grid, scoring='neg_mean_absolute_error')[2]
        for pipe, grid in tunable
    ]

    regressors = (pipe_lr_reg,) + tuple(tuned)
    lr_errors, svr_errors, knn_errors, dt_errors = [
        calculate_error_scores(X_train, X_test, y_train, y_test, model)
        for model in regressors
    ]

    return (lr_errors, svr_errors, knn_errors, dt_errors, regressors)

In [None]:
def plot_error_scores(error_scores, num, title, save=False):
    """Draw a 2x2 grid of bar charts with MAE, MSE and R2 for each regressor.

    ``error_scores[num]`` must provide ``'score'`` (a mapping with keys
    ``'lr'``, ``'svr'``, ``'knn'``, ``'dt'``, each indexable at 0, 1, 2) and
    ``'name'`` (used in the figure title).

    Parameters
    ----------
    error_scores : container indexed by ``num``.
    num : key/index of the entry to plot.
    title : text inserted into the figure title.
    save : falsy to skip saving, otherwise the path passed to ``fig.savefig``.
    """
    raw = error_scores[num]['score']
    name = error_scores[num]['name']
    scores = {key: [raw[key][0], raw[key][1], raw[key][2]]
              for key in ('lr', 'svr', 'knn', 'dt')}

    class_names = {'lr': 'LinearRegression', 'svr': 'SupportVector Regressor',
                   'knn':'KNN Regressor', 'dt':'DecisionTree Regressor'}
    scores_names = ['MAE', 'MSE', 'R2']
    palette = ('orange','blue', 'green', 'purple', 'red', 'purple')

    fig, grid = plt.subplots(nrows=2, ncols=2, figsize=(10,7))

    # One panel per regressor, filled in row-major order.
    for panel, (mthod, values) in zip(np.ravel(grid), scores.items()):
        rounded = [round(value,3) for value in values]
        panel.bar(scores_names, rounded, align='center', alpha=1.0, color=palette)
        panel.set_yticks(list(range(0,4)))
        handles = [
            mpatches.Patch(color=shade, label='{}: {}'.format(metric, shown))
            for metric, shade, shown in zip(scores_names, palette, rounded)
        ]
        panel.legend(handles=handles,loc='best')
        panel.set_xlabel('Error Algorithms')
        panel.set_ylabel("Error Algorithms' Scores")
        panel.title.set_text('MAE, MSE & R2 Scores By {}'.format(class_names[mthod]))

    fig.suptitle('Error Scores of {} For \"{}\" column'.format(title, name),y=1.02, fontsize=16)
    fig.tight_layout()
    if save:
        fig.savefig(save, dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
def return_confusion(classifiers, X_train, X_test, y_train, y_test):
    """Fit every classifier and collect its confusion matrix on the test split.

    Returns
    -------
    list
        One ``confusion_matrix`` result per classifier, in input order.
    """
    def _confusion_for(model):
        # Each model is (re)fitted in place before it predicts.
        model.fit(X_train, y_train)
        return confusion_matrix(y_true=y_test, y_pred=model.predict(X_test))

    return [_confusion_for(model) for model in classifiers]

In [None]:
def plot_confusion(confmats, num, save=False):
    """Plot up to four confusion matrices of one entry in a 2x2 layout.

    ``confmats[num]`` must provide ``'score'`` (a mapping from a classifier
    label to a 2-D matrix with a ``shape``) and ``'name'`` (used in the title).

    Parameters
    ----------
    confmats : container indexed by ``num``.
    num : key/index of the entry to plot.
    save : falsy to skip saving, otherwise the path passed to ``plt.savefig``.
    """
    per_classifier = confmats[num]['score']
    name = confmats[num]['name']

    plt.figure(figsize=(6,6))
    for position, (classifier, confmat) in zip((221, 222, 223, 224), per_classifier.items()):
        ax = plt.subplot(position)
        ax.title.set_text(classifier)

        ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
        # Write every cell's count at the centre of its square.
        for row, col in np.ndindex(confmat.shape[0], confmat.shape[1]):
            ax.text(x=col, y=row, s=confmat[row, col], va='center', ha='center')

        ax.set_xlabel('predicted label')
        ax.set_ylabel('true label')

    plt.suptitle('Confusion Matrix of Baseline (15 Features) Approach on\n\"{}\" column'.format(name), y=1.05, fontsize=16)
    plt.tight_layout()
    if save:
        plt.savefig(save, dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
def tree_visualization(num, dt, features, target, save_name):
    """Fit ``dt`` on the given data and export its ``'clf'`` step with Graphviz.

    Parameters
    ----------
    num : not used by this function.
    dt : fitted in place; must support ``dt['clf']`` to retrieve the tree step.
    features : object with ``.columns`` (feature names) and ``.values`` (data).
    target : targets passed to ``dt.fit``.
    save_name : forwarded to ``export_graphviz`` as ``out_file``.
    """
    column_labels = features.columns
    dt.fit(features.values, target)

    graphviz_options = dict(
        out_file=save_name,
        feature_names=column_labels,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    export_graphviz(dt['clf'], **graphviz_options)

In [45]:
# Load the dataset: column 0 is skipped, the last nine columns are the
# targets, everything in between is a feature.
data = pd.read_excel("NAFLD_en.xlsx")
features_df = data.iloc[:,1:-9]
# Copy so that later column assignments on targets_df do not act on a slice of `data`.
targets_df = data.iloc[:,-9:].copy()

# Collapse target columns 3..6 into one multi-class label: start from
# column 3's values, then rows that are non-zero in column 4, 5 and 6 are
# labelled 2, 3 and 4 respectively (later columns win).
# `.copy()` keeps the relabelling from writing back into targets_df.
combined_fibrosis = targets_df.iloc[:,3].values.copy()
for stage_label, col_pos in enumerate(range(4, 7), start=2):
    stage_present = targets_df.iloc[:, col_pos].values != 0
    combined_fibrosis[stage_present] = stage_label

# `columns=` is used because pandas 2.0 removed the positional `axis` argument of `drop`.
new_targets_df = targets_df.drop(columns=list(targets_df.iloc[:, 3:7].columns))

In [16]:
# Recode the last two target columns from {1, 2} to {0, 1}.
# The values are mapped from the untouched `data` frame rather than from
# `targets_df` itself, so re-running this cell gives the same result instead
# of turning the already recoded 0/1 values into NaN.
for clm in targets_df.columns[-2:]:
    targets_df[clm] = data[clm].map({1:0, 2:1})

In [17]:
# Number of null entries in each feature column, keyed by column name.
missing_val_counts = {
    column: features_df[column].isnull().sum() for column in features_df.columns
}

In [18]:
# Baseline feature set: only the columns that have no missing values at all.
baseline_features = features_df[
    [column for column, n_missing in missing_val_counts.items() if n_missing == 0]
]

In [19]:
# Exclude 'Steatosis' and 'Activity' from the baseline features.
# `columns=` is used because pandas 2.0 removed the positional `axis` argument of `drop`.
dropped_baseline = baseline_features.drop(columns=['Steatosis', 'Activity'])

In [53]:
# Stratified 70/30 split on the combined fibrosis label.
X_train, X_test, y_train, y_test = train_test_split(
    dropped_baseline.values,
    combined_fibrosis,
    test_size=0.3,
    random_state=0,
    stratify=combined_fibrosis,
)

# Hyper-parameters are tuned on the training split; the tuned models are then
# cross-validated on the full data set (which includes the training rows).
f1_scores = return_f1_scores(
    dropped_baseline.values, combined_fibrosis, X_train, y_train
)

In [54]:
# Show the (mean, std) micro-F1 pairs for LR, SVM, KNN and DT; the fifth element (the tuned estimators) is left out.
f1_scores[:4]

((0.3649612937939893, 0.05578263405145976),
 (0.37710961840540846, 0.03805446396650743),
 (0.38427117776215647, 0.05106353901121417),
 (0.36270383677436707, 0.044026929724593565))

In [None]:
# Plot entry 0 of `baseline_f1_scores` and save the figure under the given name.
# NOTE(review): `baseline_f1_scores` is not defined in the visible cells — confirm it is built before this call.
plot_baseline_f1_scores(0, save="baseline_avarage_f1_0")

In [None]:
# Plot entry 1 of `baseline_f1_scores` and save the figure under the given name.
# NOTE(review): `baseline_f1_scores` is not defined in the visible cells — confirm it is built before this call.
plot_baseline_f1_scores(1, save="baseline_avarage_f1_1")