In [None]:
########## Machine Learning (mostly sklearn) tutorial for mhealth24 ##########

########## What will we talk about?
# commonly used (and useful) classifiers
# how (not) to evaluate your classifier
# imbalanced data

########## What will we not talk about?
# EDA (Exploratory data analysis) is the beginning of any analysis. You have to understand the data you're dealing with to develop a successful strategy. Usually, this means: plotting, plotting, plotting...

########## How does this relate to exercise 2 of mhealth24?
# Similar to exercise 1, exercise 2 does not require you to be an ML expert.
# In fact, in large parts exercise 2 requires you to gain a good understanding of the problem and provided data and then develop a logic.
# However, as also indicated by the reading assignments, there are some parts where you'll find that ML classifiers might become very helpful.
# Also, as for all modeling problems, you'll have to think about how you can assess your performance.

########## Useful overview of classifiers
# https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [None]:
########## standard imports
import numpy as np
import pandas as pd

# first, we'll be using the wine dataset as a toy example (https://archive.ics.uci.edu/ml/datasets/wine)
from sklearn.datasets import load_wine

# followed by the breast cancer dataset (https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic)
from sklearn.datasets import load_breast_cancer

# we'll evaluate a decision tree, a random forest, a support vector machine and a k-nearest neighbour classifier
from sklearn.tree import DecisionTreeClassifier # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
from sklearn.tree import plot_tree # to visualize the decision tree
from sklearn.ensemble import RandomForestClassifier # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.svm import SVC # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.neighbors import KNeighborsClassifier # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

# and also boosted trees
import xgboost as xgb # https://xgboost.readthedocs.io/en/stable/index.html

# to create a test and train split
from sklearn.model_selection import train_test_split

# evaluation metrics: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# for plotting
import matplotlib.pyplot as plt

In [None]:
########## The wine dataset
# Load the toy dataset and wrap features and labels in pandas objects.
wine = load_wine()

# features: one row per wine, one named column per feature
X = pd.DataFrame(wine['data'], columns = wine['feature_names'])

# target: who cultivated the wine? (single column called 'label')
y = pd.DataFrame(wine['target'], columns = ['label'])

# quick look: summary statistics, then the first rows of features and labels,
# each section introduced by a separator line
separator = '#' * 60
for overview in (X.describe(), X.head(), y.head()):
    print(separator, end = '\n\n')
    print(overview)
print(separator, end = '\n\n')

# how many samples are there per label?
y.hist()

In [None]:
########## Classifiers with sklearn: Decision Tree
# DT: quick fit of a deliberately shallow tree (max_depth = 2, everything else default)
# on the complete dataset, so that the plot below stays readable
clf = DecisionTreeClassifier(max_depth = 2)
clf.fit(X, y)

# visualizing the decision tree
# feature_names is handed over as a plain list: some scikit-learn releases validate
# this parameter strictly and reject a pandas Index such as X.columns
plt.figure(figsize=(10,5))
plot_tree(clf, feature_names = list(X.columns), max_depth = 2)
plt.show()

In [None]:
# prediction: let the current classifier label every sample in X
y_preds = clf.predict(X)
print(y_preds)

In [None]:
######## performance assessment
# How well are we doing? Compare the predictions with the true labels:
# confusion matrix first, then the accuracy rounded to two decimals.
cm = confusion_matrix(y, y_preds)
acc = accuracy_score(y, y_preds)

print(cm)
print()
print(np.round(acc, 2))

In [None]:
# Can we improve this choosing different hyper parameters?
# Hyper-parameter tuning: refit the tree for max_depth = 1..9 and score every
# tree on the very data it was fitted on
for md in range(1,10):
    clf = DecisionTreeClassifier(max_depth = md).fit(X, y)
    y_preds = clf.predict(X)
    acc = np.round(accuracy_score(y, y_preds), 2)
    print(f"{md}: {acc} ")

In [None]:
# Some more metrics
clf = DecisionTreeClassifier(max_depth = 3)
clf.fit(X,y)
y_preds = clf.predict(X)

# confusion matrix as a plot, labelled with the class names of the dataset
cm_display = ConfusionMatrixDisplay(confusion_matrix = confusion_matrix(y, y_preds), display_labels = wine['target_names'])
cm_display.plot()
plt.show()

# For each metric: the weighted average first, then the per-class values.
#   precision = tp / (tp + fp)
#   recall    = tp / (tp + fn)
#   f1        = 2 * (precision * recall) / (precision + recall)
for header, metric in (('precision: {}', precision_score), ('recall {}', recall_score), ('f1 {}', f1_score)):
    print(header.format(np.round(metric(y, y_preds, average = 'weighted'), 3)))
    print(np.round(metric(y, y_preds, average = None), 3), end = '\n\n')

acc = accuracy_score(y, y_preds)
balanced_acc = balanced_accuracy_score(y, y_preds)
print('accuracy {}'.format(np.round(acc, 3)))
print('balanced accuracy {}'.format(np.round(balanced_acc, 3)))

In [None]:
# How well are we actually doing?
# Train and test split: 20% of the samples are held out for testing.
# random_state makes the split (and everything evaluated on it) reproducible,
# stratify = y keeps the class proportions the same in the train and the test part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 42, stratify = y)

In [None]:
# Generalization error: (how much) are we overfitting?
# A tree with default parameters is fitted on the train split only and then
# scored on both the train and the test split.
clf = DecisionTreeClassifier().fit(X_train, y_train)

train_preds = clf.predict(X_train)
test_preds = clf.predict(X_test)

print("train accuracy:")
print(confusion_matrix(y_train, train_preds))
print(accuracy_score(y_train, train_preds))

print()
print('#' * 20)

print("test accuracy:")
print(confusion_matrix(y_test, test_preds))
print(accuracy_score(y_test, test_preds))

In [None]:
######## Cross validating various classifiers
# cross validation: DT
# common parameter(s) to tune: max_depth
# For every max_depth the tree is fitted and scored on 100 random 80/20 splits.
# Printed per max_depth: "<max_depth>:  <mean train accuracy> - <mean test accuracy>"
for md in np.arange(10)+1:
    train_accs = list()
    test_accs = list()
    for i in range(100):
        # Seeding with the repetition index makes the run reproducible and scores
        # every max_depth on the same 100 splits, so differences between the rows
        # come from the hyper-parameter and not from the luck of the split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = i)

        clf = DecisionTreeClassifier(max_depth = md, random_state = i)
        clf.fit(X_train, y_train)

        train_preds = clf.predict(X_train)
        test_preds = clf.predict(X_test)

        train_accs.append(accuracy_score(y_train, train_preds))
        test_accs.append(accuracy_score(y_test, test_preds))

    print("{}:\t{} - {}".format(md, np.round(np.mean(train_accs), 2), np.round(np.mean(test_accs), 4)))

In [None]:
# cross validation: RF
# common parameter(s) to tune: max_depth and n_estimators
# For every max_depth a forest of 100 trees is fitted and scored on 10 random 80/20 splits.
# Printed per max_depth: "<max_depth>:  <mean train accuracy> - <mean test accuracy>"
for md in np.arange(5)+1:
    train_accs = list()
    test_accs = list()
    for i in range(10):
        # Seeding with the repetition index makes the run reproducible and scores
        # every max_depth on the same 10 splits.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = i)

        clf = RandomForestClassifier(max_depth = md, n_estimators = 100, random_state = i)
        # the labels are passed as a flat 1d array instead of a one-column DataFrame
        clf.fit(X_train, np.array(y_train).ravel())

        train_preds = clf.predict(X_train)
        test_preds = clf.predict(X_test)

        train_accs.append(accuracy_score(y_train, train_preds))
        test_accs.append(accuracy_score(y_test, test_preds))

    print("{}:\t{} - {}".format(md, np.round(np.mean(train_accs), 2), np.round(np.mean(test_accs), 4)))

In [None]:
# cross validation: KNN
# common parameter(s) to tune: n_neighbors
# For every n_neighbors in 4..20 the classifier is fitted and scored on 10 random 80/20 splits.
# Printed per n_neighbors: "<n_neighbors>:  <mean train accuracy> - <mean test accuracy>"
for nb in np.arange(3, 20)+1:
    train_accs = list()
    test_accs = list()
    for i in range(10):
        # Seeding with the repetition index makes the run reproducible and scores
        # every n_neighbors on the same 10 splits.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = i)

        clf = KNeighborsClassifier(n_neighbors = nb)
        # the labels are passed as a flat 1d array instead of a one-column DataFrame
        clf.fit(X_train, np.array(y_train).ravel())

        train_preds = clf.predict(X_train)
        test_preds = clf.predict(X_test)

        train_accs.append(accuracy_score(y_train, train_preds))
        test_accs.append(accuracy_score(y_test, test_preds))

    print("{}:\t{} - {}".format(nb, np.round(np.mean(train_accs), 2), np.round(np.mean(test_accs), 4)))

In [None]:
# cross validation: SVM
# common parameter(s) to tune: kernel and C
# For every C in 0.1, 0.2, ..., 1.0 a linear SVM is fitted and scored on 10 random 80/20 splits.
# Printed per C: "<C>:  <mean train accuracy> - <mean test accuracy>"
for c in np.arange(1, 11)/10:
    train_accs = list()
    test_accs = list()
    for i in range(10):
        # Seeding with the repetition index makes the run reproducible and scores
        # every C on the same 10 splits.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = i)

        clf = SVC(kernel = 'linear', C = c)
        # the labels are passed as a flat 1d array instead of a one-column DataFrame
        clf.fit(X_train, np.array(y_train).ravel())

        train_preds = clf.predict(X_train)
        test_preds = clf.predict(X_test)

        train_accs.append(accuracy_score(y_train, train_preds))
        test_accs.append(accuracy_score(y_test, test_preds))

    print("{}:\t{} - {}".format(c, np.round(np.mean(train_accs), 2), np.round(np.mean(test_accs), 4)))

In [None]:
# cross validation: xgboost classifier
# XGB's default parameters already perform quite well usually
# common parameter(s) to tune: max_depth, eta, subsample (there are many more, however https://xgboost.readthedocs.io/en/stable/parameter.html)

#tuning max_depth
# For every max_depth in 1..5 the classifier is fitted and scored on 10 random 80/20 splits.
# Printed per max_depth: "<max_depth>:  <mean train accuracy> - <mean test accuracy>"
for md in range(1, 6):
    train_accs = list()
    test_accs = list()
    for i in range(10):
        # Seeding with the repetition index makes the run reproducible and scores
        # every max_depth on the same 10 splits.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = i)

        clf = xgb.XGBClassifier(objective="multi:softmax", max_depth = md)
        clf.fit(X_train, y_train)

        train_preds = clf.predict(X_train)
        test_preds = clf.predict(X_test)

        train_accs.append(accuracy_score(y_train, train_preds))
        test_accs.append(accuracy_score(y_test, test_preds))

    # md is exactly the depth the models above were built with, so the label of
    # the printed row matches the evaluated setting
    print("{}:\t{} - {}".format(md, np.round(np.mean(train_accs), 2), np.round(np.mean(test_accs), 4)))

In [None]:
#tuning eta
# For every eta in 0.1..0.5 a classifier with max_depth = 1 is fitted and scored
# on 100 random 80/20 splits.
# Printed per eta: "<eta>:  <mean train accuracy> - <mean test accuracy>"
for eta in [0.1, 0.2, 0.3, 0.4, 0.5]:
    train_accs = list()
    test_accs = list()
    for i in range(100):
        # Seeding with the repetition index makes the run reproducible and scores
        # every eta on the same 100 splits.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = i)

        clf = xgb.XGBClassifier(objective="multi:softmax", max_depth = 1, eta = eta)
        clf.fit(X_train, y_train)

        train_preds = clf.predict(X_train)
        test_preds = clf.predict(X_test)

        train_accs.append(accuracy_score(y_train, train_preds))
        test_accs.append(accuracy_score(y_test, test_preds))

    print("{}:\t{} - {}".format(eta, np.round(np.mean(train_accs), 2), np.round(np.mean(test_accs), 4)))

In [None]:
#tuning subsample
# For every subsample ratio in 0.5..0.9 a classifier with max_depth = 1 and eta = 0.3
# is fitted and scored on 100 random 80/20 splits.
# Printed per ratio: "<subsample>:  <mean train accuracy> - <mean test accuracy>"
for subsample in [0.5, 0.6, 0.7, 0.8, 0.9]:
    train_accs = list()
    test_accs = list()
    for i in range(100):
        # Seeding with the repetition index makes the run reproducible and scores
        # every subsample ratio on the same 100 splits.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = i)

        # random_state is set on the model as well because row subsampling is random
        clf = xgb.XGBClassifier(objective="multi:softmax", max_depth = 1, eta = 0.3, subsample = subsample, random_state = i)
        clf.fit(X_train, y_train)

        train_preds = clf.predict(X_train)
        test_preds = clf.predict(X_test)

        train_accs.append(accuracy_score(y_train, train_preds))
        test_accs.append(accuracy_score(y_test, test_preds))

    print("{}:\t{} - {}".format(subsample, np.round(np.mean(train_accs), 2), np.round(np.mean(test_accs), 4)))

In [None]:
# In light of exercise 2:
# if you spend your time extensively optimizing xgboost parameter settings to squeeze out another 0.2% in accuracy, 
# you're either completely done or you've missed something...

In [None]:
######## Difficulties when dealing with imbalanced data
# Let's look at a more difficult example, where the data is not balanced

breastcancer = load_breast_cancer()

# features: one row per case, one named column per feature
X = pd.DataFrame(breastcancer['data'], columns = breastcancer['feature_names'])

# target: the class of each case (single column called 'label'),
# the class names are printed further below
y = pd.DataFrame(breastcancer['target'], columns = ['label'])

# quick look: summary statistics, then the first rows of features and labels,
# each section introduced by a separator line
separator = '#' * 60
for overview in (X.describe(), X.head(), y.head()):
    print(separator, end = '\n\n')
    print(overview)
print(separator, end = '\n\n')

# class names and how many cases there are per label
print(breastcancer['target_names'])
y.hist()

In [None]:
# We'll make this more imbalanced by removing 80% of the malignant cases
# (label 0 is treated as malignant, label 1 as benign)

# Start from the untouched dataset instead of the current X / y: re-running this
# cell then always yields the same reduced dataset rather than removing another
# 80% of what is left.
X_full = pd.DataFrame(breastcancer['data'], columns = breastcancer['feature_names'])
y_full = pd.DataFrame(breastcancer['target'], columns = ['label'])

print('old shape {}'.format(X_full.shape))
X_mal = X_full.loc[y_full['label'] == 0,]
y_mal = y_full.loc[y_full['label'] == 0,]

X_ben = X_full.loc[y_full['label'] == 1,]
y_ben = y_full.loc[y_full['label'] == 1,]

print('old ratio {:.2f}'.format(X_mal.shape[0] / X_full.shape[0]))

# keep a random fifth of the malignant cases; the seeded generator makes the
# selection reproducible
rng = np.random.default_rng(0)
idxs_keep = rng.choice(len(X_mal), size = len(X_mal) // 5, replace = False)

X_mal = X_mal.iloc[idxs_keep, ]
y_mal = y_mal.iloc[idxs_keep, ]

X = pd.concat([X_ben, X_mal])
y = pd.concat([y_ben, y_mal])

# the new ratio is the share of malignant cases in the reduced dataset, so it has
# to be divided by the number of rows that are left
print('new ratio {:.2f}'.format(X_mal.shape[0] / X.shape[0]))

print('new shape {}'.format(X.shape))
y.hist()

In [None]:
# Let's try the RF again
# For every max_depth a forest of 100 trees is fitted and scored on 10 random 80/20 splits.
# Printed per max_depth: "<max_depth>:  <mean train accuracy> - <mean test accuracy>"

for md in np.arange(5)+1:
    train_accs = list()
    test_accs = list()
    for i in range(10):
        # Seeding with the repetition index makes the run reproducible and scores
        # every max_depth on the same 10 splits; stratify = y keeps the share of
        # the rare class the same in the train and the test part.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = i, stratify = y)

        clf = RandomForestClassifier(max_depth = md, n_estimators = 100, random_state = i)
        # the labels are passed as a flat 1d array instead of a one-column DataFrame
        clf.fit(X_train, np.array(y_train).ravel())

        train_preds = clf.predict(X_train)
        test_preds = clf.predict(X_test)

        train_accs.append(accuracy_score(y_train, train_preds))
        test_accs.append(accuracy_score(y_test, test_preds))

    print("{}:\t{} - {}".format(md, np.round(np.mean(train_accs), 2), np.round(np.mean(test_accs), 4)))

In [None]:
# Some more metrics: for the training data
# random_state fixes the split and the forest, so the numbers below are reproducible;
# stratify = y keeps the share of the rare class the same in the train and the test part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 0, stratify = y)

clf = RandomForestClassifier(max_depth = 2, n_estimators = 100, random_state = 0)
# the labels are passed as a flat 1d array instead of a one-column DataFrame
clf.fit(X_train, np.array(y_train).ravel())
y_preds = clf.predict(X_train)

# confusion matrix as a plot, labelled with the class names of the dataset
ConfusionMatrixDisplay(confusion_matrix = confusion_matrix(y_train, y_preds), display_labels = breastcancer['target_names']).plot()
plt.show()

# For each metric: the weighted average first, then the per-class values.
#   precision = tp / (tp + fp)
#   recall    = tp / (tp + fn)
#   f1        = 2 * (precision * recall) / (precision + recall)
for header, metric in (('precision: {}', precision_score), ('recall {}', recall_score), ('f1 {}', f1_score)):
    print(header.format(np.round(metric(y_train, y_preds, average = 'weighted'), 3)))
    print(np.round(metric(y_train, y_preds, average = None), 3), end = '\n\n')

print('accuracy {}'.format(np.round(accuracy_score(y_train, y_preds), 3)))
print('balanced accuracy {}'.format(np.round(balanced_accuracy_score(y_train, y_preds), 3)))

In [None]:
# Some more metrics: for the test data
# random_state fixes the split and the forest, so the numbers below are reproducible;
# stratify = y keeps the share of the rare class the same in the train and the test part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 0, stratify = y)

clf = RandomForestClassifier(max_depth = 2, n_estimators = 100, random_state = 0)
# the labels are passed as a flat 1d array instead of a one-column DataFrame
clf.fit(X_train, np.array(y_train).ravel())
y_preds = clf.predict(X_test)

# confusion matrix as a plot, labelled with the class names of the dataset
ConfusionMatrixDisplay(confusion_matrix = confusion_matrix(y_test, y_preds), display_labels = breastcancer['target_names']).plot()
plt.show()

# For each metric: the weighted average first, then the per-class values.
#   precision = tp / (tp + fp)
#   recall    = tp / (tp + fn)
#   f1        = 2 * (precision * recall) / (precision + recall)
for header, metric in (('precision: {}', precision_score), ('recall {}', recall_score), ('f1 {}', f1_score)):
    print(header.format(np.round(metric(y_test, y_preds, average = 'weighted'), 3)))
    print(np.round(metric(y_test, y_preds, average = None), 3), end = '\n\n')

print('accuracy {}'.format(np.round(accuracy_score(y_test, y_preds), 3)))
print('balanced accuracy {}'.format(np.round(balanced_accuracy_score(y_test, y_preds), 3)))

In [None]:
# There are some techniques you can use: assign weights to data points, up- or down-sample until you get a more balanced split, or use techniques such as SMOTE...