# Homework 6 of Machine Learning

## PPG-DaLiA reference:

- https://archive.ics.uci.edu/ml/datasets/PPG-DaLiA
- https://archive.ics.uci.edu/ml/machine-learning-databases/00495/readme.pdf

## Scikit-learn reference:

- https://scikit-learn.org/stable/index.html

## PyTorch reference:

- https://pytorch.org/docs/stable/index.html

In [None]:
# read the csv file with pandas

import pandas as pd

# Load the training and test splits (paths are relative to the notebook's
# working directory) and report each frame's (rows, columns) shape.
csv_paths = ("data/train_data.csv", "data/test_data.csv")
train_df, test_df = map(pd.read_csv, csv_paths)

for frame in (train_df, test_df):
    print(frame.shape)

## 1    [10pts] Dataset Analysis and Preprocessing

In [None]:
# 1.1 [2pts] Output the features in training and test data and find the difference

# Fill in None in the following code.
# Every None below is a placeholder for the student's answer. Until the two
# feature lists are filled in, the for-loops fail because None is not iterable.

print("Training data:")
# Must become an iterable of train_df column names: it is looped over here and
# reused by later cells to index train_df one column at a time.
train_features = None
for fea in train_features:
    # The second placeholder is meant to show the type of column `fea`.
    print(f"Feature: {fea}, Type: {None}")

print("\nTest data:")
# Same role as train_features, but for test_df.
test_features = None
for fea in test_features:
    print(f"Feature: {fea}, Type: {None}")

# Placeholder for the features that appear in one list but not in the other.
print(f"\nDifference between train_features and test_features: {None}")

In [None]:
# 1.2 [2pts] Feature analysis

# Fill in None in the following code.

def feature_analysis(df, features):
    """Print a summary for every column of *df* named in *features*.

    Columns whose dtype is not ``object`` get a single line labelled with
    type, min, max, mean, std, number of unique values and number of nulls.
    Object-dtype columns get type / unique / null counts followed by one line
    per distinct value with the number of rows equal to that value.

    The ``None`` entries are template placeholders to be filled in; as
    written, ``round(None, 3)`` and iterating ``None`` raise ``TypeError``.

    Args:
        df: table indexable by column name whose columns expose ``.dtype``
            (called with the notebook's pandas DataFrames).
        features: iterable of column-name strings.

    Returns:
        None. All output goes to stdout.
    """
    for fea in features:
        # Pad the name to 11 characters so the printed columns line up.
        fea_name = fea.ljust(11, " ")
        if df[fea].dtype != object:
            # Non-object column: one-line statistical summary.
            print(
                f"Feature: {fea_name},\t Type: {None},\t",
                f"Min: {round(None, 3)},\t Max: {round(None, 3)},\t",
                f"Mean: {round(None, 3)},\t Std: {round(None, 3)},\t",
                f"unique_num: {None}, \t null_num: {None}"
            )
        else:
            # Object column: header line framed by dashed rules, then the
            # per-value counts.
            print(
                "-"*100,
                f"\nFeature: {fea_name},\t Type: {None},\t",
                f"unique_num: {None},\t null_num: {None}"
            )
            # Placeholder: the distinct values of df[fea] to enumerate.
            unique_value_list = None
            for value in unique_value_list:
                # NOTE(review): an equality count reports 0 for a NaN entry
                # (NaN != NaN) -- confirm how missing values should be shown.
                print(f"value: {value},\t num: {(df[fea] == value).sum()}")
            print("-"*100)

# Run the same per-feature report on both splits; a ruled line separates the
# training report from the test report.
report_inputs = (
    ("Training data:", train_df, train_features),
    ("Test data:", test_df, test_features),
)
for position, (heading, frame, columns) in enumerate(report_inputs):
    if position > 0:
        print(f"\n{'='*130}\n")
    print(heading)
    feature_analysis(frame, columns)

In [None]:
# 1.3 [2pts] Feature encoder

# Hint: use sklearn.preprocessing.LabelEncoder

# Only object-dtype columns are selected for encoding. The K-fold section later
# casts train_df.values / test_df.values to float64 and int32, so these columns
# have to hold numeric codes by then.
# NOTE(review): this loop only walks train_features / train_df -- presumably
# the same encoding must also be applied to test_df; confirm.
for fea in train_features:
    if train_df[fea].dtype == object:
        # Placeholder: replace with the encoding of column `fea`.
        pass

In [None]:
# 1.4 [2pts] Feature distribution

import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart for activity in training data
# The axis labels below imply one bar per activity value, with its row count
# as the bar height.
plt.bar() # TODO: supply the bar positions (activities) and heights (counts)
plt.xlabel('Activity')
plt.ylabel('Count')
plt.title('Bar Chart of Activity in Training Data')
plt.show()

# Histogram for heart_rate in training data
sns.histplot() # TODO: supply the training heart-rate values
plt.xlabel('Heart Rate')
plt.ylabel('Count')
plt.title('Histogram of Heart Rate in Training Data')
plt.show()

In [None]:
# 1.5 [2pts] Heatmap of feature correlation

# Per the title below, the heatmap should show the pairwise correlation between
# the features of the training data.
sns.heatmap() # TODO: supply the correlation matrix of the training features
plt.title('Feature Correlation of Training Data')
plt.show()

## 2    [15pts] K-Fold Cross Validation

In [None]:
import math
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Split each frame into a float feature matrix and an integer label vector.
# NOTE(review): the training frame drops its last two columns and reads the
# label from the second-to-last one, while the test frame drops only its last
# column -- presumably train_df carries one extra trailing column; confirm
# against the output of 1.1.
X_train = train_df.values[:, :-2].astype(np.float64)
y_train = train_df.values[:, -2].astype(np.int32)

X_test = test_df.values[:, :-1].astype(np.float64)
y_test = test_df.values[:, -1].astype(np.int32)

# Scale every feature column by its max-norm. The factors are computed on the
# training set only and then reused for the test set, so both splits live on
# the same scale and no test-set statistic leaks into preprocessing.
X_train, train_col_norms = normalize(X_train, axis=0, norm="max", return_norm=True)
# Guard against division by zero for columns that are all-zero in training.
train_col_norms = np.where(train_col_norms == 0, 1.0, train_col_norms)
X_test = X_test / train_col_norms

In [None]:
## Using sklearn, we can implement Logistic Regression easily 
from sklearn.linear_model import LogisticRegression

# Baseline classifier with a fixed C=1.0, fitted on the full training set.
lr_model = LogisticRegression(C=1.0, max_iter=500).fit(X_train, y_train)

# Hard label predictions for both splits, plus per-class probabilities on the
# test set (needed for the one-vs-rest AUC).
y_pred_train_lr = lr_model.predict(X_train)
y_pred_test_lr = lr_model.predict(X_test)
y_prob_lr = lr_model.predict_proba(X_test)

lr_train_acc = accuracy_score(y_train, y_pred_train_lr)
lr_test_acc = accuracy_score(y_test, y_pred_test_lr)
lr_test_auc = roc_auc_score(y_test, y_prob_lr, multi_class='ovr')
print("Logistic Regression:\t acc on train is %.4f ; acc on test is %.4f ; AUC on test is %.4f"
      % (lr_train_acc, lr_test_acc, lr_test_auc))

## However, this baseline uses a fixed C; the hyperparameter C has not been tuned yet

In [None]:
# 2.1 [6pts] K-fold Cross Validation using accuracy
## Choosing the hyper parameter C

# Number of folds and the candidate values of LogisticRegression's C to compare.
num_folds = 5
C_choices = [0.1, 1, 10, 100, 1000]

# To be filled by the split below: one entry per fold.
X_train_folds = []
y_train_folds = []
################################################################################
# TODO:                                                                        #
# Split up the training data into folds. After splitting, X_train_folds and    #
# y_train_folds should each be lists of length num_folds, where                #
# y_train_folds[i] is the label vector for the points in X_train_folds[i].     #
# Hint: Look up the numpy array_split function.                                #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****


# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# A dictionary holding the accuracies for different values of C that we find
# when running cross-validation. After running cross-validation,
# C_to_accuracies[C] should be a list of length num_folds giving the different
# accuracy values that we found when using that value of C.
# The plotting cell for 2.2 looks up C_to_accuracies[C] for every C in
# C_choices, so each candidate needs an entry.
C_to_accuracies = dict()


################################################################################
# TODO:                                                                        #
# Perform k-fold cross validation to find the best value of C. For each        #
# possible value of C, run the LogisticRegression algorithm num_folds times,   #
# where in each case you use all but one of the folds as training data and the #
# last fold as a validation set. Store the accuracies for all folds and all    #
# values of C in the C_to_accuracies dictionary.                               #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# Print out the computed accuracies, one line per (C, fold) pair in ascending
# order of C. Prints nothing while C_to_accuracies is still empty.
for C in sorted(C_to_accuracies):
    for accuracy in C_to_accuracies[C]:
        print('C = %.2f, accuracy = %f' % (C, accuracy))

In [None]:
# 2.2 [2pts] Plot the result of 2.1

# plot the raw observations: one scatter column per candidate C, placed at
# log10(C) on the x axis
for C in C_choices:
    accuracies = C_to_accuracies[C]
    plt.scatter([math.log10(C)] * len(accuracies), accuracies)

# plot the trend line with error bars that correspond to standard deviation.
# The x positions and the statistics are both derived from the same sorted key
# list, so they stay aligned even if C_choices is not listed in ascending order.
sorted_Cs = sorted(C_to_accuracies)
accuracies_mean = np.array([np.mean(C_to_accuracies[c]) for c in sorted_Cs])
accuracies_std = np.array([np.std(C_to_accuracies[c]) for c in sorted_Cs])
plt.errorbar([math.log10(c) for c in sorted_Cs], accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on C with accuracy')
plt.xlabel('log C')
plt.ylabel('Cross-validation accuracy')
plt.show()

- Based on the result, which C value will you choose?
[Put your answer here]

In [None]:
# 2.3 [5pts] K-fold Cross Validation using AUC

# Number of folds and the candidate values of LogisticRegression's C to compare.
num_folds = 5
C_choices = [0.1, 1, 10, 100, 1000]

# To be filled by the split below: one entry per fold.
X_train_folds = []
y_train_folds = []
################################################################################
# TODO:                                                                        #
# Split up the training data into folds. After splitting, X_train_folds and    #
# y_train_folds should each be lists of length num_folds, where                #
# y_train_folds[i] is the label vector for the points in X_train_folds[i].     #
# Hint: Look up the numpy array_split function.                                #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# A dictionary holding the AUC values for different values of C that we find
# when running cross-validation. After running cross-validation,
# C_to_AUC[C] should be a list of length num_folds giving the different
# AUC values that we found when using that value of C.
C_to_AUC = dict()


################################################################################
# TODO:                                                                        #
# Perform k-fold cross validation to find the best value of C. For each        #
# possible value of C, run the LogisticRegression algorithm num_folds times,   #
# where in each case you use all but one of the folds as training data and the #
# last fold as a validation set. Store the AUC values for all folds and all    #
# values of C in the C_to_AUC dictionary.                                      #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# Print out the computed AUCs, one line per (C, fold) pair in ascending order
# of C. The loop variable is deliberately not called `auc`: that name is the
# function imported from sklearn.metrics, and formatting it with %f would raise
# a TypeError.
for C in sorted(C_to_AUC):
    for auc_value in C_to_AUC[C]:
        print('C = %.2f, auc = %f' % (C, auc_value))

In [None]:
# 2.4 [2pts] Plot the result of 2.3

# plot the raw observations: one scatter column per candidate C, placed at
# log10(C) on the x axis
for C in C_choices:
    aucs = C_to_AUC[C]
    plt.scatter([math.log10(C)] * len(aucs), aucs)

# plot the trend line with error bars that correspond to standard deviation.
# The x positions and the statistics are both derived from the same sorted key
# list, so they stay aligned even if C_choices is not listed in ascending order.
sorted_Cs = sorted(C_to_AUC)
AUCs_mean = np.array([np.mean(C_to_AUC[c]) for c in sorted_Cs])
AUCs_std = np.array([np.std(C_to_AUC[c]) for c in sorted_Cs])
plt.errorbar([math.log10(c) for c in sorted_Cs], AUCs_mean, yerr=AUCs_std)
plt.title('Cross-validation on C with AUC')
plt.xlabel('log C')
plt.ylabel('Cross-validation AUC')
plt.show()

- According to the result, which C value will you choose?
[Put your answer here]

## 3 [15pts] Various Classification Models

In [None]:
# Using C=10, we can get a better LogisticRegression model

# Refit logistic regression with C=10 on the full training set.
lr_model = LogisticRegression(C=10, max_iter=500).fit(X_train, y_train)

# Hard label predictions for both splits, plus per-class probabilities on the
# test set (needed for the one-vs-rest AUC).
y_pred_train_lr = lr_model.predict(X_train)
y_pred_test_lr = lr_model.predict(X_test)
y_prob_lr = lr_model.predict_proba(X_test)

lr_train_acc = accuracy_score(y_train, y_pred_train_lr)
lr_test_acc = accuracy_score(y_test, y_pred_test_lr)
lr_test_auc = roc_auc_score(y_test, y_prob_lr, multi_class='ovr')
print("Logistic Regression:\t acc on train is %.4f ; acc on test is %.4f ; AUC on test is %.4f"
      % (lr_train_acc, lr_test_acc, lr_test_auc))

In [None]:
# 3.1 [2pts] Tree model

# Placeholder for the decision-tree classifier. Nothing in this cell fits it,
# so training on (X_train, y_train) is part of the answer.
tree_model = None

# Label predictions for the training / test features; the print below compares
# them with y_train / y_test via accuracy_score.
y_pred_train_tree = None
y_pred_test_tree = None
# Test-set scores passed to roc_auc_score(..., multi_class='ovr'). The
# logistic-regression cells pass predict_proba(X_test) in this position.
y_prob_tree = None

print("Decision Tree:\t acc on train is %.4f ; acc on test is %.4f ; AUC on test is %.4f" 
        %(accuracy_score(y_train, y_pred_train_tree), accuracy_score(y_test, y_pred_test_tree), roc_auc_score(y_test, y_prob_tree, multi_class='ovr'))) 

In [None]:
# 3.2 [2pts] Multi-Layer Perceptron

# Placeholder for the MLP classifier. Nothing in this cell fits it, so training
# on (X_train, y_train) is part of the answer.
MLP_model = None

# Label predictions for the training / test features; the print below compares
# them with y_train / y_test via accuracy_score.
y_pred_train_mlp = None
y_pred_test_mlp = None
# Test-set scores passed to roc_auc_score(..., multi_class='ovr'). The
# logistic-regression cells pass predict_proba(X_test) in this position.
y_prob_mlp = None

print("MLP:\t acc on train is %.4f ; acc on test is %.4f ; AUC on test is %.4f" 
        %(accuracy_score(y_train, y_pred_train_mlp), accuracy_score(y_test, y_pred_test_mlp), roc_auc_score(y_test, y_prob_mlp, multi_class='ovr'))) 

In [None]:
# 3.3 [2pts] Support Vector Machine

# Placeholder for the support-vector classifier. Nothing in this cell fits it,
# so training on (X_train, y_train) is part of the answer.
svm_model = None

# Label predictions for the training / test features; the print below compares
# them with y_train / y_test via accuracy_score.
y_pred_train_svm = None
y_pred_test_svm = None
# Test-set scores passed to roc_auc_score(..., multi_class='ovr'). The
# logistic-regression cells pass predict_proba(X_test) in this position.
y_prob_svm = None

print("SVM:\t acc on train is %.4f ; acc on test is %.4f ; AUC on test is %.4f" 
        %(accuracy_score(y_train, y_pred_train_svm), accuracy_score(y_test, y_pred_test_svm), roc_auc_score(y_test, y_prob_svm, multi_class='ovr'))) 

In [None]:
# 3.4 [2pts] Naive Bayes

# Placeholder for the naive Bayes classifier. Nothing in this cell fits it, so
# training on (X_train, y_train) is part of the answer.
bayesian_model = None

# Label predictions for the training / test features; the print below compares
# them with y_train / y_test via accuracy_score.
y_pred_train_nb = None
y_pred_test_nb = None
# Test-set scores passed to roc_auc_score(..., multi_class='ovr'). The
# logistic-regression cells pass predict_proba(X_test) in this position.
y_prob_nb = None

print("Naive Bayes:\t acc on train is %.4f ; acc on test is %.4f ; AUC on test is %.4f" 
        %(accuracy_score(y_train, y_pred_train_nb), accuracy_score(y_test, y_pred_test_nb), roc_auc_score(y_test, y_prob_nb, multi_class='ovr'))) 

In [None]:
# 3.5 [2pts] Random Forest

# Placeholder for the random-forest classifier. Nothing in this cell fits it,
# so training on (X_train, y_train) is part of the answer.
rf_model = None

# Label predictions for the training / test features; the print below compares
# them with y_train / y_test via accuracy_score.
y_pred_train_rf = None
y_pred_test_rf = None
# Test-set scores passed to roc_auc_score(..., multi_class='ovr'). The
# logistic-regression cells pass predict_proba(X_test) in this position.
y_prob_rf = None

print("Random Forest:\t acc on train is %.4f ; acc on test is %.4f ; AUC on test is %.4f" 
        %(accuracy_score(y_train, y_pred_train_rf), accuracy_score(y_test, y_pred_test_rf), roc_auc_score(y_test, y_prob_rf, multi_class='ovr'))) 

In [None]:
# 3.6 [2pts] LightGBM

# Placeholder for the LightGBM classifier. Nothing in this cell fits it, so
# training on (X_train, y_train) is part of the answer.
LightGBM_model = None

# Label predictions for the training / test features; the print below compares
# them with y_train / y_test via accuracy_score.
y_pred_train_gbm = None
y_pred_test_gbm = None
# Test-set scores passed to roc_auc_score(..., multi_class='ovr'). The
# logistic-regression cells pass predict_proba(X_test) in this position.
y_prob_gbm = None

print("LightGBM:\t acc on train is %.4f ; acc on test is %.4f ; AUC on test is %.4f" 
        %(accuracy_score(y_train, y_pred_train_gbm), accuracy_score(y_test, y_pred_test_gbm), roc_auc_score(y_test, y_prob_gbm, multi_class='ovr'))) 

In [None]:
# 3.7 [3pts] Plot ROCs in one image

################################################################################
# TODO:                                                                        #
# Plot the ROC on test set using ovr for all models above                      #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

- From the result, which model performs well?
[Put your answer here]
- And which model performs poorly?
[Put your answer here]

## 4 [15pts] Model Combination Strategies

In [None]:
# 4.1 [5pts] Voting

# Placeholders for the label predictions of the voting ensemble on the training
# and test features. They are scored against y_train / y_test with
# accuracy_score only, so no probability output is required by this cell.
y_pred_train_voting = None
y_pred_test_voting = None

print("Voting strategy:\t acc on train is %.4f ; acc on test is %.4f" 
        %(accuracy_score(y_train, y_pred_train_voting), accuracy_score(y_test, y_pred_test_voting))) 

In [None]:
# 4.2 [10pts] Stacking

# Placeholders for the label predictions of the stacking ensemble on the
# training and test features. They are scored against y_train / y_test with
# accuracy_score only, so no probability output is required by this cell.
y_pred_train_stacking = None
y_pred_test_stacking = None

print("Stacking strategy:\t acc on train is %.4f ; acc on test is %.4f" 
        %(accuracy_score(y_train, y_pred_train_stacking), accuracy_score(y_test, y_pred_test_stacking)))

## 5 [45pts] Regression Task in Practice

Write your code below.