# COMP5318 Assignment 1: Classification

### Group number: 74  , SID1: 530601364 , SID2: 530208163

In [23]:
# Import all libraries
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [24]:
# Load dataset
breast_cancer_df = pd.read_csv("breast-cancer-wisconsin.csv")

In [25]:
# Pre-process dataset
breast_cancer_df = breast_cancer_df.replace('?', np.nan)

# Replacing missing values with mean value of the column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(breast_cancer_df.iloc[:, :-1])
breast_cancer_df.iloc[:, :-1] = imputer.transform(breast_cancer_df.iloc[:, :-1])

# Normalising the values between [0,1]
scaler = MinMaxScaler()
scaler.fit(breast_cancer_df.iloc[:, :-1])
breast_cancer_df.iloc[:, :-1] = scaler.transform(breast_cancer_df.iloc[:, :-1])

# Changing the class values to 0 and 1 respectively
breast_cancer_df = breast_cancer_df.replace('class1', '0')
breast_cancer_df = breast_cancer_df.replace('class2', '1')
breast_cancer_df["class"] = breast_cancer_df["class"].astype(int)
    

  breast_cancer_df.iloc[:, :-1] = imputer.transform(breast_cancer_df.iloc[:, :-1])


In [26]:
# Print first ten rows of pre-processed dataset to 4 decimal places as per assignment spec
# A function is provided to assist

x = breast_cancer_df.drop('class', axis=1).values
y = y = breast_cancer_df['class'].values

def print_data(X, y, n_rows=10):
    """Takes a numpy data array and target and prints the first ten rows.
    
    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples)
        n_rows: numpy of rows to print
    """
    for example_num in range(n_rows):
        for feature in X[example_num]:
            print("{:.4f}".format(feature), end=",")

        if example_num == len(X)-1:
            print(y[example_num],end="")
        else:
            print(y[example_num])
            

print_data(x, y)

0.4444,0.0000,0.0000,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.4444,0.3333,0.3333,0.4444,0.6667,1.0000,0.2222,0.1111,0.0000,0
0.2222,0.0000,0.0000,0.0000,0.1111,0.1111,0.2222,0.0000,0.0000,0
0.5556,0.7778,0.7778,0.0000,0.2222,0.3333,0.2222,0.6667,0.0000,0
0.3333,0.0000,0.0000,0.2222,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.7778,1.0000,1.0000,0.7778,0.6667,1.0000,0.8889,0.6667,0.0000,1
0.0000,0.0000,0.0000,0.0000,0.1111,1.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.1111,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.0000,0.0000,0.1111,0.0000,0.0000,0.0000,0.4444,0
0.3333,0.1111,0.0000,0.0000,0.1111,0.0000,0.1111,0.0000,0.0000,0


### Part 1: Cross-validation without parameter tuning

In [27]:
## Setting the 10 fold stratified cross-validation
cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# The stratified folds from cvKFold should be provided to the classifiers

In [28]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

def logregClassifier(X, y):
    logreg = LogisticRegression(solver='liblinear')
    scores = cross_val_score(logreg, X, y, cv=cvKFold)
    return scores.mean()

In [29]:
#Naïve Bayes
from sklearn.naive_bayes import GaussianNB

def nbClassifier(X, y):
    nb = GaussianNB()
    scores = cross_val_score(nb, X, y, cv=cvKFold)
    return scores.mean()

In [30]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

def dtClassifier(X, y):
    tree = DecisionTreeClassifier(criterion='entropy', random_state = 0)
    scores = cross_val_score(tree, X, y, cv=cvKFold)
    return scores.mean()

In [31]:
# Ensembles: Bagging, Ada Boost and Gradient Boosting
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    bag_clt = BaggingClassifier(DecisionTreeClassifier(criterion='entropy', max_depth=max_depth, random_state=0), n_estimators = n_estimators, max_samples = max_samples, random_state=0)
    scores = cross_val_score(bag_clt, X, y, cv=cvKFold)
    return scores.mean()

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    ada_clt = AdaBoostClassifier(DecisionTreeClassifier(criterion='entropy', max_depth=max_depth), n_estimators = n_estimators, learning_rate = learning_rate, random_state=0)
    scores = cross_val_score(ada_clt, X, y, cv=cvKFold)
    return scores.mean()

def gbClassifier(X, y, n_estimators, learning_rate):
    gb_clt = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=0)
    scores = cross_val_score(gb_clt, X, y, cv=cvKFold)
    return scores.mean()

### Part 1 Results

In [32]:
# Parameters for Part 1:

#Bagging
bag_n_estimators = 60
bag_max_samples = 100
bag_max_depth = 6

#AdaBoost
ada_n_estimators = 60
ada_learning_rate = 0.5
ada_bag_max_depth = 6

#GB
gb_n_estimators = 60
gb_learning_rate = 0.5

x = breast_cancer_df.iloc[:, :-1]
y = breast_cancer_df.iloc[:, -1]

# Print results for each classifier in part 1 to 4 decimal places here:
print("LogR average cross-validation accuracy: {:.4f}".format(logregClassifier(x, y)))
print("NB average cross-validation accuracy: {:.4f}".format(nbClassifier(x, y)))
print("DT average cross-validation accuracy: {:.4f}".format(dtClassifier(x, y)))
print("Bagging average cross-validation accuracy {:.4f}".format(bagDTClassifier(x, y, bag_n_estimators, bag_max_samples, bag_max_depth)))
print("AdaBoost average cross-validation accuracy: {:.4f}".format(adaDTClassifier(x, y, ada_n_estimators, ada_learning_rate, ada_bag_max_depth)))
print("GB average cross-validation accuracy: {:.4f}".format(gbClassifier(x, y, gb_n_estimators, gb_learning_rate)))

LogR average cross-validation accuracy: 0.9657
NB average cross-validation accuracy: 0.9585
DT average cross-validation accuracy: 0.9385
Bagging average cross-validation accuracy 0.9571
AdaBoost average cross-validation accuracy: 0.9599
GB average cross-validation accuracy: 0.9613


### Part 2: Cross-validation with parameter tuning

In [33]:
# KNN
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

k = [1, 3, 5, 7, 9]
p = [1, 2]
param_grid_knn = {'n_neighbors': k,
              'p': p}

def bestKNNClassifier(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0)

    grid_search = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=cvKFold,
                          return_train_score=True)


    grid_search.fit(X_train, y_train) 
    
    test_set_score = grid_search.score(X_test, y_test)
    best_params = grid_search.best_params_
    best_cross_validation_score = grid_search.best_score_
    return best_params['n_neighbors'], best_params['p'], best_cross_validation_score, test_set_score

In [34]:
# SVM
# You should use SVC from sklearn.svm with kernel set to 'rbf'
from sklearn.svm import SVC

C = [0.01, 0.1, 1, 5, 15] 
gamma = [0.01, 0.1, 1, 10, 50]
param_grid_svm = {'C': C, 'gamma': gamma}

def bestSVMClassifier(X, y):    
    X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0)
   
    grid_search = GridSearchCV(SVC(kernel="rbf"), param_grid_svm, cv=cvKFold,
                          return_train_score=True)
    grid_search.fit(X_train, y_train)

    test_set_score = grid_search.score(X_test, y_test)
    best_params = grid_search.best_params_
    best_cross_validation_score = grid_search.best_score_
    
    return best_params['C'], best_params['gamma'], best_cross_validation_score, test_set_score

In [35]:
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to ‘sqrt’.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import *
import pandas as pd

n_estimators = [10, 30, 60, 100, 150]
max_leaf_nodes = [6, 12, 18]
param_grid_rfc = {'n_estimators': n_estimators, 
              'max_leaf_nodes': max_leaf_nodes}

def bestRFClassifier(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0
    )

    grid_search_rfc = GridSearchCV(RandomForestClassifier(criterion='entropy', random_state=0), param_grid_rfc, cv=cvKFold,
                               return_train_score=True)
    
    grid_search_rfc.fit(X_train, y_train)
    
    test_set_score = grid_search_rfc.score(X_test, y_test)
    best_params = grid_search_rfc.best_params_
    best_cross_validation_score = grid_search_rfc.best_score_

#classification report to find macro avg F1 score, weighted avg F1 score
    rf = RandomForestClassifier(criterion='entropy', random_state=0)
    rf.fit(X_train, y_train)
    actual = y_test
    predicted = rf.predict(X_test)
    predictions = metrics.classification_report(actual, predicted, output_dict=True)
    macro_avg_f1 = predictions['macro avg']['f1-score']
    weighted_avg_f1 = predictions['weighted avg']['f1-score']
    return best_params['n_estimators'], best_params['max_leaf_nodes'], best_cross_validation_score, test_set_score, macro_avg_f1, weighted_avg_f1

### Part 2: Results

In [36]:
# Perform Grid Search with 10-fold stratified cross-validation (GridSearchCV in sklearn). 
# The stratified folds from cvKFold should be provided to GridSearchV

# This should include using train_test_split from sklearn.model_selection with stratification and random_state=0
# Print results for each classifier here. All results should be printed to 4 decimal places except for
# "k", "p", n_estimators" and "max_leaf_nodes" which should be printed as integers.

x = breast_cancer_df.iloc[:, :-1]
y = breast_cancer_df.iloc[:, -1]
best_k, best_p, cross_val_score, test_set_acc = bestKNNClassifier(x,y)

print("KNN best k: {}".format(best_k))
print("KNN best p: {}".format(best_p))
print("KNN cross-validation accuracy: {:.4f}".format(cross_val_score))
print("KNN test set accuracy: {:.4f}".format(test_set_acc))

print()

best_c_parm, best_gam_parm, cross_val_accuracy, test_set_accuracy = bestSVMClassifier(x, y)
print("SVM best C: {:.4f}".format(best_c_parm))
print("SVM best gamma: {:.4f}".format(best_gam_parm))
print("SVM cross-validation accuracy: {:.4f}".format(cross_val_accuracy))
print("SVM test set accuracy: {:.4f}".format(test_set_accuracy))

print()

best_n_est, max_leaf_nodes, cross_val_acc, test_set_acc, test_set_macro_f1, test_set_macro_avg = bestRFClassifier(x,y)
print("RF best n_estimators: {}".format(best_n_est))
print("RF best max_leaf_nodes: {}".format(max_leaf_nodes))
print("RF cross-validation accuracy: {:.4f}".format(cross_val_acc))
print("RF test set accuracy: {:.4f}".format(test_set_acc))
print("RF test set macro average F1: {:.4f}".format(test_set_macro_f1))
print("RF test set weighted average F1: {:.4f}".format(test_set_macro_avg))

KNN best k: 3
KNN best p: 1
KNN cross-validation accuracy: 0.9695
KNN test set accuracy: 0.9543

SVM best C: 5.0000
SVM best gamma: 0.1000
SVM cross-validation accuracy: 0.9676
SVM test set accuracy: 0.9714

RF best n_estimators: 150
RF best max_leaf_nodes: 6
RF cross-validation accuracy: 0.9675
RF test set accuracy: 0.9657
RF test set macro average F1: 0.9689
RF test set weighted average F1: 0.9717
