# COMP5318 Assignment 1: Classification

### Group number: 74  , SID1: 530601364 , SID2: Lee's  

In [76]:
# Import all libraries
from sklearn.model_selection import StratifiedKFold
import pandas as pd

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# to make this notebook's output stable across runs
np.random.seed(42)

In [77]:
# Load dataset
breast_cancer_df = pd.read_csv("test-before.csv")
breast_cancer_df.head()

Unnamed: 0,a1,a2,a3,a4,a5,a6,class
0,5.88,0.4874,0.541,1.515,16.55,0.3458,class1
1,76.47,0.7286,0.6721,1.919,13.0,0.3308,class1
2,29.41,0.5879,?,0.0,0.0,0.5082,class1
3,29.41,0.5477,0.6148,2.626,0.0,0.5365,class1
4,17.65,0.794,0.623,3.636,28.96,?,class2


In [78]:
breast_cancer_df = breast_cancer_df.replace('?', np.nan)

In [79]:
# Pre-process dataset

# Replacing missing values with mean value of the column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(breast_cancer_df.iloc[:, :-1])
breast_cancer_df.iloc[:, :-1] = imputer.transform(breast_cancer_df.iloc[:, :-1])

# Normalising the values between [0,1]
breast_cancer_df.iloc[:, :-1] = MinMaxScaler().fit_transform(breast_cancer_df.iloc[:, :-1])

# Changing the class values to 0 and 1 respectively
breast_cancer_df = breast_cancer_df.replace('class1', '0')
breast_cancer_df = breast_cancer_df.replace('class2', '1')
breast_cancer_df["class"] = breast_cancer_df["class"].astype(int)
breast_cancer_df.head(25)
    

  breast_cancer_df.iloc[:, :-1] = imputer.transform(breast_cancer_df.iloc[:, :-1])


Unnamed: 0,a1,a2,a3,a4,a5,a6,class
0,0.062078,0.499949,0.541,0.207933,0.259404,0.061258,0
1,0.807327,0.747359,0.6721,0.263382,0.203762,0.058601,0
2,0.310494,0.603036,0.418738,0.0,0.0,0.090027,0
3,0.310494,0.561801,0.6148,0.360417,0.0,0.09504,0
4,0.186339,0.814443,0.623,0.499039,0.453918,0.159728,1
5,0.186339,0.603854,0.4754,0.152484,0.1,0.065474,1
6,0.683171,0.711355,0.623,0.0,0.0,0.087653,1
7,0.55891,0.525798,0.623,0.512901,0.0,0.086856,0
8,0.124155,0.463945,0.5574,0.582212,0.0,0.10085,1
9,0.248416,0.572161,0.5902,0.651523,0.383542,0.097945,0


In [80]:
# Print first ten rows of pre-processed dataset to 4 decimal places as per assignment spec
# A function is provided to assist

x = breast_cancer_df.drop('class', axis=1).values
y = y = breast_cancer_df['class'].values

def print_data(X, y, n_rows=10):
    """Takes a numpy data array and target and prints the first ten rows.
    
    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples)
        n_rows: numpy of rows to print
    """
    for example_num in range(n_rows):
        for feature in X[example_num]:
            print("{:.4f}".format(feature), end=",")

        if example_num == len(X)-1:
            print(y[example_num],end="")
        else:
            print(y[example_num])
            

print_data(x, y)

0.0621,0.4999,0.5410,0.2079,0.2594,0.0613,0
0.8073,0.7474,0.6721,0.2634,0.2038,0.0586,0
0.3105,0.6030,0.4187,0.0000,0.0000,0.0900,0
0.3105,0.5618,0.6148,0.3604,0.0000,0.0950,0
0.1863,0.8144,0.6230,0.4990,0.4539,0.1597,1
0.1863,0.6039,0.4754,0.1525,0.1000,0.0655,1
0.6832,0.7114,0.6230,0.0000,0.0000,0.0877,1
0.5589,0.5258,0.6230,0.5129,0.0000,0.0869,0
0.1242,0.4639,0.5574,0.5822,0.0000,0.1009,1
0.2484,0.5722,0.5902,0.6515,0.3835,0.0979,0


### Part 1: Cross-validation without parameter tuning

In [81]:
## Setting the 10 fold stratified cross-validation
cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# The stratified folds from cvKFold should be provided to the classifiers

In [82]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

def logregClassifier(X, y):
    logreg = LogisticRegression(solver='liblinear')
    scores = cross_val_score(logreg, X, y, cv=cvKFold)
    return scores.mean()

x = breast_cancer_df.iloc[:, :-1]
y = breast_cancer_df.iloc[:, -1]

print("Average cross-validation score for logistic regression: {:.4f}".format(logregClassifier(x, y)))

Average cross-validation score for logistic regression: 0.6510


In [83]:
#Naïve Bayes
from sklearn.naive_bayes import GaussianNB


def nbClassifier(X, y):
    nb = GaussianNB()
    scores = cross_val_score(nb, X, y, cv=cvKFold)
    return scores.mean()

x = breast_cancer_df.iloc[:, :-1]
y = breast_cancer_df.iloc[:, -1]

print("Average cross-validation score for naive bayes: {:.4f}".format(nbClassifier(x, y)))

Average cross-validation score for naive bayes: 0.6555


In [84]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier


def dtClassifier(X, y):
    
    tree = DecisionTreeClassifier(criterion='entropy', random_state = 42)
    scores = cross_val_score(tree, X, y, cv=cvKFold)
    return scores.mean()

x = breast_cancer_df.iloc[:, :-1]
y = breast_cancer_df.iloc[:, -1]

print("Average cross-validation score for decision trees: {:.4f}".format(dtClassifier(x,y)))

Average cross-validation score for decision trees: 0.7752


In [85]:
# Ensembles: Bagging, Ada Boost and Gradient Boosting
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    bag_clt = BaggingClassifier(DecisionTreeClassifier(criterion='entropy', max_depth = max_depth, random_state=42), n_estimators = n_estimators, max_samples = max_samples)
    scores = cross_val_score(bag_clt, X, y, cv=cvKFold)
    return scores.mean()

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    ada_clt = AdaBoostClassifier(DecisionTreeClassifier(criterion='entropy', max_depth=max_depth), n_estimators = n_estimators, learning_rate = learning_rate, random_state=42)
    scores = cross_val_score(ada_clt, X, y, cv=cvKFold)
    return scores.mean()

def gbClassifier(X, y, n_estimators, learning_rate):
    gb_clt = GradientBoostingClassifier(n_estimators = n_estimators, learning_rate = learning_rate)
    scores = cross_val_score(gb_clt, X, y, cv=cvKFold)
    return scores.mean()


x = breast_cancer_df.iloc[:, :-1]
y = breast_cancer_df.iloc[:, -1]

print("Average cross-validation score for bagDTClassifier: {:.4f}".format(bagDTClassifier(x,y, n_estimators = 500, max_samples = 100, max_depth = 10)))
print("Average cross-validation score for adaDTClassifier: {:.4f}".format(adaDTClassifier(x,y, n_estimators = 200, learning_rate = 0.5, max_depth = 10)))
print("Average cross-validation score for gbClassifier: {:.4f}".format(gbClassifier(x,y, n_estimators = 200, learning_rate = 0.2)))


Average cross-validation score for bagDTClassifier: 0.7562
Average cross-validation score for adaDTClassifier: 0.6845
Average cross-validation score for gbClassifier: 0.7319


### Part 1 Results

In [86]:
# Parameters for Part 1:

#Bagging
bag_n_estimators = 60
bag_max_samples = 100
bag_max_depth = 6

#AdaBoost
ada_n_estimators = 60
ada_learning_rate = 0.5
ada_bag_max_depth = 6

#GB
gb_n_estimators = 60
gb_learning_rate = 0.5

x = breast_cancer_df.iloc[:, :-1]
y = breast_cancer_df.iloc[:, -1]

# Print results for each classifier in part 1 to 4 decimal places here:
print("LogR average cross-validation accuracy: {:.4f}".format(logregClassifier(x, y)))
print("NB average cross-validation accuracy: {:.4f}".format(nbClassifier(x, y)))
print("DT average cross-validation accuracy: {:.4f}".format(dtClassifier(x, y)))
print("Bagging average cross-validation accuracy {:.4f}".format(bagDTClassifier(x, y, bag_n_estimators, bag_max_samples, bag_max_depth)))
print("AdaBoost average cross-validation accuracy: {:.4f}".format(adaDTClassifier(x, y, ada_n_estimators, ada_learning_rate, ada_bag_max_depth)))
print("GB average cross-validation accuracy: {:.4f}".format(gbClassifier(x, y, gb_n_estimators, gb_learning_rate)))

LogR average cross-validation accuracy: 0.6510
NB average cross-validation accuracy: 0.6555
DT average cross-validation accuracy: 0.7752
Bagging average cross-validation accuracy 0.7657
AdaBoost average cross-validation accuracy: 0.7512
GB average cross-validation accuracy: 0.7464


### Part 2: Cross-validation with parameter tuning

In [87]:
# KNN
k = [1, 3, 5, 7, 9]
p = [1, 2]
param_grid = {'n_neighbors': k,
              'p': p}

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

def bestKNNClassifier(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0)

    grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=cvKFold,
                          return_train_score=True)


    grid_search.fit(X_train, y_train) #doing 5 by 2 by 10 total runs    
    
    test_set_score = grid_search.score(X_test, y_test)
    best_params = grid_search.best_params_
    best_cross_validation_score = grid_search.best_score_
    best_estimator = grid_search.best_estimator_
    return best_params['n_neighbors'], best_params['p'], best_cross_validation_score, best_estimator  #(appropriate values so that the required printing can be done)

x = breast_cancer_df.iloc[:, :-1]
y = breast_cancer_df.iloc[:, -1]

print(bestKNNClassifier(x,y))


(1, 1, 0.7329166666666668, KNeighborsClassifier(n_neighbors=1, p=1))


In [15]:
# SVM
# You should use SVC from sklearn.svm with kernel set to 'rbf'
C = [0.01, 0.1, 1, 5, 15] 
gamma = [0.01, 0.1, 1, 10, 50]

def bestSVMClassifier(X, y):
    
    return  #(appropriate values so that the required printing can be done)

In [88]:
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to ‘sqrt’.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import *
import pandas as pd

n_estimators = [10, 30, 60, 100, 150]
max_leaf_nodes = [6, 12, 18]
param_grid = {'n_estimators': [10, 30, 60, 100, 150], 
              'max_leaf_nodes': [6, 12, 18]}

def bestRFClassifier(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0
    )

    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=cvKFold,
                               return_train_score=True)
    
    grid_search.fit(X_train, y_train)
    
    test_set_score = grid_search.score(X_test, y_test)
    best_params = grid_search.best_params_
    best_cross_validation_score = grid_search.best_score_
    best_esimator = grid_search.best_estimator_
    actual = y_test

    RF = RandomForestClassifier()
    RF.fit(X_train, y_train)
    actual = y_test
    predicted = RF.predict(X_test)
    lst = metrics.classification_report(actual, predicted)
    lst = lst.split()
    df = pd.Series(lst)
    return best_params['n_estimators'], best_params['max_leaf_nodes'], best_cross_validation_score, test_set_score, best_esimator, df[21], df[27]#(appropriate values so that the required printing can be done)
    # two more values, the marco average F1 score and the weighted average F1 score.
x = breast_cancer_df.iloc[:, :-1]
y = breast_cancer_df.iloc[:, -1]

print(bestRFClassifier(x,y))

(150, 18, 0.8074999999999999, 0.6792452830188679, RandomForestClassifier(max_leaf_nodes=18, n_estimators=150), '0.67', '0.68')


### Part 2: Results

In [89]:
# Perform Grid Search with 10-fold stratified cross-validation (GridSearchCV in sklearn). 
# The stratified folds from cvKFold should be provided to GridSearchV

# This should include using train_test_split from sklearn.model_selection with stratification and random_state=0
# Print results for each classifier here. All results should be printed to 4 decimal places except for
# "k", "p", n_estimators" and "max_leaf_nodes" which should be printed as integers.

x = breast_cancer_df.iloc[:, :-1]
y = breast_cancer_df.iloc[:, -1]

print("KNN best k: {}".format(bestKNNClassifier(x, y)[0]))
print("KNN best p: {}".format(bestKNNClassifier(x, y)[1]))
print("KNN cross-validation accuracy: {:.4f}".format(bestKNNClassifier(x, y)[2]))
print("KNN test set accuracy: {}".format(bestKNNClassifier(x, y)[3]))

# print()

# print("SVM best C: ")
# print("SVM best gamma: ")
# print("SVM cross-validation accuracy: ")
# print("SVM test set accuracy: ")

# print()

# print("RF best n_estimators: ")
# print("RF best max_leaf_nodes: ")
# print("RF cross-validation accuracy: ")
# print("RF test set accuracy: ")
# print("RF test set macro average F1: ")
# print("RF test set weighted average F1: ")

ValueError: Invalid parameter 'max_leaf_nodes' for estimator KNeighborsClassifier(). Valid parameters are: ['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'].