In [84]:
import numpy as np
import pandas as pd
import sklearn
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

## 0. Loading DataSet

In [67]:
data_set = np.genfromtxt('magic04.data', delimiter=',', dtype=str) # Data is in the form of array of tuples
labels = data_set[:, len(data_set[0]) - 1:len(data_set[0])]

# 1. Data Balancing

In [68]:
under_sampler = RandomUnderSampler()
sampled_data, sampled_labels = under_sampler.fit_resample(data_set, labels)

In [69]:
unique, counts = np.unique(labels, return_counts=True)
print(dict(zip(unique, counts)))
unique, counts = np.unique(sampled_labels, return_counts=True)
print(dict(zip(unique, counts)))

{'g': 12332, 'h': 6688}
{'g': 6688, 'h': 6688}


# 2. Data Split


In [81]:
training_data_set, testing_data_set = train_test_split(data_set, test_size=0.3)

In [82]:
training_data = np.array(training_data_set[:, 0:len(training_data_set[0]) - 1]).astype(np.float64)
training_labels = training_data_set[:, len(training_data_set[0]) - 1:len(training_data_set[0])]
training_labels = np.reshape(training_labels, len(training_labels))

In [83]:
testing_data = np.array(testing_data_set[:, 0:len(testing_data_set[0]) - 1]).astype(np.float64)
testing_labels = testing_data_set[:, len(testing_data_set[0]) - 1:len(testing_data_set[0])]
testing_labels = np.reshape(testing_labels, len(testing_labels))

# 3. Classification

In [128]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

### (a) Decision Tree

In [129]:
def decision_tree(tr_data, tr_labels, tst_data):
    decision_tree = tree.DecisionTreeClassifier()
    decision_tree_pred = decision_tree.fit(tr_data, tr_labels).predict(tst_data)
    return decision_tree_pred

### (b) AdaBoost

In [130]:
def ada_boost(tr_data, tr_labels, tst_data, n_estimators):
    ada_boost = AdaBoostClassifier(n_estimators=n_estimators)
    ada_boost_pred = ada_boost.fit(tr_data, tr_labels).predict(tst_data)
    return ada_boost_pred
    

### (c) K-Nearest Neighbors (K-NN)

In [131]:
def knn(tr_data, tr_labels, tst_data, k_neighb = 3):
    neigh = KNeighborsClassifier(n_neighbors=k_neighb)
    knn_pred = neigh.fit(tr_data, tr_labels).predict(tst_data)
    return knn_pred

### (d) Random Forests


In [132]:
def random_forests(tr_data, tr_labels, tst_data, n_estimators):
    random_forests = RandomForestClassifier(n_estimators=n_estimators)
    random_forests_pred = random_forests.fit(tr_data, tr_labels).predict(tst_data)
    return random_forests_pred
    

### (e) Na¨ıve Bayes


In [133]:
def naive_bayes(tr_data, tr_labels, tst_data):
    gnb = GaussianNB()
    naive_bayes_pred = gnb.fit(tr_data, tr_labels).predict(tst_data)
    return naive_bayes_pred

# 4. Model Parameter Tuning


In [134]:
decision_tree_pred = decision_tree(training_data, training_labels, testing_data)
print("Decision tree accuracy: ", accuracy_score(testing_labels, decision_tree_pred) * 100, "%")
precision_score(testing_labels, decision_tree_pred, average='weighted')
recall_score(testing_labels, decision_tree_pred, average='weighted')
f1_score(testing_labels, decision_tree_pred, average='weighted')

Decision tree accuracy:  81.77357167893446 %


0.8173872838925176

In [135]:
naive_bayes_pred = naive_bayes(training_data, training_labels, testing_data)
print("Na¨ıve bayes accuracy: ", accuracy_score(testing_labels, naive_bayes_pred) * 100, "%")
precision_score(testing_labels, naive_bayes_pred, average='weighted')
recall_score(testing_labels, naive_bayes_pred, average='weighted')
f1_score(testing_labels, naive_bayes_pred, average='weighted')

Na¨ıve bayes accuracy:  72.94076410795654 %


0.7037757415954962

In [136]:
k_neighb = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
for k in k_neighb:
    knn_pred = knn(training_data, training_labels, testing_data, k)
    print(str(k) + "-NN accuracy: ", accuracy_score(testing_labels, knn_pred) * 100, "%")
    precision_score(testing_labels, knn_pred, average='weighted')
    recall_score(testing_labels, knn_pred, average='weighted')
    f1_score(testing_labels, knn_pred, average='weighted')

1-NN accuracy:  78.42621801612339 %
2-NN accuracy:  79.21486154924641 %
3-NN accuracy:  79.67052225727305 %
4-NN accuracy:  79.96845425867508 %
5-NN accuracy:  80.354013319313 %
6-NN accuracy:  80.5292674377848 %
7-NN accuracy:  81.19523308797757 %
8-NN accuracy:  81.45811426568524 %
9-NN accuracy:  81.61584297230985 %
10-NN accuracy:  81.44058885383807 %
11-NN accuracy:  81.61584297230985 %
12-NN accuracy:  81.31791097090782 %
13-NN accuracy:  81.4055380301437 %
14-NN accuracy:  81.35296179460218 %
15-NN accuracy:  81.44058885383807 %
16-NN accuracy:  81.10760602874167 %
17-NN accuracy:  81.66841920785139 %
18-NN accuracy:  81.17770767613038 %
19-NN accuracy:  81.44058885383807 %
20-NN accuracy:  81.16018226428321 %


In [137]:
n_estimators = [90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]
for n in n_estimators:
    ada_boost_pred = ada_boost(training_data, training_labels, testing_data, n)
    print(str(n) + "_estimators AdaBoost accuracy: ", accuracy_score(testing_labels, ada_boost_pred) * 100, "%")
    precision_score(testing_labels, ada_boost_pred, average='weighted')
    recall_score(testing_labels, ada_boost_pred, average='weighted')
    f1_score(testing_labels, ada_boost_pred, average='weighted')

90_estimators AdaBoost accuracy:  84.43743427970557 %
91_estimators AdaBoost accuracy:  84.45495969155276 %
92_estimators AdaBoost accuracy:  84.4199088678584 %
93_estimators AdaBoost accuracy:  84.43743427970557 %
94_estimators AdaBoost accuracy:  84.40238345601122 %
95_estimators AdaBoost accuracy:  84.61268839817735 %
96_estimators AdaBoost accuracy:  84.5075359270943 %
97_estimators AdaBoost accuracy:  84.54258675078864 %
98_estimators AdaBoost accuracy:  84.54258675078864 %
99_estimators AdaBoost accuracy:  84.54258675078864 %
100_estimators AdaBoost accuracy:  84.5075359270943 %


In [138]:
n_estimators = [90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]
for n in n_estimators:
    random_forests_pred = random_forests(training_data, training_labels, testing_data, n)
    print(str(n) + "_estimators random forests accuracy: ", accuracy_score(testing_labels, random_forests_pred) * 100, "%")
    precision_score(testing_labels, random_forests_pred, average='weighted')
    recall_score(testing_labels, random_forests_pred, average='weighted')
    f1_score(testing_labels, random_forests_pred, average='weighted')

90_estimators random forests accuracy:  87.45180511742025 %
91_estimators random forests accuracy:  87.48685594111461 %
92_estimators random forests accuracy:  87.45180511742025 %
93_estimators random forests accuracy:  87.32912723449002 %
94_estimators random forests accuracy:  87.45180511742025 %
95_estimators random forests accuracy:  87.55695758850332 %
96_estimators random forests accuracy:  87.73221170697512 %
97_estimators random forests accuracy:  87.46933052926744 %
98_estimators random forests accuracy:  87.67963547143358 %
99_estimators random forests accuracy:  87.57448300035051 %
100_estimators random forests accuracy:  87.73221170697512 %


# 5. Report Requirements

# 6. Bonus