In [33]:
import numpy as np
import pandas as pd
import sklearn
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

## 0. Loading DataSet

In [34]:
data_set = np.genfromtxt('magic04.data', delimiter=',', dtype=str) # Data is in the form of array of tuples
labels = data_set[:, len(data_set[0]) - 1:len(data_set[0])]

# 1. Data Balancing

In [35]:
under_sampler = RandomUnderSampler()
sampled_data, sampled_labels = under_sampler.fit_resample(data_set, labels)

In [36]:
unique, counts = np.unique(labels, return_counts=True)
print(dict(zip(unique, counts)))
unique, counts = np.unique(sampled_labels, return_counts=True)
print(dict(zip(unique, counts)))

{'g': 12332, 'h': 6688}
{'g': 6688, 'h': 6688}


# 2. Data Split


In [37]:
training_data_set, testing_data_set = train_test_split(data_set, test_size=0.3)

In [38]:
training_data = np.array(training_data_set[:, 0:len(training_data_set[0]) - 1]).astype(np.float64)
training_labels = training_data_set[:, len(training_data_set[0]) - 1:len(training_data_set[0])]
training_labels = np.reshape(training_labels, len(training_labels))

In [39]:
testing_data = np.array(testing_data_set[:, 0:len(testing_data_set[0]) - 1]).astype(np.float64)
testing_labels = testing_data_set[:, len(testing_data_set[0]) - 1:len(testing_data_set[0])]
testing_labels = np.reshape(testing_labels, len(testing_labels))

# 3. Classification

In [40]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

### (a) Decision Tree

In [41]:
def decision_tree(tr_data, tr_labels, tst_data):
    decision_tree = tree.DecisionTreeClassifier()
    decision_tree_pred = decision_tree.fit(tr_data, tr_labels).predict(tst_data)
    return decision_tree_pred

### (b) AdaBoost

In [42]:
def ada_boost(tr_data, tr_labels, tst_data, n_estimators):
    ada_boost = AdaBoostClassifier(n_estimators=n_estimators)
    ada_boost_pred = ada_boost.fit(tr_data, tr_labels).predict(tst_data)
    return ada_boost_pred
    

### (c) K-Nearest Neighbors (K-NN)

In [43]:
def knn(tr_data, tr_labels, tst_data, k_neighb = 3):
    neigh = KNeighborsClassifier(n_neighbors=k_neighb)
    knn_pred = neigh.fit(tr_data, tr_labels).predict(tst_data)
    return knn_pred

### (d) Random Forests


In [44]:
def random_forests(tr_data, tr_labels, tst_data, n_estimators):
    random_forests = RandomForestClassifier(n_estimators=n_estimators)
    random_forests_pred = random_forests.fit(tr_data, tr_labels).predict(tst_data)
    return random_forests_pred
    

### (e) Na¨ıve Bayes


In [45]:
def naive_bayes(tr_data, tr_labels, tst_data):
    gnb = GaussianNB()
    naive_bayes_pred = gnb.fit(tr_data, tr_labels).predict(tst_data)
    return naive_bayes_pred

# 4. Model Parameter Tuning


In [46]:
decision_tree_pred = decision_tree(training_data, training_labels, testing_data)
print("Decision tree accuracy: ", accuracy_score(testing_labels, decision_tree_pred) * 100, "%")
precision_score(testing_labels, decision_tree_pred, average='weighted')
recall_score(testing_labels, decision_tree_pred, average='weighted')
f1_score(testing_labels, decision_tree_pred, average='weighted')

Decision tree accuracy:  81.23028391167192 %


0.81274406762659

In [47]:
naive_bayes_pred = naive_bayes(training_data, training_labels, testing_data)
print("Na¨ıve bayes accuracy: ", accuracy_score(testing_labels, naive_bayes_pred) * 100, "%")
precision_score(testing_labels, naive_bayes_pred, average='weighted')
recall_score(testing_labels, naive_bayes_pred, average='weighted')
f1_score(testing_labels, naive_bayes_pred, average='weighted')

Na¨ıve bayes accuracy:  73.39642481598318 %


0.7089796754098223

In [48]:
k_neighb = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
for k in k_neighb:
    knn_pred = knn(training_data, training_labels, testing_data, k)
    print(str(k) + "-NN accuracy: ", accuracy_score(testing_labels, knn_pred) * 100, "%")
    precision_score(testing_labels, knn_pred, average='weighted')
    recall_score(testing_labels, knn_pred, average='weighted')
    f1_score(testing_labels, knn_pred, average='weighted')

1-NN accuracy:  77.40974412898703 %
2-NN accuracy:  78.61899754644234 %
3-NN accuracy:  79.56536978618998 %
4-NN accuracy:  79.70557308096741 %
5-NN accuracy:  80.354013319313 %
6-NN accuracy:  80.21381002453558 %
7-NN accuracy:  80.61689449702068 %
8-NN accuracy:  80.30143708377146 %
9-NN accuracy:  80.354013319313 %
10-NN accuracy:  80.63441990886786 %
11-NN accuracy:  80.51174202593761 %
12-NN accuracy:  80.58184367332632 %
13-NN accuracy:  80.51174202593761 %
14-NN accuracy:  80.45916579039607 %
15-NN accuracy:  80.87977567472836 %
16-NN accuracy:  80.65194532071503 %
17-NN accuracy:  80.5993690851735 %
18-NN accuracy:  80.61689449702068 %
19-NN accuracy:  80.82719943918683 %
20-NN accuracy:  80.58184367332632 %


In [49]:
n_estimators = [90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]
for n in n_estimators:
    ada_boost_pred = ada_boost(training_data, training_labels, testing_data, n)
    print(str(n) + "_estimators AdaBoost accuracy: ", accuracy_score(testing_labels, ada_boost_pred) * 100, "%")
    precision_score(testing_labels, ada_boost_pred, average='weighted')
    recall_score(testing_labels, ada_boost_pred, average='weighted')
    f1_score(testing_labels, ada_boost_pred, average='weighted')

90_estimators AdaBoost accuracy:  84.13950227830354 %
91_estimators AdaBoost accuracy:  84.24465474938661 %
92_estimators AdaBoost accuracy:  84.22712933753942 %
93_estimators AdaBoost accuracy:  84.22712933753942 %
94_estimators AdaBoost accuracy:  84.24465474938661 %
95_estimators AdaBoost accuracy:  84.20960392569225 %
96_estimators AdaBoost accuracy:  84.3322818086225 %
97_estimators AdaBoost accuracy:  84.27970557308096 %
98_estimators AdaBoost accuracy:  84.24465474938661 %
99_estimators AdaBoost accuracy:  84.3322818086225 %
100_estimators AdaBoost accuracy:  84.43743427970557 %


In [50]:
n_estimators = [90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]
for n in n_estimators:
    random_forests_pred = random_forests(training_data, training_labels, testing_data, n)
    print(str(n) + "_estimators random forests accuracy: ", accuracy_score(testing_labels, random_forests_pred) * 100, "%")
    precision_score(testing_labels, random_forests_pred, average='weighted')
    recall_score(testing_labels, random_forests_pred, average='weighted')
    f1_score(testing_labels, random_forests_pred, average='weighted')

90_estimators random forests accuracy:  87.34665264633719 %
91_estimators random forests accuracy:  87.76726253066947 %
92_estimators random forests accuracy:  87.60953382404486 %
93_estimators random forests accuracy:  87.34665264633719 %
94_estimators random forests accuracy:  87.59200841219769 %
95_estimators random forests accuracy:  87.78478794251666 %
96_estimators random forests accuracy:  87.6621100595864 %
97_estimators random forests accuracy:  87.59200841219769 %
98_estimators random forests accuracy:  87.62705923589205 %
99_estimators random forests accuracy:  87.78478794251666 %
100_estimators random forests accuracy:  87.78478794251666 %


# 5. Report Requirements

# 6. Bonus

In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import preprocessing

In [53]:
le = preprocessing.LabelEncoder()
training_encoded_labels = le.fit_transform(training_labels)
testing_encoded_labels = le.fit_transform(testing_labels)

In [54]:
print(f'Training Data Shape {training_data.shape}')
print(f'Training Label Shape {training_encoded_labels.shape}')

Training Data Shape (13314, 10)
Training Label Shape (13314,)


In [56]:
print(f'Training Data  {training_data[0]}')
print(f'Training Label {training_encoded_labels[0]}')

Training Data  [ 54.2122  14.3806   2.8388   0.3594   0.2239 -49.5198 -37.021    6.2357
  28.641  110.78  ]
Training Label 1


In [261]:
training_data_t = torch.from_numpy(training_data).float()
testing_data_t = torch.from_numpy(testing_data).float()
training_labels_t = torch.from_numpy(training_encoded_labels).float()
testing_labels_t = torch.from_numpy(testing_encoded_labels).float()

In [262]:
print(training_data_t.shape)
print(testing_data_t.shape)

torch.Size([13314, 10])
torch.Size([5706, 10])


In [263]:
print(testing_encoded_labels.shape)
print(testing_encoded_labels.shape)

(5706,)
(5706,)


In [264]:
class ClassificationModel(nn.Module):
    def __init__(self, input_layer, hidden_layer, output_layer):
        super(ClassificationModel, self).__init__()
        self.l1 = nn.Linear(input_layer, hidden_layer)
        self.l2 = nn.Linear(hidden_layer, output_layer)
        self.dropout = nn.Dropout(0.5)
 
    def forward(self, data):
        x = self.l1(data)
        x = torch.relu(x)
        x = self.l2(x)
        return torch.sigmoid(x)
     


In [265]:

# def train_evalute_model(model, epochs, optimizer, loss):
#     epochs_data = []
#     test_acc = 0.0
#     for epoch in range(epochs):
        # # X is a torch Variable
        # permutation = torch.randperm(training_data.size()[0])

        # for i in range(0,training_data.size()[0], batch_size):
            # optimizer.zero_grad()

            # indices = permutation[i:i+batch_size]
            # batch_x, batch_y = training_data[indices], training_encoded_labels[indices]

            # in case you wanted a semi-full example
            # outputs = model.forward(batch_x)
            # loss = loss(outputs, batch_y)

            # loss.backward()
            # optimizer.step()

In [331]:
from sklearn.metrics import accuracy_score

In [376]:

def train_evalute_model(model, epochs, optimizer, loss_fn, training_data_t, testing_data_t, training_labels, testing_labels):
    epochs_data = []
    test_acc = 0.0
    for epoch in range(epochs):
        optimizer.zero_grad()
        model.train()
        y_pred = model(training_data_t)
        loss_train = loss_fn(y_pred, training_labels.reshape(-1, 1))
        loss_train.backward()

        optimizer.step()

        model.eval()
        y_pred_test = model(testing_data_t)
        loss_test = loss_fn(y_pred_test, testing_labels)
        condition = y_pred_test >= 0.5
        y_pred_test = torch.where(condition, 1, 0)
        test_acc = torch.sum(y_pred_test == testing_labels) / len(testing_labels)
        epochs_data.append(test_acc)
        
        if epoch % 100 == 0:
            print(f'EPOCH {epoch} : Test loss {loss_test} --- Test Acc is {test_acc}')
    return epochs_data
            

In [377]:
model = ClassificationModel(10, 64, 1)
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)


In [378]:
acc = train_evalute_model(model, 1500, optimizer, loss_fn, training_data_t, testing_data_t, training_labels_t.reshape(-1, 1), testing_labels_t.reshape(-1, 1))

Test loss 62.90485763549805 --- Test Acc is 0.34121978282928467
Test loss 42.22859573364258 --- Test Acc is 0.3391167223453522
Test loss 1.5627436637878418 --- Test Acc is 0.6154924631118774
Test loss 0.9155380129814148 --- Test Acc is 0.6640378832817078
Test loss 0.6877080202102661 --- Test Acc is 0.7004907131195068
Test loss 0.6104406714439392 --- Test Acc is 0.7267788052558899
Test loss 0.5660296678543091 --- Test Acc is 0.7395724058151245
Test loss 0.5375896692276001 --- Test Acc is 0.7492113709449768
Test loss 0.5193892121315002 --- Test Acc is 0.7562215328216553
Test loss 0.5068076252937317 --- Test Acc is 0.7625306844711304
Test loss 0.4978850483894348 --- Test Acc is 0.7667367458343506
Test loss 0.4915832579135895 --- Test Acc is 0.772870659828186
Test loss 0.4866025447845459 --- Test Acc is 0.7744479775428772
Test loss 0.4824337661266327 --- Test Acc is 0.7744479775428772
Test loss 0.4786672592163086 --- Test Acc is 0.7760252356529236
