In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

Dataset: Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

Donated by: P. Savicky, Institute of Computer Science, AS of CR, Czech Republic (savicky '@' cs.cas.cz)

In [53]:
# Column names for the magic04 data file; the last one ("class") is the label.
cols = [
    "fLength", "fWidth", "fSize", "fConc", "fConc1",
    "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
    "class",
]

# Passing `names` makes pandas use these headers and treat every row as data.
df = pd.read_csv("data/magic04.data", names=cols)

# Preview the first five rows.
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [54]:
# Encode the label as an integer: 1 for gamma ("g"), 0 for every other value
# (the hadron class).
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# would compare the already-encoded integers against "g" and silently turn
# every label into 0.
if not pd.api.types.is_integer_dtype(df["class"]):
    df["class"] = (df["class"] == "g").astype(int)
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,1
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,1
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,1
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,1
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,1


In [55]:
# Exploratory per-feature histograms, kept disabled by wrapping the code in a
# string literal. The class-0 histogram is labelled 'hadron' (it was 'gamma',
# which made both legend entries identical).
"""
for label in cols[:-1]:
  plt.hist(df[df["class"] == 1][label], color='blue', label='gamma', alpha=0.7, density=True) # in the dataframe where everything is == 1, plot them in a histogram from 'label' column
  plt.hist(df[df["class"] == 0][label], color='red', label='hadron', alpha=0.7, density=True) # in the dataframe where everything is == 0, plot them in a histogram from 'label' column
  plt.title(label)
  plt.ylabel("Probability")
  plt.xlabel(label)
  plt.legend()
  plt.show()
"""

'\nfor label in cols[:-1]:\n  plt.hist(df[df["class"] == 1][label], color=\'blue\', label=\'gamma\', alpha=0.7, density=True) # in the dataframe where everything is == 1, plot them in a histogram from \'label\' column\n  plt.hist(df[df["class"] == 0][label], color=\'red\', label=\'gamma\', alpha=0.7, density=True) # in the dataframe where everything is == 0, plot them in a histogram from \'label\' column\n  plt.title(label)\n  plt.ylabel("Probability")\n  plt.xlabel(label)\n  plt.legend()\n  plt.show()\n'

# Train, Validation, and Test datasets

In [56]:
# Shuffle once with a fixed seed so the 60/20/20 split is reproducible across
# kernel restarts, then slice by position. Positional slicing avoids calling
# np.split on a DataFrame, which goes through the deprecated
# DataFrame.swapaxes on recent pandas versions.
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled_df))
validation_end = int(0.8 * len(shuffled_df))

train = shuffled_df.iloc[:train_end]                    # first 60%
validation = shuffled_df.iloc[train_end:validation_end]  # next 20%
test = shuffled_df.iloc[validation_end:]                 # final 20%

In [57]:
def scale_dataset(dataframe, oversample=False, scaler=None):
    """Standardize the feature columns of ``dataframe`` and optionally balance the classes.

    Args:
        dataframe: DataFrame whose last column is the label and whose other
            columns are the features.
        oversample: When True, the scaled rows are passed through
            ``RandomOverSampler().fit_resample`` to even out the class counts.
        scaler: Optional, already-fitted object exposing ``transform``. When
            given, it is applied as-is, so validation/test data can be
            standardized with statistics learned from the training data
            (e.g. ``StandardScaler().fit(train_features)``). When None
            (default), a new ``StandardScaler`` is fitted on ``dataframe``
            itself, which is the original behavior.

    Returns:
        Tuple ``(data, x, y)``: ``x`` is the scaled 2-D feature array, ``y``
        the 1-D label array, and ``data`` is ``x`` with ``y`` appended as the
        last column.
    """
    features = dataframe[dataframe.columns[:-1]].values  # all but last column
    y = dataframe[dataframe.columns[-1]].values  # last column

    if scaler is None:
        x = StandardScaler().fit_transform(features)
    else:
        # Reuse the caller's statistics instead of re-fitting on this subset.
        x = scaler.transform(features)

    # The classes are imbalanced (more gammas than hadrons); oversampling
    # evens the class counts out so the models do not favor the majority.
    if oversample:
        x, y = RandomOverSampler().fit_resample(x, y)

    # Turn y into a column vector and place it to the right of the features.
    data = np.hstack((x, np.reshape(y, (-1, 1))))

    return data, x, y

In [58]:
# Standardize each split; only the training split is oversampled, so
# validation and test keep their original class proportions.
# NOTE(review): every call below fits its own StandardScaler inside
# scale_dataset, so validation/test are scaled with their own statistics
# rather than the training set's.
# NOTE(review): train/validation/test are rebound from DataFrames to arrays
# here, so re-running this cell without re-running the split cell will fail.
train, x_train, y_train = scale_dataset(train, oversample=True)
validation, x_validation, y_validation = scale_dataset(validation, oversample=False)
test, x_test, y_test = scale_dataset(test, oversample=False)

# k-Nearest Neighbors (kNN)

In [59]:
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

In [60]:
# Vote among the 5 nearest training points; a real project would tune this
# hyperparameter rather than fixing it.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(x_train, y_train)

In [61]:
# Score the fitted kNN classifier on the held-out test split.
y_prediction = knn_model.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_prediction))

              precision    recall  f1-score   support

           0       0.77      0.74      0.76      1380
           1       0.86      0.88      0.87      2424

    accuracy                           0.83      3804
   macro avg       0.82      0.81      0.81      3804
weighted avg       0.83      0.83      0.83      3804



# Naive Bayes

In [62]:
from sklearn.naive_bayes import GaussianNB

In [63]:
# Default-configured Gaussian Naive Bayes; fit() returns the estimator itself,
# so construction and training can be chained.
nb_model = GaussianNB().fit(x_train, y_train)

In [64]:
# Score the fitted Naive Bayes classifier on the held-out test split.
y_prediction = nb_model.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_prediction))

              precision    recall  f1-score   support

           0       0.74      0.42      0.53      1380
           1       0.73      0.92      0.81      2424

    accuracy                           0.74      3804
   macro avg       0.74      0.67      0.67      3804
weighted avg       0.74      0.74      0.71      3804



# Logistic Regression

In [65]:
from sklearn.linear_model import LogisticRegression

In [66]:
# Default-configured logistic regression; fit() returns the estimator itself,
# so construction and training can be chained.
lg_model = LogisticRegression().fit(x_train, y_train)

In [67]:
# Score the fitted logistic regression on the held-out test split.
y_prediction = lg_model.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_prediction))

              precision    recall  f1-score   support

           0       0.72      0.72      0.72      1380
           1       0.84      0.84      0.84      2424

    accuracy                           0.80      3804
   macro avg       0.78      0.78      0.78      3804
weighted avg       0.80      0.80      0.80      3804



# Support Vector Machines (SVM)

In [68]:
from sklearn.svm import SVC

In [69]:
# Default-configured support vector classifier. Training is noticeably slower
# than for the previous models. fit() returns the estimator itself, so
# construction and training can be chained.
svm_model = SVC().fit(x_train, y_train)

In [70]:
# Score the fitted SVM on the held-out test split.
y_prediction = svm_model.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_prediction))

              precision    recall  f1-score   support

           0       0.85      0.80      0.82      1380
           1       0.89      0.92      0.90      2424

    accuracy                           0.88      3804
   macro avg       0.87      0.86      0.86      3804
weighted avg       0.88      0.88      0.88      3804



# Neural Network

In [71]:
import tensorflow as tf

In [72]:
def plot_history(history):
    """Plot training vs. validation loss and accuracy for one Keras training run.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain the keys 'loss', 'val_loss', 'accuracy' and 'val_accuracy'.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

    # Left panel: loss per epoch.
    ax1.plot(history.history['loss'], label='loss')
    ax1.plot(history.history['val_loss'], label='val_loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Binary Crossentropy')
    ax1.legend()  # the curves are labelled, so show which is which
    ax1.grid(True)

    # Right panel: accuracy per epoch.
    ax2.plot(history.history['accuracy'], label='accuracy')
    ax2.plot(history.history['val_accuracy'], label='val_accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')  # this panel shows accuracy, not the loss
    ax2.legend()
    ax2.grid(True)

    plt.show()

In [73]:
def train_model(x_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs,
                validation_data=None):
    """Build, compile and fit a binary classifier with two hidden layers.

    Args:
        x_train: 2-D array of features, one row per sample.
        y_train: Labels for ``x_train``; the network ends in a single sigmoid
            unit trained with binary cross-entropy.
        num_nodes: Number of units in each of the two hidden layers.
        dropout_prob: Dropout rate applied after each hidden layer.
        lr: Learning rate passed to the Adam optimizer.
        batch_size: Mini-batch size used by ``fit``.
        epochs: Number of training epochs.
        validation_data: Optional ``(x_val, y_val)`` tuple monitored during
            training. When None (default), the original behavior is kept and
            ``validation_split=0.2`` is used. Keras takes that split from the
            last rows of the arrays before shuffling, so it is only
            representative if the rows are not ordered.
            NOTE(review): oversampled rows presumably sit at the end of
            ``x_train`` - confirm, and pass an explicit validation set if so.

    Returns:
        Tuple ``(nn_model, history)``: the fitted model and the object
        returned by ``fit``.
    """
    # Derive the input width from the data instead of hard-coding 10 features.
    n_features = int(np.shape(x_train)[1])

    nn_model = tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(n_features,)),
        tf.keras.layers.Dropout(dropout_prob),  # regularization against overfitting
        tf.keras.layers.Dense(num_nodes, activation='relu'),
        tf.keras.layers.Dropout(dropout_prob),  # regularization against overfitting
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

    nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='binary_crossentropy',
                     metrics=['accuracy'])

    # Use the caller's validation set when provided, else the original 20% split.
    if validation_data is None:
        validation_kwargs = {"validation_split": 0.2}
    else:
        validation_kwargs = {"validation_data": validation_data}

    history = nn_model.fit(
        x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0,
        **validation_kwargs
    )

    return nn_model, history

In [None]:
# Exhaustive grid search over layer width, dropout rate, learning rate and
# batch size (3 * 2 * 3 * 3 = 54 trainings of `epochs` epochs each).
# The model with the lowest loss on the held-out validation split is kept.
least_val_loss = float('inf')
least_loss_model = None 
epochs = 100

for num_nodes in [16, 32, 64]:
    for dropout_prob in [0, 0.2]:
        for lr in [0.01, 0.005, 0.001]:
            for batch_size in [32, 64, 128]:
                print(f"{num_nodes} nodes, dropout {dropout_prob}, lr {lr}, batch size {batch_size}")
                
                model, history = train_model(x_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs)
                
                plot_history(history) # loss should decrease, accuracy should increase
                
                # The model is compiled with one loss and one metric, so index 0 is the loss.
                # This is measured on x_validation/y_validation, not on the validation_split
                # that train_model uses internally for the plotted curves.
                val_loss = model.evaluate(x_validation, y_validation, verbose=0)[0]
                
                # Keep the best configuration seen so far.
                if val_loss < least_val_loss:
                    least_val_loss = val_loss
                    least_loss_model = model

In [None]:
# The network ends in a sigmoid unit, so predict() yields scores in (0, 1).
# Threshold them at 0.5 and flatten to a 1-D array of 0/1 labels to compare
# against y_test.
test_probabilities = least_loss_model.predict(x_test)
y_prediction = (test_probabilities > 0.5).astype(int).ravel()

print(classification_report(y_true=y_test, y_pred=y_prediction))