# Learning from freecodecamp
- Youtube: https://youtu.be/i_LwzRVP7bg?si=o6audQvtlLZ_RLzR
- Dataset: https://archive.ics.uci.edu/dataset/159/magic+gamma+telescope
- This notebook uses the "MAGIC Gamma Telescope" dataset from the UCI Machine Learning Repository

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Column names are supplied explicitly via `names=`; the final entry is the label.
cols = [
    "fLength", "fWidth", "fSize", "fConc", "fConc1",
    "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
    "class",
]
df = pd.read_csv("datasets/magic+gamma+telescope/magic04.data", names=cols)
# Show the distinct raw values found in the "class" column.
print(df["class"].unique())

In [None]:
# Encode the label as an integer: rows whose class is 'g' become 1, all others 0.
df["class"] = (df["class"] == 'g').astype(int)

In [None]:
# Preview the first rows of the DataFrame after label encoding.
df.head()

In [None]:
# For every feature column, overlay density-normalised histograms of the two
# classes (class 1 drawn as "gamma", class 0 drawn as "hadron").
for label in cols[:-1]:
    for class_value, colour, class_name in ((1, "blue", "gamma"), (0, "red", "hadron")):
        plt.hist(
            df.loc[df["class"] == class_value, label],
            color=colour,
            label=class_name,
            alpha=0.7,
            density=True,
        )
    plt.title(label)
    plt.ylabel("Probability")
    plt.xlabel(label)
    plt.legend()
    plt.show()

## Train, Validation, Test Datasets

In [None]:
# df.sample(frac=1) returns all rows in random order, i.e. it shuffles the data.
# The shuffled frame is then cut at 60% and 80% of its length, giving
# 60% train / 20% validation / 20% test.
# Positional slicing is used instead of np.split, because np.split on a
# DataFrame goes through the deprecated DataFrame.swapaxes.
shuffled = df.sample(frac=1)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
def scale_dataset(dataframe, oversample=False):
    """Standardise the features of *dataframe* and optionally oversample it.

    Every column except the last is treated as a feature; the last column is
    treated as the label. A new ``StandardScaler`` is fitted on the features
    on every call. When *oversample* is true, ``RandomOverSampler`` resamples
    the scaled features together with the labels.

    Returns:
        A ``(data, X, y)`` tuple where ``X`` is the scaled (and possibly
        resampled) feature matrix, ``y`` the matching labels, and ``data`` is
        ``X`` with ``y`` appended as the final column.
    """
    feature_columns = dataframe.columns[:-1]
    label_column = dataframe.columns[-1]

    raw_features = dataframe[feature_columns].values
    y = dataframe[label_column].values
    X = StandardScaler().fit_transform(raw_features)

    if oversample:
        X, y = RandomOverSampler().fit_resample(X, y)

    # Turn the labels into a single column so they can be joined to the features.
    label_as_column = np.reshape(y, (-1, 1))
    data = np.hstack((X, label_as_column))

    return data, X, y

In [None]:
# Count the rows of each class in the training split (1 = gamma, 0 = hadron).
for class_value in (1, 0):
    print(len(train[train["class"] == class_value]))

In [None]:
# Only the training split is oversampled, so the models see balanced classes
# while fitting. The validation and test splits keep their original class
# distribution so that evaluation metrics are not computed on duplicated rows.
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

In [None]:
# Number of training labels equal to 1 and to 0 after scale_dataset.
print((y_train == 1).sum())
print((y_train == 0).sum())

## kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# Create a k-nearest-neighbours classifier configured with n_neighbors=5
# and fit it on the training features and labels.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

In [None]:
# Predict labels for the test features with the fitted kNN model.
y_pred = knn_model.predict(X_test)

In [None]:
# Print the classification report comparing the test labels with y_pred.
print(classification_report(y_test, y_pred))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Construct the Gaussian Naive Bayes classifier and fit it on the training data
# in one step; nb_model is bound to the value returned by fit().
nb_model = GaussianNB().fit(X_train, y_train)

In [None]:
# Predict the test labels with the Naive Bayes model and print the
# classification report against the test labels.
y_pred = nb_model.predict(X_test)
print(classification_report(y_test, y_pred))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Construct a logistic regression classifier with default settings and fit it
# on the training data in one step; lg_model is bound to the value returned by fit().
lg_model = LogisticRegression().fit(X_train, y_train)

In [None]:
# Predict the test labels with the logistic regression model and print the
# classification report against the test labels.
y_pred = lg_model.predict(X_test)
print(classification_report(y_test, y_pred))

## SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Construct a support vector classifier with default settings and fit it on
# the training data in one step; svm_model is bound to the value returned by fit().
svm_model = SVC().fit(X_train, y_train)

In [None]:
# Predict the test labels with the SVM model and print the classification
# report against the test labels.
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

## Neural Networks

In [None]:
import tensorflow as tf

In [None]:
def plot_history(history):
    """Plot training/validation loss and accuracy curves side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the per-epoch series ``'loss'``, ``'val_loss'``, ``'accuracy'``
            and ``'val_accuracy'`` (as returned by ``train_model``'s fit call).
    """
    # One row, two columns: loss on the left, accuracy on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

    ax1.plot(history.history['loss'], label='loss')
    ax1.plot(history.history['val_loss'], label='val_loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Binary crossentropy')
    # Without a legend the two labelled loss curves cannot be told apart.
    ax1.legend()
    ax1.grid(True)

    ax2.plot(history.history['accuracy'], label='accuracy')
    ax2.plot(history.history['val_accuracy'], label='val_accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.legend()
    ax2.grid(True)

    plt.show()

In [None]:
def train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs):
    """Build, compile and fit a small feed-forward binary classifier.

    Args:
        X_train: Training features; the network expects 10 values per sample.
        y_train: Training labels.
        num_nodes: Number of units in each of the two hidden layers.
        dropout_prob: Dropout rate applied after each hidden layer.
        lr: Learning rate passed to the Adam optimizer.
        batch_size: Batch size used while fitting.
        epochs: Number of epochs to fit for.

    Returns:
        A ``(nn_model, history)`` tuple: the fitted model and the object
        returned by ``fit``.
    """
    nn_model = tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(10,)),
        # Drop some random elements to avoid overfitting.
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation='relu'),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    # Adam's learning rate is 0.001 by default; here it is set explicitly from lr.
    nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss="binary_crossentropy",
                     metrics=['accuracy'])

    history = nn_model.fit(
        # Hold out 20% of the training data to monitor validation loss/accuracy.
        X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2
    )

    return nn_model, history

In [None]:
# Brute-force hyperparameter search: train one model per parameter combination
# and keep the one with the lowest loss on the validation split.
least_val_loss = float('inf')
least_loss_model = None
epochs = 100
for num_nodes in [16, 32, 64]:
    for dropout_prob in [0, 0.2]:
        for lr in [0.1, 0.005, 0.001]:
            for batch_size in [32, 64, 128]:
                print(f"{num_nodes} nodes, dropout {dropout_prob}, lr {lr}, batch_size {batch_size}")
                model, history = train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs)
                plot_history(history)
                # The model is compiled with an accuracy metric, so evaluate()
                # returns [loss, accuracy]; index 0 is the loss to compare.
                val_loss = model.evaluate(X_valid, y_valid)[0]
                if val_loss < least_val_loss:
                    least_val_loss = val_loss
                    least_loss_model = model