In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler

### DataSet: Magic Gamma Telescope

Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

Donated By: P.Savicky Institute of Computer Science, AS of CR Czech Republic savicky '@' cs.cas.cz

General Setup Cleaning and Transforming Data

In [None]:
# Column names applied to the CSV: ten feature columns followed by the label column.
# NOTE(review): "fm3Long" is cased differently from its sibling "fM3Trans" — confirm
# against the dataset's attribute list before renaming.
cols = [
    "fLength",
    "fWidth",
    "fSize",
    "fConc",
    "fConc1",
    "fAsym",
    "fm3Long",
    "fM3Trans",
    "fAlpha",
    "fDist",
    "class",
]

# Names are supplied explicitly, so pandas reads the first row of the file as data.
df = pd.read_csv("DataSets/magicGammaTelescope/magic04.data", names=cols)
df.head()

Changing Class Gamma/Hadron to Numerical Data

In [None]:
# Encode the label column as integers: "g" (gamma) -> 1, anything else -> 0.
# The dtype guard keeps this cell idempotent: once the column is numeric, comparing
# it to "g" again is False everywhere and would silently overwrite every label with 0.
if not pd.api.types.is_integer_dtype(df["class"]):
    df["class"] = (df["class"] == "g").astype(int)
df.head()

For each feature column (`cols[:-1]`, i.e. every column except `class`), compare the distribution of its values between the two classes.

In [None]:

# One overlaid, density-normalised histogram per feature, split by class:
# class 1 is drawn as Gamma (blue) and class 0 as Hadron (red), in that order.
class_styles = ((1, 'blue', 'Gamma'), (0, 'red', 'Hadron'))

for label in cols[:-1]:  # every column except the trailing "class" column
    for class_value, colour, class_name in class_styles:
        feature_values = df.loc[df["class"] == class_value, label]
        plt.hist(feature_values, color=colour, label=class_name, alpha=0.7, density=True)

    plt.title(label)
    plt.ylabel('Probability')
    plt.xlabel(label)
    plt.legend()
    plt.show()

    print("Label: " + label)

    

### Setup Vars for Train,Validate,Test
1st Seg: 0 -> 60% - Training Data <br>
2nd Seg: 60% -> 80% - Validation Data <br>
3rd Seg: 80% -> 100% - Testing Data <br>

In [None]:
# Shuffle once with a fixed seed so the 60/20/20 split is reproducible under
# Restart & Run All, then slice by position. Positional slicing replaces np.split,
# which on a DataFrame goes through the deprecated DataFrame.swapaxes.
SPLIT_SEED = 42
shuffled_df = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(0.6 * len(shuffled_df))
valid_end = int(0.8 * len(shuffled_df))

train = shuffled_df.iloc[:train_end]           # first 60%
valid = shuffled_df.iloc[train_end:valid_end]  # next 20%
test = shuffled_df.iloc[valid_end:]            # final 20%

### Data of Different Scale.
Example: the largest value in one column may be 5 while in another it is 5,000,000. <br>
We define a helper that standardizes the feature columns and reshapes the 1-D label vector into a column so it can be stacked beside the 2-D feature matrix.

In [None]:
def scale_dataset(dataframe, oversample=False, scaler=None, fit_scaler=False):
    """Standardize the feature columns of ``dataframe`` and optionally oversample.

    The last column is treated as the label; every other column is a feature.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Feature columns followed by a single label column.
    oversample : bool, default False
        When True, the scaled features and labels are passed through
        ``RandomOverSampler().fit_resample`` before being returned.
    scaler : object, optional
        Object exposing ``fit_transform`` and ``transform``. When omitted, a
        fresh ``StandardScaler`` is created and fitted on ``dataframe`` itself,
        which is the original behaviour of this function.
    fit_scaler : bool, default False
        Only consulted when ``scaler`` is supplied. True fits the supplied
        scaler on this data; False applies its already-fitted statistics.
        Fit on the training split, then reuse the same scaler for the
        validation and test splits so all three share one scale::

            scaler = StandardScaler()
            scale_dataset(train_df, oversample=True, scaler=scaler, fit_scaler=True)
            scale_dataset(valid_df, scaler=scaler)

    Returns
    -------
    data : numpy.ndarray
        Scaled features with the labels stacked on as the last column.
    X : numpy.ndarray
        Scaled (and possibly resampled) 2-D feature matrix.
    y : numpy.ndarray
        1-D label vector aligned with ``X``.
    """
    # Features are every column but the last; the last column holds the labels.
    features = dataframe[dataframe.columns[:-1]].values
    labels = dataframe[dataframe.columns[-1]].values

    # Without a caller-supplied scaler, fall back to fitting a new one here.
    if scaler is None:
        scaler = StandardScaler()
        fit_scaler = True

    if fit_scaler:
        features = scaler.fit_transform(features)
    else:
        features = scaler.transform(features)

    if oversample:
        features, labels = RandomOverSampler().fit_resample(features, labels)

    # labels is 1-D; reshape it to a single column so it can sit beside the 2-D features.
    data = np.hstack((features, np.reshape(labels, (-1, 1))))

    return data, features, labels

Scale the training, validation, and testing datasets so they are ready for simple built-in scikit-learn models

In [None]:
# Only the training split is oversampled; validation and test keep their
# original class balance (oversample defaults to False).
# NOTE(review): each call fits its own scaler inside scale_dataset, so valid/test
# are standardized with their own statistics rather than the training set's.
# NOTE(review): train/valid/test are rebound from DataFrames to arrays here, so this
# cell cannot be re-run without first re-running the split cell.
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid)
test, X_test, y_test = scale_dataset(test)

### Model 1: K-Nearest Neighbors (K-NN)

In [None]:
# K-nearest-neighbours classifier using the 2 closest training points.
# fit() returns the estimator itself, so construction and training can be chained.
knn_model = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)


### Classification Report (K-NearestNeighbors)

In [None]:
# Score the fitted K-NN model on the test split.
y_pred = knn_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Figures recorded from earlier runs, one pair per setting of n_neighbors.
#
# Precision - of the samples predicted as a class, the fraction that truly belong to it.
#   n_neighbors=1: .77, .84
#   n_neighbors=2: .67, .87
#
# Recall - of the samples that truly belong to a class, the fraction we predicted correctly.
#   n_neighbors=1: .68, .89
#   n_neighbors=2: .78, .80
#
# Accuracy - correct predictions divided by total predictions.
#   n_neighbors=1: .82
#   n_neighbors=2: .79

### Model 2: Naive-Bayes

In [None]:
# Gaussian Naive Bayes classifier, built and trained on the training split in one step.
nb_model = GaussianNB().fit(X_train, y_train)

### Classification Report (Gaussian Naive-Bayes)

In [None]:
# Score the Naive Bayes model on the same test split; the numbers differ from K-NN.
y_pred = nb_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

### Model 3: Logistic Regression

In [None]:
# Logistic regression on all features at once: a linear combination of the inputs is
# passed through the sigmoid so the output stays a probability between 0 and 1.
lr_model = LogisticRegression().fit(X_train, y_train)

### Classification Report (Logistic Regression)

In [None]:
# Score the logistic regression model on the test split; results differ slightly
# from the previous models and may vary between runs.
y_pred = lr_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

### Model 4: Support Vector Machine

In [None]:
# Support vector classifier with scikit-learn's default settings, trained in one step.
svm_model = SVC().fit(X_train, y_train)

### Classification Report (Support Vector Machine)

In [None]:
# Score the support vector machine on the test split.
y_pred = svm_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

### Can we beat a common result, ~80%? Lets try Neural Networks

Take all of the features and turn them into inputs by giving each one a weight value. <br>
Multiply each feature by its weight and sum the products (Σ); this weighted sum is the neuron. <br>
Add a bias term if needed. <br>
Finally, the result is passed through an activation function to produce our output. <br><br>

*Note:* without the activation function, the network is just a linear model.<br>
Examples include the sigmoid function (values from 0 to 1), tanh (values from -1 to 1), and ReLU (0 for any value below 0, linear for values of 0 and above).<br><br>

Features x0, x1, x2<br>
Weighting w0, w1, w2<br>

[x0] *= w0<br>
[x1] *= w1<br>
[x2] *= w2<br>

[xT] = [x0] + [x1] + [x2] <br>

[xT] += BIAS -> AF() -> *** OUTPUT *** <br>

### Let's Go! TensorFlow!

In [None]:
# Feed-forward neural network: two hidden Dense layers (each followed by Dropout)
# and a single sigmoid output unit, built as a Keras Sequential model.
def train_model(X_train, y_train, numNodes, dropoutProb, lr, batch_size, epochs,
                validation_data=None):
    """Build, compile and fit a binary classifier with two hidden layers.

    Parameters
    ----------
    X_train : array-like
        Training features; the first axis indexes samples.
    y_train : array-like
        Binary training labels aligned with ``X_train``.
    numNodes : int
        Number of units in each of the two hidden Dense layers.
    dropoutProb : float
        Dropout rate applied after each hidden layer.
    lr : float
        Learning rate passed to the Adam optimizer.
    batch_size : int
        Mini-batch size used by ``fit``.
    epochs : int
        Number of training epochs.
    validation_data : tuple, optional
        ``(X_valid, y_valid)`` to monitor during training. When omitted, the
        original behaviour is kept: ``validation_split=0.2`` holds out part of
        ``X_train``. Keras takes that part from the end of the arrays, so
        prefer explicit validation data if ``X_train`` is ordered (for
        example, if resampled rows were appended last).

    Returns
    -------
    nn_model : tf.keras.Sequential
        The fitted model.
    history : tf.keras.callbacks.History
        Per-epoch loss/accuracy for training and validation.
    """
    # Per-sample shape is read from the data instead of a hard-coded (10, ),
    # so the function works for any number of feature columns.
    input_shape = tuple(np.shape(X_train)[1:])

    nn_model = tf.keras.Sequential([

        tf.keras.layers.Flatten(input_shape=input_shape),

        # First hidden layer: numNodes fully connected units with ReLU.
        tf.keras.layers.Dense(numNodes, activation='relu'),

        # Dropout randomly zeroes a fraction of the previous layer's outputs during
        # training, which helps prevent overfitting.
        tf.keras.layers.Dropout(dropoutProb),

        # Second hidden layer, same width as the first.
        tf.keras.layers.Dense(numNodes, activation='relu'),

        tf.keras.layers.Dropout(dropoutProb),

        # Output layer: one sigmoid unit, giving a value between 0 and 1.
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

    nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='binary_crossentropy',
                    metrics=['accuracy'])

    # Either monitor the caller's validation set or hold out 20% of the training data.
    if validation_data is None:
        validation_kwargs = {'validation_split': 0.2}
    else:
        validation_kwargs = {'validation_data': validation_data}

    history = nn_model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                           **validation_kwargs)

    return nn_model, history

In [None]:
# Training curves for a fitted Keras model.
def plot_history(history):
    """Plot training vs. validation loss and accuracy per epoch, side by side.

    ``history`` must expose a ``history`` mapping containing the series
    'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
    """
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))

    # (metric key, y-axis label) for the left and right panel respectively.
    panels = (
        ('loss', 'Binary Crossentropy'),
        ('accuracy', 'Accuracy'),
    )

    for ax, (metric, y_label) in zip(axes, panels):
        ax.plot(history.history[metric], label=metric)
        ax.plot(history.history['val_' + metric], label='val_' + metric)
        ax.set_xlabel("Epoch")
        ax.set_ylabel(y_label)
        ax.legend()
        ax.grid(True)

    plt.show()

Display Tensor Flow Through PyPlot

### Neural Network Result

In [None]:
# Exhaustive search over the hyper-parameter grid; the model with the lowest loss
# on the validation split is kept in least_loss_model.
epochs = 100
least_val_loss = float('inf')
least_loss_model = None

# Flattened grid, in the same order as four nested loops (batch size varies fastest).
search_grid = [
    (nodes, dropout, rate, batch)
    for nodes in [16, 32, 64]
    for dropout in [0, 0.2]
    for rate in [0.1, 0.005, 0.001]
    for batch in [32, 64, 128]
]

for num_nodes, dropout_prob, lr, batch_size in search_grid:
    print(f"nodes {num_nodes}, dropout {dropout_prob}, lr {lr}, batch size {batch_size}")
    model, history = train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs)
    plot_history(history)

    # Element 0 of evaluate()'s result is the loss.
    val_loss = model.evaluate(X_valid, y_valid)[0]
    if val_loss < least_val_loss:
        least_val_loss = val_loss
        least_loss_model = model