# scikit-learn classification using kNN and Naive Bayes 

This lesson uses the [MAGIC Gamma telescope data](course_datasets.md#magic-gamma-telecope).  It is a classification task.  It first cleans and prepares the data, then uses two scikit-learn classifiers, k-nearest neighbours (kNN) and Naive Bayes, to make predictions.

It is based on the tutorial on the YouTube Machine Learning for Everybody channel [here](https://youtu.be/i_LwzRVP7bg?si=if0Cv0izdY4TIcsS).

In [None]:
# !pip  install imbalanced-learn

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler 

In [None]:
# Names for the ten feature columns followed by the label column ("class").
# They are supplied explicitly because the file is read with header=None,
# i.e. its first row is treated as data rather than as column names.
col_names = [
    "fLength", "fWidth", "fSize", "fConc", "fConc1",
    "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
    "class",
]
df = pd.read_csv("data/magic04.data", names=col_names, header=None)
# Last expression of the cell: displays the first rows as a sanity check.
df.head()

Check that the label only has two values: g (gamma) and h (hadron).

In [None]:
# Display the distinct values found in the label column.
df["class"].unique()

Convert the label to an integer: g -> 1, h -> 0.
Numeric labels are required by the classifiers.

In [None]:
# Encode the label as an integer: "g" (gamma) -> 1, any other value -> 0.
# The dtype guard keeps the cell safe to re-run: once the column is already
# integer, comparing it with "g" again would silently set every label to 0.
if not pd.api.types.is_integer_dtype(df["class"]):
    df["class"] = (df["class"] == "g").astype(int)
# Last expression of the cell: displays the first rows with the encoded label.
df.head()

For each of the independent variables, show the probability distribution of both gamma (in blue) and hadron (in red).

In [None]:
# (label value, legend text, colour) for each class, in drawing order.
class_styles = (
    (1, "gamma", "blue"),
    (0, "hadron", "red"),
)

# One figure per feature column (every column except the trailing label),
# overlaying the density-normalised histogram of each class.
for col_name in col_names[:-1]:
    for class_value, class_label, class_colour in class_styles:
        class_values = df.loc[df["class"] == class_value, col_name]
        plt.hist(
            class_values,
            bins=20,
            alpha=0.7,  # semi-transparent so the overlapping bars stay visible
            label=class_label,
            color=class_colour,
            density=True,
        )
    plt.title(col_name)
    plt.legend()
    plt.ylabel("Probability")
    plt.xlabel(col_name)
    plt.show()

In [13]:
def scale_dataset1(dataframe):
    """Standardise every feature column of ``dataframe`` in place.

    Each column named in the module-level ``col_names`` list, except the last
    entry (the label), is replaced by ``(value - column mean) / column
    standard deviation`` so that it has a mean of 0 and a standard deviation
    of 1.

    The dataframe that is passed in is modified and also returned.
    """
    feature_names = col_names[:-1]
    for feature in feature_names:
        column = dataframe[feature]
        centred = column - column.mean()
        dataframe[feature] = centred / column.std()
    return dataframe

In [8]:
def scale_dataset_alt(dataframe, oversample=False, scaler=None):
    """Standardise the feature columns and optionally balance the classes.

    Every column except the last is treated as a feature; the last column is
    treated as the label.

    Args:
        dataframe: pandas DataFrame whose final column holds the label.
        oversample: when True, the scaled data is resampled with
            ``RandomOverSampler(random_state=0)``.
        scaler: optional, already-fitted scaler exposing ``transform``. When
            given, it is applied as-is and not refitted, so a validation or
            test split can be scaled with the statistics learned from the
            training split. When None (the default), a new ``StandardScaler``
            is fitted on ``dataframe`` itself, as before.

    Returns:
        A ``(data, X, y)`` tuple: ``X`` is the scaled 2D feature array, ``y``
        the 1D label array, and ``data`` is ``X`` with ``y`` appended as the
        last column.
    """
    X = dataframe.iloc[:, :-1].to_numpy()
    y = dataframe.iloc[:, -1].to_numpy()

    if scaler is None:
        X = StandardScaler().fit_transform(X)
    else:
        # Reuse the caller's fitted statistics instead of refitting here.
        X = scaler.transform(X)

    if oversample:
        X, y = RandomOverSampler(random_state=0).fit_resample(X, y)

    # y is 1D; reshape it into a single column so it can be stacked beside X.
    data = np.hstack((X, y.reshape(-1, 1)))
    return data, X, y

A quick check to understand exactly what `reshape(-1, 1)` does to a 1D array.

In [None]:
# Small demonstration: reshape(-1, 1) turns the flat array of the integers
# 1..10 into a single-column 2D array (-1 lets NumPy work out the row count).
test_y = np.arange(1, 11)
test_y2 = test_y.reshape(-1, 1)
# Last expression of the cell: shows both arrays together with their shapes.
test_y, test_y.shape, test_y2, test_y2.shape

In [None]:
# Shuffle all rows once, then cut the shuffled frame into 60% train,
# 20% validation and 20% test.
# A fixed random_state makes the split reproducible between runs.
shuffled = df.sample(frac=1, random_state=0)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))

# Positional slicing replaces np.split(...): calling NumPy's split on a
# DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [12]:
# Scale each split and unpack the stacked array, the features and the labels.
# Only the training split is oversampled; validation and test are left as-is.
# NOTE(review): scale_dataset_alt fits a new StandardScaler on every call, so
# validation and test are standardised with their own statistics rather than
# the training set's - consider reusing the scaler fitted on the training data.
train, X_train, y_train = scale_dataset_alt(train, oversample=True)
valid, X_valid, y_valid = scale_dataset_alt(valid, oversample=False)
test, X_test, y_test = scale_dataset_alt(test, oversample=False)

# KNN
k nearest neighbours

In [13]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Build a k-nearest-neighbours classifier with n_neighbors=3 and fit it on
# the training features and labels.
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)

In [None]:
# Predict labels for the test features with the fitted kNN model; the bare
# `y_pred` on the last line displays the predictions.
y_pred = knn_model.predict(X_test)
y_pred

In [None]:
# Display the true test labels for comparison with the predictions.
y_test

In [None]:
# NOTE(review): confusion_matrix is imported but not used in the cells shown.
from sklearn.metrics import classification_report, confusion_matrix

# Print scikit-learn's classification report comparing the true test labels
# with the kNN predictions.
print(classification_report(y_test, y_pred))

In [19]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Build a Gaussian Naive Bayes classifier with default settings and fit it on
# the same training features and labels used for the kNN model.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

In [21]:
# Predict labels for the test features with the Naive Bayes model.
# This rebinds y_pred, replacing the kNN predictions stored earlier.
y_pred = nb_model.predict(X_test)

In [None]:
# Print the classification report for the Naive Bayes predictions against
# the true test labels.
print(classification_report(y_test, y_pred))