# Data Classification
The MAGIC gamma telescope dataset:
https://archive.ics.uci.edu/ml/datasets/MAGIC+Gamma+Telescope, it generated to simulate
registration of high energy gamma particles in a ground-based atmospheric Cherenkov gamma
telescope using the imaging technique allowing to discriminate statistically the information
caused by primary gammas (signal) from the images of hadronic showers
initiated by cosmic rays in the upper atmosphere (background).
It is required to investigate the data deeper, split into train and test data with class labels
g = gamma (signal) and h = hadron (background). You are asked to apply preprocessing and feature
selection techniques and construct classification models using different approaches such as Decision
Trees, AdaBoost, K-Nearest Neighbor (K-NN) and Logistic Regression and compare the results
between them and between with and without applying preprocessing and feature selection. Moreover,
you should evaluate and test the classification models accuracy.

In [255]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, auc, precision_recall_fscore_support
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

1. fLength: continuous # major axis of ellipse [mm] 
2. fWidth: continuous # minor axis of ellipse [mm] 
3. fSize: continuous # 10-log of sum of content of all pixels [in #phot] 
4. fConc: continuous # ratio of sum of two highest pixels over fSize [ratio] 
5. fConc1: continuous # ratio of highest pixel over fSize [ratio] 
6. fAsym: continuous # distance from highest pixel to center, projected onto major axis [mm] 
7. fM3Long: continuous # 3rd root of third moment along major axis [mm] 
8. fM3Trans: continuous # 3rd root of third moment along minor axis [mm] 
9. fAlpha: continuous # angle of major axis with vector to origin [deg] 
10. fDist: continuous # distance from origin to center of ellipse [mm] 
11. class: g,h # gamma (signal), hadron (background) 

g = gamma (signal): 12332 
h = hadron (background): 6688 


In [256]:
col_names = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym',  'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'class']
feature_names = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym',  'fM3Long', 'fM3Trans', 'fAlpha', 'fDist']
data = pd.read_csv("magic04.data", names=col_names)
X = data[feature_names]
Y = data['class']
data.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


## Without Preprocessing or feature selection

In [244]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.7)
X_train.head()
# print(X.shape)

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist
16380,76.2773,28.4018,2.888,0.4118,0.2128,68.0222,56.0183,24.7618,11.8216,303.9442
8647,27.5042,17.6334,2.4541,0.3445,0.174,-6.0405,7.7084,-16.0424,31.28,159.278
3531,26.1702,17.3918,2.6299,0.3775,0.2145,3.3366,23.0898,-16.6961,73.17,43.7272
3703,51.3155,11.598,2.4166,0.4713,0.251,-70.1741,20.7189,-11.2107,8.8813,271.059
5493,67.8749,23.5976,3.4861,0.2237,0.1135,54.1529,68.0549,-12.3682,0.505,174.157


In [246]:
model = GaussianNB()
y_pred = model.fit(X_train, Y_train).predict(X_train)

print("Number of mislabeled points %d out of %d total points."% ((Y_train != y_pred).sum(), X_train.shape[0]))

probs = model.predict_proba(X_test)
print('For model', 'accuracy =', model.score(X_test,Y_test))

# plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (name, roc_auc))
precision_recall_fscore_support(Y_train, y_pred)
# plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (name, roc_auc))
# print(X.shape)

Number of mislabeled points 3647 out of 13314 total points.
For model accuracy = 0.7295828951980372


(array([0.72845784, 0.71577262]),
 array([0.91733613, 0.3784127 ]),
 array([0.81205875, 0.49508514]),
 array([8589, 4725]))

## With Preprocessing and feature selection

In [249]:
def best_k(X):
    err = X.shape[0]
    best_val = 2
    
    for i in range(1, X.shape[1]):
    
        X_new = SelectKBest(f_classif, k=i).fit_transform(X, Y)
        model = GaussianNB()
        y_pred = model.fit(X_new, Y).predict(X_new)
        num = (Y != y_pred).sum()
        if num < err :
            err = num
            best_val = i
    
    return best_val

In [253]:
# print(X.shape)
X = preprocessing.StandardScaler().fit_transform(X)
K = best_k(X)
# print(K)

X_new = SelectKBest(f_classif, k=K).fit_transform(X, Y)
X_train, X_test, Y_train, Y_test = train_test_split(X_new, Y, train_size=0.7)

model = GaussianNB()
model.fit(X_train, Y_train)
y_pred = model.predict(X_train)
print("Number of mislabeled points %d out of %d total points."% ((Y_train != y_pred).sum(), X_train.shape[0]))

probs = model.predict_proba(X_test)
print('For model', 'accuracy =', model.score(X_test,Y_test))

# # # plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (name, roc_auc))
precision_recall_fscore_support(Y_train, y_pred)

Number of mislabeled points 3102 out of 13314 total points.
For model accuracy = 0.7600771118121276


(array([0.78052955, 0.72729962]),
 array([0.89371758, 0.53007114]),
 array([0.83329751, 0.61321696]),
 array([8675, 4639]))