Magic Gamma Telescope Classification Problem

In [17]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [18]:
# Read the raw comma-separated data file. header=None tells pandas the file has
# no header row, so the columns get integer labels until they are renamed.
data = pd.read_csv(
    "magic04.data",
    sep=',',
    header=None,
)

In [19]:
# Name the ten feature columns plus the class label (the file is loaded without a header).
data.columns = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'target']

# Encode the class label as an integer: 'g' -> 1, 'h' -> 0.
# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on already-encoded labels would silently wipe the whole
# column. Only map while the column still holds the raw labels, and fail loudly
# on anything that is not 'g' or 'h' instead of letting NaN rows leak downstream.
if not pd.api.types.is_numeric_dtype(data['target']):
    encoded_target = data['target'].map({'g': 1, 'h': 0})
    unmapped = encoded_target.isna()
    if unmapped.any():
        unknown_labels = sorted(data.loc[unmapped, 'target'].astype(str).unique())
        raise ValueError(f"Unexpected class labels in 'target': {unknown_labels}")
    data['target'] = encoded_target.astype('int64')
print(data.head())

    fLength    fWidth   fSize   fConc  fConc1     fAsym  fM3Long  fM3Trans  \
0   28.7967   16.0021  2.6449  0.3918  0.1982   27.7004  22.0110   -8.2027   
1   31.6036   11.7235  2.5185  0.5303  0.3773   26.2722  23.8238   -9.9574   
2  162.0520  136.0310  4.0612  0.0374  0.0187  116.7410 -64.8580  -45.2160   
3   23.8172    9.5728  2.3385  0.6147  0.3922   27.2107  -6.4633   -7.1513   
4   75.1362   30.9205  3.1611  0.3168  0.1832   -5.5277  28.5525   21.8393   

    fAlpha     fDist  target  
0  40.0920   81.8828       1  
1   6.3609  205.2610       1  
2  76.9600  256.7880       1  
3  10.4490  116.7370       1  
4   4.6480  356.4620       1  


In [20]:
# Balance the two classes by undersampling class 1 down to the size of class 0.
is_class_h = data['target'].eq(0)
is_class_g = data['target'].eq(1)
class_h = data.loc[is_class_h]
class_g = data.loc[is_class_g]

# Number of class-0 rows; this is the target size for the class-1 subsample.
numofh = len(class_h)

# Draw without replacement so no class-1 row is duplicated; the fixed seed
# makes the selected subset repeatable.
class_g_balanced = resample(
    class_g,
    replace=False,
    n_samples=numofh,
    random_state=42,
)

# Stack the two equally sized groups and show the resulting class counts.
data_balanced = pd.concat([class_h, class_g_balanced])
print(data_balanced['target'].value_counts())

target
0    6688
1    6688
Name: count, dtype: int64


In [21]:
# Seed for both splits. Without a fixed random_state every run produces a
# different partition, so the reported metrics could not be reproduced.
SPLIT_SEED = 42

features = data_balanced.drop('target', axis=1)
targets = data_balanced['target']

# First split: 70% training, 30% held out. stratify keeps the class ratio of
# the balanced data identical in both parts.
features_train, features_vald_test, target_train, target_vald_test = train_test_split(
    features,
    targets,
    test_size=0.3,
    random_state=SPLIT_SEED,
    stratify=targets,
)

# Second split: halve the held-out part into validation and test sets
# (about 15% of the balanced data each).
features_vald, features_test, target_vald, target_test = train_test_split(
    features_vald_test,
    target_vald_test,
    test_size=0.5,
    random_state=SPLIT_SEED,
    stratify=target_vald_test,
)

print(f'Training set size: {features_train.shape[0]}')
print(f'Validation set size: {features_vald.shape[0]}')
print(f'Testing set size: {features_test.shape[0]}')

Training set size: 9363
Validation set size: 2006
Testing set size: 2007


In [22]:
# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted transform is then applied to all three splits, so no
# validation or test statistics are used when fitting.
scale = StandardScaler()
scale.fit(features_train)
features_train = scale.transform(features_train)
features_vald = scale.transform(features_vald)
features_test = scale.transform(features_test)

In [23]:
# Sweep the neighbourhood size k over 1, 4, 7, ..., 19. For each k, fit a KNN
# classifier on the training split and score it on the validation split.
results = []
for k in range(1, 21, 3):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(features_train, target_train)
    target_predict = knn.predict(features_vald)

    # With output_dict=True the report is a dict keyed by the string form of
    # each label; '1' selects the metrics for class 1 (rows originally labelled 'g').
    class_report = classification_report(target_vald, target_predict, output_dict=True)
    positive_metrics = class_report['1']

    results.append({
        'k': k,
        'Accuracy': accuracy_score(target_vald, target_predict),
        'Precision': positive_metrics['precision'],
        'Recall': positive_metrics['recall'],
        'F1-Score': positive_metrics['f1-score'],
        # Store the matrix as a nested list rather than a NumPy array: an array
        # is written to CSV as a multi-line, space-separated repr that cannot
        # be parsed back, whereas a list serialises as "[[a, b], [c, d]]".
        'Confusion Matrix': confusion_matrix(target_vald, target_predict).tolist(),
    })



In [24]:
# Collect the per-k metric records into one table (one row per k) and write it
# to disk without the positional index column.
results_df = pd.DataFrame(data=results)
results_df.to_csv(
    'knn_results.csv',
    index=False,
)