Magic Gamma Telescope Classification Problem

In [3]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
data=pd.read_csv("magic04.data",delimiter=',',header=None)

In [4]:
data.columns=[f'feature{i}' for i in range(1,11)] + ['target']
data['target']=data['target'].map({'g':1,'h':0}).values
print(data.head())

   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0   28.7967   16.0021    2.6449    0.3918    0.1982   27.7004   22.0110   
1   31.6036   11.7235    2.5185    0.5303    0.3773   26.2722   23.8238   
2  162.0520  136.0310    4.0612    0.0374    0.0187  116.7410  -64.8580   
3   23.8172    9.5728    2.3385    0.6147    0.3922   27.2107   -6.4633   
4   75.1362   30.9205    3.1611    0.3168    0.1832   -5.5277   28.5525   

   feature8  feature9  feature10  target  
0   -8.2027   40.0920    81.8828       1  
1   -9.9574    6.3609   205.2610       1  
2  -45.2160   76.9600   256.7880       1  
3   -7.1513   10.4490   116.7370       1  
4   21.8393    4.6480   356.4620       1  


In [5]:
class_h=data[data['target']==0]
class_g=data[data['target']==1]
numofh=class_h.shape[0]
class_g_balanced = resample(class_g, replace=False,n_samples=numofh,random_state=42) 
data_balanced=pd.concat([class_h,class_g_balanced])
print(data_balanced['target'].value_counts())

target
0    6688
1    6688
Name: count, dtype: int64


In [6]:
features=data_balanced.drop('target',axis=1)
targets=data_balanced['target']
features_train,features_vald_test,target_train,target_vald_test=train_test_split(features,targets,test_size=0.3)
features_vald,features_test,target_vald,target_test=train_test_split(features_vald_test,target_vald_test,test_size=0.5)
print(f'Training set size: {features_train.shape[0]}')
print(f'Validation set size: {features_vald.shape[0]}')
print(f'Testing set size: {features_test.shape[0]}')

Training set size: 9363
Validation set size: 2006
Testing set size: 2007


In [7]:
scale=StandardScaler()
features_train = scale.fit_transform(features_train)
features_vald = scale.transform(features_vald)
features_test = scale.transform(features_test)

In [8]:
for k in range(1,21,3):
    knn=KNeighborsClassifier(n_neighbors=k)
    knn.fit(features_train,target_train)
    target_predict=knn.predict(features_vald)
    score=accuracy_score(target_vald,target_predict)
    print(f'prediction:{target_predict}')
    print(f'actual value:{target_vald}')
    print(f'accuarcy for k={k} on validation set: {score:.4f}')

prediction:[0 1 1 ... 1 1 1]
actual value:16088    0
4950     1
1617     1
17036    0
15518    0
        ..
12469    0
15673    0
792      1
7503     1
5812     1
Name: target, Length: 2006, dtype: int64
accuarcy for k=1 on validation set: 0.7861
prediction:[0 0 1 ... 1 1 1]
actual value:16088    0
4950     1
1617     1
17036    0
15518    0
        ..
12469    0
15673    0
792      1
7503     1
5812     1
Name: target, Length: 2006, dtype: int64
accuarcy for k=4 on validation set: 0.7866
prediction:[0 1 1 ... 1 0 1]
actual value:16088    0
4950     1
1617     1
17036    0
15518    0
        ..
12469    0
15673    0
792      1
7503     1
5812     1
Name: target, Length: 2006, dtype: int64
accuarcy for k=7 on validation set: 0.8136
prediction:[0 1 1 ... 1 0 1]
actual value:16088    0
4950     1
1617     1
17036    0
15518    0
        ..
12469    0
15673    0
792      1
7503     1
5812     1
Name: target, Length: 2006, dtype: int64
accuarcy for k=10 on validation set: 0.8036
prediction: