# Ionosphere- Classification Model

The notebook has following section:

1- Data understanding and exploration

2- Data cleaning

3- Data preparation: Feature Engineering and Scaling

4- Feature Selection using RFE and Model Building


## Data understanding and exploration

In [36]:
#import libraries 

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [37]:
#import data

data=pd.read_csv('Ionosphere.csv')
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V26,V27,V28,V29,V30,V31,V32,V33,V34,Class
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,1
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,0
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,1
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,0
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,1


In [38]:
#pair plot 

#plt.figure(figsize=(10,15))
#sns.pairplot(data)

# Data Cleaning

In [39]:
data.shape

(351, 35)

In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 35 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      351 non-null    int64  
 1   V2      351 non-null    int64  
 2   V3      351 non-null    float64
 3   V4      351 non-null    float64
 4   V5      351 non-null    float64
 5   V6      351 non-null    float64
 6   V7      351 non-null    float64
 7   V8      351 non-null    float64
 8   V9      351 non-null    float64
 9   V10     351 non-null    float64
 10  V11     351 non-null    float64
 11  V12     351 non-null    float64
 12  V13     351 non-null    float64
 13  V14     351 non-null    float64
 14  V15     351 non-null    float64
 15  V16     351 non-null    float64
 16  V17     351 non-null    float64
 17  V18     351 non-null    float64
 18  V19     351 non-null    float64
 19  V20     351 non-null    float64
 20  V21     351 non-null    float64
 21  V22     351 non-null    float64
 22  V2

In [41]:
#check null value

data.isnull().sum()

V1       0
V2       0
V3       0
V4       0
V5       0
V6       0
V7       0
V8       0
V9       0
V10      0
V11      0
V12      0
V13      0
V14      0
V15      0
V16      0
V17      0
V18      0
V19      0
V20      0
V21      0
V22      0
V23      0
V24      0
V25      0
V26      0
V27      0
V28      0
V29      0
V30      0
V31      0
V32      0
V33      0
V34      0
Class    0
dtype: int64

In [42]:
#featue variables

X=data.drop('Class',axis=1)

In [81]:
#Target variable

y=data['Class']

In [82]:
#train test data split

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=40)

## Data preparation: Feature Engineering and Scaling

### Standardising the data

In [83]:
#Scaling the feature variables

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
X_train[X_train.columns]=scaler.fit_transform(X_train[X_train.columns])

### Feature selection: Select K best method

In [84]:
#selecting best 10 features 

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
top_feature= SelectKBest(score_func=chi2,k=10)
fit=top_feature.fit(X_train,pd.DataFrame(y_train))

#Score of top 10 features 

best_feature= pd.DataFrame({"columns":X_train.columns,"scores": fit.scores_})
best_feature=best_feature.nlargest(10,"scores")
best_feature

Unnamed: 0,columns,scores
4,V5,5.916146
0,V1,4.914696
2,V3,4.259537
6,V7,3.227859
30,V31,2.44794
8,V9,1.770512
28,V29,1.602341
32,V33,1.56859
22,V23,1.507081
14,V15,1.362463


In [91]:
#New X_train data set with only selected top 15 variables

X_train=X_train[best_feature['columns']]
X_train.head()

Unnamed: 0,V5,V1,V3,V7,V31,V9,V29,V33,V23,V15
46,1.0,1.0,1.0,0.998005,0.889595,0.99919,0.907165,0.875575,0.93937,0.97644
336,0.835615,1.0,0.9421,0.698065,0.771045,0.51212,0.722845,0.78143,0.363115,0.10671
192,0.5,1.0,0.5,0.5,0.4375,0.5,0.385415,0.40104,0.416665,0.5
343,0.846935,1.0,0.67673,0.840975,0.66116,0.778585,0.67794,0.644715,0.731205,0.796805
69,1.0,1.0,1.0,1.0,1.0,1.0,0.5,1.0,1.0,1.0


## Model Building

In [92]:
#K-nearest Neighbor fiting the model with number of groups as 2

from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors = 2)
knn.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=2)

In [93]:
#predicting the output for the model

y_pred= knn.predict(X_train)

In [94]:
#confusion Matrics for predicted model

from sklearn import metrics
cnf_mtr=metrics.confusion_matrix(y_train,y_pred)
accuracy=(metrics.accuracy_score(y_train, y_pred))*100
print(cnf_mtr)

[[ 80   0]
 [ 12 153]]


In [97]:
#To check the sensitivity and specificity 
#name true positive, true negative, false positive, false negative for the confusion matrics

TP=cnf_mtr[1,1]
TN=cnf_mtr[0,0]
FP=cnf_mtr[0,1]
FN=cnf_mtr[1,0]

#sensitivity 

Sensitivity = (TP/(TP*FN))*100

#specificity

Specificity = (TN/(TN+FP))*100

#print accuracy score, sensitivity and specificity

print('Accuracy:',accuracy)
print('Sensitivity:', Sensitivity)
print('Specificity:', Sensitivity)

Accuracy: 95.10204081632652
Sensitivity: 8.333333333333332
Specificity: 8.333333333333332


### Fiting the model for test data

In [96]:
#fitting the model for test data

knn=KNeighborsClassifier(n_neighbors = 2)
knn.fit(X_test,y_test)

#Prediction for test set 

y_pred_test= knn.predict(X_test)

#confision matrics and accuracy score for the test data

cnf_mtr_test=metrics.confusion_matrix(y_test,y_pred_test)
accuracy_score_test=metrics.accuracy_score(y_test, y_pred_test)*100

#To check the sensitivity and specificity for test data set
#name true positive, true negative, false positive, false negative for the confusion matrics

TP=cnf_mtr[1,1]
TN=cnf_mtr[0,0]
FP=cnf_mtr[0,1]
FN=cnf_mtr[1,0]

#sensitivity for test data set

Sensitivity_test = (TP/(TP*FN))*100

#specificity for test data set

Specificity_test = (TN/(TN+FP))*100

#print accuracy score, sensitivity and specificity

print('Accuracy:',accuracy_score_test)
print('Sensitivity:', Sensitivity_test)
print('Specificity:', Specificity_test)

Accuracy: 97.16981132075472
Sensitivity: 8.333333333333332
Specificity: 100.0
