In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score


In [2]:
dt=pd.read_csv('data/Rainfall_data.csv')
dt.head()

Unnamed: 0,Index,Temperature,Humidity%,Rain
0,0,34,74.2,Yes
1,1,19,68.2,No
2,2,28,67.2,Yes
3,3,29,66.6,Yes
4,4,26,57.9,Yes


In [3]:
E=LabelEncoder()
E.fit(['Yes','No'])
dt['Rain']=E.transform(dt['Rain'])
dt.head()

Unnamed: 0,Index,Temperature,Humidity%,Rain
0,0,34,74.2,1
1,1,19,68.2,0
2,2,28,67.2,1
3,3,29,66.6,1
4,4,26,57.9,1


In [4]:
X=dt.drop(columns=['Index','Rain'])
y=dt['Rain']
X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.1)

In [5]:
KNN=KNeighborsClassifier()
KNN.fit(X_train,Y_train)

In [6]:
pred_y=KNN.predict(X_test)
acc=accuracy_score(Y_test,pred_y)
print('Accuracy:',acc)

Accuracy: 0.93


#### Performance & Metrics

In [7]:
from sklearn import metrics
cm=metrics.confusion_matrix(Y_test,pred_y)
cm

array([[ 614,   70],
       [  70, 1246]])

In [8]:
cf=pd.DataFrame({'True +ve':cm[:,0],
                 'True -ve':cm[:,1]},
                index=['Predicted +ve',
                       'Predicted -ve'])
cf

Unnamed: 0,True +ve,True -ve
Predicted +ve,614,70
Predicted -ve,70,1246


In [9]:
val=(Y_test,pred_y)
acc=metrics.accuracy_score(*val)
prc=metrics.precision_score(*val)
rcl=metrics.recall_score(*val)
fms=metrics.f1_score(*val)

print('Accuracy:',acc)
print('Precision:',prc)
print('Recal:',rcl)
print('F-Measure:',fms)

Accuracy: 0.93
Precision: 0.9468085106382979
Recal: 0.9468085106382979
F-Measure: 0.9468085106382979


In [10]:
rep=metrics.classification_report(Y_test,pred_y)
print(rep)

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       684
           1       0.95      0.95      0.95      1316

    accuracy                           0.93      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.93      0.93      0.93      2000



The support is the number of Positive values in  the sample for the feature or label here 0 and 1  i.e. Rain or No Rain. Macro average stands for  (macro*score of class 0 + macro*score of class 1  where macro is 0.5 here) and weighted average  stands for (weighted score of class 0 + weighted  score class 1 where the weight is mostly  imbalanced).

### Saving and Loading model

In [11]:
import joblib
joblib.dump(KNN,'RainPred.sav')

['RainPred.sav']

In [12]:
KNN1=joblib.load('RainPred.sav')
KNN1.predict([[34,70]])



array([1])