In [232]:
#load class
import numpy as np
import pandas as pd # in case it's not installed then install using conda create -c conda-forge -n name_of_my_env python pandas
# more installations information can be found: https://pandas.pydata.org/docs/getting_started/install.html
import os
#load sklearn module for creating and evaluating ML models. In case sklearn isn't installed,
#follow this link https://scikit-learn.org/stable/install.html for further details on the installation process.
from sklearn.neighbors import KNeighborsClassifier #load your classifier. In this code, I have used KNN. You can choose other algorithms. You have to use at least 3 to complete this.
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler #module for perform scaling
from sklearn.model_selection import train_test_split #module for splitting datatset
from sklearn import metrics #module for evaluating performance

In [233]:
#load your data
file_path = "data.csv"
df = pd.read_csv(file_path) #change the name accordingly
df.head() # prints top 5 rows from the datatset to check data is load or not

Unnamed: 0,Temp,Humd,Label
0,24.0,23.0,1
1,24.0,23.0,1
2,24.0,23.0,1
3,24.0,23.0,1
4,24.0,23.0,1


In [234]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 Temp     0
Humd     0
Label    0
dtype: int64


In [235]:
# remove duplicates
df = df.drop_duplicates()

In [236]:
# prepare features
x = df.drop(['Label'],axis=1) #remove class or label
y = df['Label'] #load label

In [237]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.2) #split datatset. Here ratio is 80:20. Change accordingly

In [238]:
# Scale the data using standardization
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train) #scale training set
x_test = scaler.transform(x_test) #scale test set

In [239]:
x_train.view()

array([[-0.84735414, -1.05173237],
       [ 1.21812448, -0.66788844],
       [ 0.54091838,  1.23597745],
       [-0.87444238, -1.03637861],
       [-1.01665567,  0.19192196],
       [ 1.72602906,  1.1131474 ],
       [ 0.10073441,  1.23597745],
       [-0.84735414, -1.02102485]])

In [240]:
y_train.view()

996     1
1001    0
1000    0
997     1
1005    0
1002    0
1003    0
998     1
Name: Label, dtype: int64

In [241]:
np.unique(y_train)

array([0, 1], dtype=int64)

### First, we wil choose the `KNeighborsClassifier` model

In [242]:
z_KNN = KNeighborsClassifier(n_neighbors=3) # KNN classifier for 3 neighbours
KNN = z_KNN.fit(x_train,y_train) # start training

In [243]:
predict_KNN = KNN.predict(x_test) # performance in the test set

In [244]:
print("Accuracy:", metrics.accuracy_score(y_test,predict_KNN)) # evaluating the performance based on accuracy

Accuracy: 0.6666666666666666


### Next, we will use the `MLPClassifier` model and check it's accuracy.

In [245]:
z_MLPC = MLPClassifier(solver='lbfgs', alpha=1e-5,
                  hidden_layer_sizes=(5, 2), random_state=1)
MLPC = z_MLPC.fit(x_train, y_train)

In [246]:
predict_class = MLPC.predict(x_test)

In [247]:
print("Accuracy:", metrics.accuracy_score(y_test,predict_class))

Accuracy: 1.0


### Finally, we will be using the `DecisionTreeClassifier` model for our last model to train

In [248]:
z_DTC = DecisionTreeClassifier()
DTC = z_DTC.fit(x_train, y_train)

In [249]:
predict_tree = DTC.predict(x_test)

In [250]:
print("Accuracy:", metrics.accuracy_score(y_test,predict_tree))

Accuracy: 1.0


Conclusion: We found that the two algorithms we chose successfuly classifies between true and false data 100% of the time, while the Nearest Neighbors model.

### After finding a good model, we need to be able to store the trained model for future use. We chose to save the Decision Tree model:

In [252]:
import pickle

filename = "model.pickle"

pickle.dump(z_DTC, open(filename, "wb"))