In [30]:
import pandas as pd
import time
import pickle
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, confusion_matrix

In [31]:
df = pd.read_csv("dataset/arduino_data.csv")
df

Unnamed: 0,timestamp,temp,humidity,ppm
0,2023-06-09T10:46:48+05:30,38,38,24.01
1,2023-06-09T10:47:49+05:30,38,36,22.39
2,2023-06-09T10:48:49+05:30,38,36,21.62
3,2023-06-09T10:49:50+05:30,38,36,21.62
4,2023-06-09T10:50:50+05:30,38,36,21.62
...,...,...,...,...
10303,2023-06-17T02:51:13+05:30,28,32,25.71
10304,2023-06-17T02:52:13+05:30,28,32,25.71
10305,2023-06-17T02:53:14+05:30,28,32,25.71
10306,2023-06-17T02:54:14+05:30,28,32,28.43


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10308 entries, 0 to 10307
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   timestamp  10308 non-null  object 
 1   temp       10308 non-null  int64  
 2   humidity   10308 non-null  int64  
 3   ppm        10308 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 322.3+ KB


In [33]:
df.describe()

Unnamed: 0,temp,humidity,ppm
count,10308.0,10308.0,10308.0
mean,38.297051,32.617288,39.145906
std,4.053829,5.793688,18.36331
min,28.0,21.0,11.27
25%,35.0,29.0,24.85
50%,39.0,32.0,34.52
75%,41.0,36.0,49.67
max,45.0,55.0,328.6


In [34]:
df.isnull().sum()

timestamp    0
temp         0
humidity     0
ppm          0
dtype: int64

In [35]:
df.duplicated().sum()

0

In [36]:
df_clean = df.drop(columns="timestamp")
df_clean.head()

Unnamed: 0,temp,humidity,ppm
0,38,38,24.01
1,38,36,22.39
2,38,36,21.62
3,38,36,21.62
4,38,36,21.62


## Labeling

In [37]:
def label_quality(row):
   
    ppm_in_range = 0 <= row['ppm'] <= 100
    temp_in_range = 20 <= row['temp'] <= 32
    humidity_in_range = 30 <= row['humidity'] <= 70

    
    in_range_count = sum([ppm_in_range, temp_in_range, humidity_in_range])


    if ppm_in_range and temp_in_range and humidity_in_range:
        return 'healthy'
    elif in_range_count == 2:
        return 'normal'
    else:
        return 'unhealthy'

df_clean['label quality'] = df_clean.apply(label_quality, axis=1)

df_clean.head()


Unnamed: 0,temp,humidity,ppm,label quality
0,38,38,24.01,normal
1,38,36,22.39,normal
2,38,36,21.62,normal
3,38,36,21.62,normal
4,38,36,21.62,normal


In [38]:
df_clean = df_clean.to_csv("dataset/Air Quality Train Data", index=False)

## Training Model 

In [39]:
df_train = pd.read_csv("dataset/Air Quality Train Data.csv")

In [40]:
df_train.head()

Unnamed: 0,temp,humidity,ppm,label quality
0,38,38,24.01,normal
1,38,36,22.39,normal
2,38,36,21.62,normal
3,38,36,21.62,normal
4,38,36,21.62,normal


In [41]:
label_counts = df_train['label quality'].value_counts()
print(label_counts)

label quality
normal       6260
unhealthy    3067
healthy       981
Name: count, dtype: int64


In [42]:
features = ['temp', 'humidity', 'ppm']
target = ['label quality']

x = df_train[features]
y = df_train[target]

In [43]:
x.head()
print(len(x))

10308


In [44]:
y.head()
print(len(y))

10308


In [45]:
def modeling(model, x_train, x_test, y_train, y_test):
    start = time.time()

    
    model.fit(x_train, y_train)
    end = time.time()
    print(f'Training time: {end - start}')
    
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    return model, y_pred, y_pred_train

def evaluation(y_actual, y_pred, segment):
    accuracy = accuracy_score(y_actual, y_pred)
    confusion = confusion_matrix(y_actual, y_pred)

    print(f'Evaluation for segment {segment}')
    print('Accuracy : ', accuracy)
    print('Confusion Matrix : \n', confusion)

In [46]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)
print("X train shape: ", x_train.shape)
print("X test shape: ", x_test.shape)
print("y train shape: ", y_train.shape)
print("y test shape: ", y_test.shape)

X train shape:  (8246, 3)
X test shape:  (2062, 3)
y train shape:  (8246, 1)
y test shape:  (2062, 1)


## Logistic Regression

In [47]:
lr = LogisticRegression()
model_lr, y_pred_lr, y_pred_train_lr = modeling(lr, x_train, x_test, y_train, y_test)
print('-----------------')
evaluation(y_train, y_pred_train_lr, 'Data Training')
print('-----------------')
evaluation(y_test, y_pred_lr, 'Data Testing')

Training time: 0.16665029525756836
-----------------
Evaluation for segment Data Training
Accuracy :  0.9187484841135096
Confusion Matrix : 
 [[ 638  136    0]
 [ 106 4628  284]
 [   0  144 2310]]
-----------------
Evaluation for segment Data Testing
Accuracy :  0.9161008729388943
Confusion Matrix : 
 [[ 171   36    0]
 [  30 1138   74]
 [   0   33  580]]


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## kNN

In [48]:
knn = KNeighborsClassifier()
model_knn, y_pred_knn, y_pred_train_knn = modeling(knn, x_train, x_test, y_train, y_test)
print('-----------------')
evaluation(y_train, y_pred_train_knn, 'Data Training')
print('-----------------')
evaluation(y_test, y_pred_knn, 'Data Testing')

Training time: 0.016489267349243164


  return self._fit(X, y)


-----------------
Evaluation for segment Data Training
Accuracy :  0.9944215377152559
Confusion Matrix : 
 [[ 766    8    0]
 [   0 5001   17]
 [   0   21 2433]]
-----------------
Evaluation for segment Data Testing
Accuracy :  0.9903006789524733
Confusion Matrix : 
 [[ 204    3    0]
 [   0 1233    9]
 [   0    8  605]]


## Desicion Tree Classifier

In [49]:
dt = DecisionTreeClassifier(random_state=42, max_depth=1)
model_dt, y_pred_dt, y_pred_train_dt = modeling(dt, x_train, x_test, y_train, y_test)
print('-----------------')
evaluation(y_train, y_pred_train_dt, 'Data Training')
print('-----------------')
evaluation(y_test, y_pred_dt, 'Data Testing')

Training time: 0.015348196029663086
-----------------
Evaluation for segment Data Training
Accuracy :  0.8744845985932573
Confusion Matrix : 
 [[   0  774    0]
 [   0 4770  248]
 [   0   13 2441]]
-----------------
Evaluation for segment Data Testing
Accuracy :  0.8685741998060136
Confusion Matrix : 
 [[   0  207    0]
 [   0 1180   62]
 [   0    2  611]]


## Random Forest Regression

In [50]:
rf = RandomForestClassifier()
model_rf, y_pred_rf, y_pred_train_rf = modeling(rf, x_train, x_test, y_train, y_test)
print('-----------------')
evaluation(y_train, y_pred_train_rf, 'Data Training')
print('-----------------')
evaluation(y_test, y_pred_rf, 'Data Testing')

  return fit_method(estimator, *args, **kwargs)


Training time: 0.43584156036376953
-----------------
Evaluation for segment Data Training
Accuracy :  1.0
Confusion Matrix : 
 [[ 774    0    0]
 [   0 5018    0]
 [   0    0 2454]]
-----------------
Evaluation for segment Data Testing
Accuracy :  1.0
Confusion Matrix : 
 [[ 207    0    0]
 [   0 1242    0]
 [   0    0  613]]


## SVM

In [51]:
svm = SVC()
model_svm, y_pred_svm, y_pred_train_svm = modeling(svm, x_train, x_test, y_train, y_test)
print('-----------------')
evaluation(y_train, y_pred_train_svm, 'Data Training')
print('-----------------')
evaluation(y_test, y_pred_svm, 'Data Testing')

  y = column_or_1d(y, warn=True)


Training time: 0.5310244560241699
-----------------
Evaluation for segment Data Training
Accuracy :  0.9700460829493087
Confusion Matrix : 
 [[ 762   12    0]
 [  34 4847  137]
 [   0   64 2390]]
-----------------
Evaluation for segment Data Testing
Accuracy :  0.9709020368574199
Confusion Matrix : 
 [[ 204    3    0]
 [  10 1201   31]
 [   0   16  597]]


## Catboost

In [52]:
cb = CatBoostClassifier(verbose=0)
model_cb, y_pred_cb, y_pred_train_cb = modeling(cb, x_train, x_test, y_train, y_test)
print('-----------------')
evaluation(y_train, y_pred_train_cb, 'Data Training')
print('-----------------')
evaluation(y_test, y_pred_cb, 'Data Testing')

Training time: 4.86887264251709
-----------------
Evaluation for segment Data Training
Accuracy :  1.0
Confusion Matrix : 
 [[ 774    0    0]
 [   0 5018    0]
 [   0    0 2454]]
-----------------
Evaluation for segment Data Testing
Accuracy :  1.0
Confusion Matrix : 
 [[ 207    0    0]
 [   0 1242    0]
 [   0    0  613]]


## Save Model to Pickle

In [53]:
with open('model/model_lr.pkl', 'wb') as file:
    pickle.dump(lr, file)

In [54]:
with open('model/model_knn.pkl', 'wb') as file:
    pickle.dump(knn, file)

In [55]:
with open('model/model_dt.pkl', 'wb') as file:
    pickle.dump(dt, file)

In [56]:
with open('model/model_rf.pkl', 'wb') as file:
    pickle.dump(rf, file)

In [57]:
with open('model/model_svm.pkl', 'wb') as file:
    pickle.dump(svm, file)

In [58]:
with open('model/model_cb.pkl', 'wb') as file:
    pickle.dump(cb, file)