# Preprocessing

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import sklearn
import torch

In [2]:
data = pd.read_csv('./consolidated_data.csv')
data

Unnamed: 0,created_at (SGT),temp,humidity,gas,dampness,bin depth,human count,satisfaction
0,7/10/2023 22:55,30.50,64.00,1796,4095,15.00,2,4
1,7/10/2023 23:05,30.40,64.00,1796,4095,15.00,0,4
2,7/10/2023 23:15,30.40,65.00,1793,4095,15.00,1,4
3,7/10/2023 23:25,30.40,65.00,1780,4095,15.00,0,5
4,7/10/2023 23:35,30.40,65.00,1780,4095,15.00,0,4
...,...,...,...,...,...,...,...,...
701,,31.67,64.52,1825,2182,7.70,5,3
702,,32.97,63.48,1909,2337,11.23,18,2
703,,31.29,68.36,1980,3339,13.07,16,3
704,,28.39,57.56,1851,3011,11.64,5,4


In [3]:
data_X = data[["temp", "humidity", "gas", "dampness", "bin depth", "human count"]]
data_Y = data["satisfaction"]

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(data_X)
X = scaler.transform(data_X)
Y = data_Y

# Gaussian NB

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
clf = GaussianNB()
clf.fit(X_train, Y_train)

pred_train = clf.predict(X_train)
perf_train = np.mean(pred_train == Y_train)
print("The Naive Bayes classifer correctly classified %3.4f%% of the training data irises." % (perf_train * 100.0))

pred_test = clf.predict(X_test)
perf_test = np.mean(pred_test == Y_test)
print("The Naive Bayes classifer correctly classified %3.4f%% of the test data irises." % (perf_test * 100.0))

The Naive Bayes classifer correctly classified 33.6879% of the training data irises.
The Naive Bayes classifer correctly classified 30.9859% of the test data irises.


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
svc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('linearSVC', LinearSVC(max_iter = 100000, loss = 'hinge', penalty = 'l2', dual = 'auto'))])
svc_pipe.fit(X_train, Y_train)

pred_train = svc_pipe.predict(X_train)
perf_train = np.mean(pred_train == Y_train)
print("The Pipeline [StandardScaler, LinearSVC] correctly classified %3.4f%% of the training data irises" % (perf_train * 100.0))

pred_test = svc_pipe.predict(X_test)
perf_test = np.mean(pred_test == Y_test)
print("The Pipeline [StandardScaler, LinearSVC] correctly classified %3.4f%% of the testing data irises" % (perf_test * 100.0))

The Pipeline [StandardScaler, LinearSVC] correctly classified 21.2766% of the training data irises
The Pipeline [StandardScaler, LinearSVC] correctly classified 19.0141% of the testing data irises


In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
C_params = {'C':[1, 10]}
kernel_params = {'kernel':('linear', 'poly', 'rbf', 'sigmoid')}
decision_function_shape_params = {'decision_function_shape':('ovr', 'ovo')}

SVC_pipeline = Pipeline([('scaler', StandardScaler()),
                         ('svc', GridSearchCV(SVC(max_iter = 10000), [C_params, kernel_params, decision_function_shape_params])), ])
SVC_pipeline.fit(X_train, Y_train)

pred_train = SVC_pipeline.predict(X_train)
perf_train = np.mean(pred_train == Y_train)
print("The Pipeline [StandardScaler, GridSearchCV(SVC)] correctly classified %3.4f%% of the training data irises" % (perf_train * 100.0))

pred_test = SVC_pipeline.predict(X_test)
perf_test = np.mean(pred_test == Y_test)
print("The Pipeline [StandardScaler, GridSearchCV(SVC)] correctly classified %3.4f%% of the testing data irises" % (perf_test * 100.0))



The Pipeline [StandardScaler, GridSearchCV(SVC)] correctly classified 64.1844% of the training data irises
The Pipeline [StandardScaler, GridSearchCV(SVC)] correctly classified 35.9155% of the testing data irises


Simple Models is not complex enough

In [8]:
# import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [9]:
class ModelNN(nn.Module):
    def __init__(self):
        super(ModelNN, self).__init__()
        self.l1 = nn.Linear(6, 128)
        self.dropout = nn.Dropout(0.1)
        self.l2 = nn.Linear(128, 1024)
        self.l3 = nn.Linear(1024, 64)
        self.l4 = nn.Linear(64, 6) # 5 classes: from 1-5

    def forward(self, x):
        x = self.l1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.l2(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.l3(x)
        x = F.relu(x)
        x = self.dropout(x)
        output = self.l4(x)
        return output

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
batch_size = 10

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
from torch.utils.data import TensorDataset, DataLoader

training_set = TensorDataset(torch.Tensor(X_train).to(device), torch.tensor(Y_train.values).to(device))
test_set = TensorDataset(torch.Tensor(X_test).to(device), torch.tensor(Y_test.values).to(device))
train_loader = DataLoader(training_set, batch_size = batch_size)
test_loader = DataLoader(test_set, batch_size = batch_size)

In [12]:
model = ModelNN()

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# data.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

num_epochs = 10
for epoch in range(num_epochs):
    for i, (data, labels) in enumerate(train_loader):
        # Forward pass
        outputs = model(data)
        loss = criterion(outputs, labels)
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i+1) % 50 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

print("Training complete!")

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for rows, labels in train_loader:
        outputs = model(rows)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f'Accuracy of the model on the {len(X_train)} training data: {100 * correct / total} %')

Epoch [1/10], Step [50/57], Loss: 1.7454
Epoch [2/10], Step [50/57], Loss: 1.7225
Epoch [3/10], Step [50/57], Loss: 1.7086
Epoch [4/10], Step [50/57], Loss: 1.6841
Epoch [5/10], Step [50/57], Loss: 1.6697
Epoch [6/10], Step [50/57], Loss: 1.6710
Epoch [7/10], Step [50/57], Loss: 1.6598
Epoch [8/10], Step [50/57], Loss: 1.6486
Epoch [9/10], Step [50/57], Loss: 1.6438
Epoch [10/10], Step [50/57], Loss: 1.6296
Training complete!
Accuracy of the model on the 564 training data: 32.09219858156028 %


In [13]:
print(model)

ModelNN(
  (l1): Linear(in_features=6, out_features=128, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (l2): Linear(in_features=128, out_features=1024, bias=True)
  (l3): Linear(in_features=1024, out_features=64, bias=True)
  (l4): Linear(in_features=64, out_features=6, bias=True)
)


In [14]:
import shap
explainer = shap.KernelExplainer(model, X_train)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


Provided model function fails when applied to the provided data set.


TypeError: linear(): argument 'input' (position 1) must be Tensor, not numpy.ndarray