In [1]:
#Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import datetime as dt


Loading dataset

# Data Descripton
Data Set Information:

The experimental testbed for occupancy estimation was deployed in a 6m Ã— 4.6m room. The setup consisted of 7 sensor nodes and one edge node in a star configuration with the sensor nodes transmitting data to the edge every 30s using wireless transceivers. No HVAC systems were in use while the dataset was being collected.

Five different types of non-intrusive sensors were used in this experiment: temperature, light, sound, CO2 and digital passive infrared (PIR). The CO2, sound and PIR sensors needed manual calibration. For the CO2 sensor, zero-point calibration was manually done before its first use by keeping it in a clean environment for over 20 minutes and then pulling the calibration pin (HD pin) low for over 7s. The sound sensor is essentially a microphone with a variable-gain analog amplifier attached to it. Therefore, the output of this sensor is analog which is read by the microcontrollerâ€™s ADC in volts. The potentiometer tied to the gain of the amplifier was adjusted to ensure the highest sensitivity. The PIR sensor has two trimpots: one to tweak the sensitivity and the other to tweak the time for which the output stays high after detecting motion. Both of these were adjusted to the highest values. Sensor nodes S1-S4 consisted of temperature, light and sound sensors, S5 had a CO2 sensor and S6 and S7 had one PIR sensor each that were deployed on the ceiling ledges at an angle that maximized the sensorâ€™s field of view for motion detection.

The data was collected for a period of 4 days in a controlled manner with the occupancy in the room varying between 0 and 3 people. The ground truth of the occupancy count in the room was noted manually.

Please refer to our publications for more details.



Attribute Information:

Date: YYYY/MM/DD
Time: HH:MM:SS
Temperature: In degree Celsius
Light: In Lux
Sound: In Volts (amplifier output read by ADC)
CO2: In PPM
CO2 Slope: Slope of CO2 values taken in a sliding window
PIR: Binary value conveying motion detection
Room_Occupancy_Count: Ground Truth

In [2]:
# Data is taken from https://archive.ics.uci.edu/ml/datasets/Room+Occupancy+Estimation
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00640/Occupancy_Estimation.csv')

In [3]:
data.columns = data.columns.str.lower()

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10129 entries, 0 to 10128
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   date                  10129 non-null  object 
 1   time                  10129 non-null  object 
 2   s1_temp               10129 non-null  float64
 3   s2_temp               10129 non-null  float64
 4   s3_temp               10129 non-null  float64
 5   s4_temp               10129 non-null  float64
 6   s1_light              10129 non-null  int64  
 7   s2_light              10129 non-null  int64  
 8   s3_light              10129 non-null  int64  
 9   s4_light              10129 non-null  int64  
 10  s1_sound              10129 non-null  float64
 11  s2_sound              10129 non-null  float64
 12  s3_sound              10129 non-null  float64
 13  s4_sound              10129 non-null  float64
 14  s5_co2                10129 non-null  int64  
 15  s5_co2_slope       

#### Converting Date and Time column format

In [5]:
data.date = data.date.astype('datetime64')
data.time = data.time.astype('datetime64')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10129 entries, 0 to 10128
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   date                  10129 non-null  datetime64[ns]
 1   time                  10129 non-null  datetime64[ns]
 2   s1_temp               10129 non-null  float64       
 3   s2_temp               10129 non-null  float64       
 4   s3_temp               10129 non-null  float64       
 5   s4_temp               10129 non-null  float64       
 6   s1_light              10129 non-null  int64         
 7   s2_light              10129 non-null  int64         
 8   s3_light              10129 non-null  int64         
 9   s4_light              10129 non-null  int64         
 10  s1_sound              10129 non-null  float64       
 11  s2_sound              10129 non-null  float64       
 12  s3_sound              10129 non-null  float64       
 13  s4_sound        

#### Adding Hour and Minute as features

In [7]:
data.insert(loc=2, column='hour', value=data.time.dt.hour)

In [8]:
data.insert(loc=3, column='minute', value=data.time.dt.minute)

In [9]:
data

Unnamed: 0,date,time,hour,minute,s1_temp,s2_temp,s3_temp,s4_temp,s1_light,s2_light,...,s4_light,s1_sound,s2_sound,s3_sound,s4_sound,s5_co2,s5_co2_slope,s6_pir,s7_pir,room_occupancy_count
0,2017-12-22,2023-01-27 10:49:41,10,49,24.94,24.75,24.56,25.38,121,34,...,40,0.08,0.19,0.06,0.06,390,0.769231,0,0,1
1,2017-12-22,2023-01-27 10:50:12,10,50,24.94,24.75,24.56,25.44,121,33,...,40,0.93,0.05,0.06,0.06,390,0.646154,0,0,1
2,2017-12-22,2023-01-27 10:50:42,10,50,25.00,24.75,24.50,25.44,121,34,...,40,0.43,0.11,0.08,0.06,390,0.519231,0,0,1
3,2017-12-22,2023-01-27 10:51:13,10,51,25.00,24.75,24.56,25.44,121,34,...,40,0.41,0.10,0.10,0.09,390,0.388462,0,0,1
4,2017-12-22,2023-01-27 10:51:44,10,51,25.00,24.75,24.56,25.44,121,34,...,40,0.18,0.06,0.06,0.06,390,0.253846,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10124,2018-01-11,2023-01-27 08:58:07,8,58,25.06,25.13,24.69,25.31,6,7,...,22,0.09,0.04,0.06,0.08,345,0.000000,0,0,0
10125,2018-01-11,2023-01-27 08:58:37,8,58,25.06,25.06,24.69,25.25,6,7,...,22,0.07,0.05,0.05,0.08,345,0.000000,0,0,0
10126,2018-01-11,2023-01-27 08:59:08,8,59,25.13,25.06,24.69,25.25,6,7,...,22,0.11,0.05,0.06,0.08,345,0.000000,0,0,0
10127,2018-01-11,2023-01-27 08:59:39,8,59,25.13,25.06,24.69,25.25,6,7,...,22,0.08,0.08,0.10,0.08,345,0.000000,0,0,0


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [11]:
#X = data.iloc[:,2:-1]
X = data[]


y = data.iloc[:,-1]

In [12]:
X.shape

(10129, 18)

In [13]:
y.shape

(10129,)

In [14]:
X

Unnamed: 0,hour,minute,s1_temp,s2_temp,s3_temp,s4_temp,s1_light,s2_light,s3_light,s4_light,s1_sound,s2_sound,s3_sound,s4_sound,s5_co2,s5_co2_slope,s6_pir,s7_pir
0,10,49,24.94,24.75,24.56,25.38,121,34,53,40,0.08,0.19,0.06,0.06,390,0.769231,0,0
1,10,50,24.94,24.75,24.56,25.44,121,33,53,40,0.93,0.05,0.06,0.06,390,0.646154,0,0
2,10,50,25.00,24.75,24.50,25.44,121,34,53,40,0.43,0.11,0.08,0.06,390,0.519231,0,0
3,10,51,25.00,24.75,24.56,25.44,121,34,53,40,0.41,0.10,0.10,0.09,390,0.388462,0,0
4,10,51,25.00,24.75,24.56,25.44,121,34,54,40,0.18,0.06,0.06,0.06,390,0.253846,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10124,8,58,25.06,25.13,24.69,25.31,6,7,33,22,0.09,0.04,0.06,0.08,345,0.000000,0,0
10125,8,58,25.06,25.06,24.69,25.25,6,7,34,22,0.07,0.05,0.05,0.08,345,0.000000,0,0
10126,8,59,25.13,25.06,24.69,25.25,6,7,34,22,0.11,0.05,0.06,0.08,345,0.000000,0,0
10127,8,59,25.13,25.06,24.69,25.25,6,7,34,22,0.08,0.08,0.10,0.08,345,0.000000,0,0


In [15]:
y

0        1
1        1
2        1
3        1
4        1
        ..
10124    0
10125    0
10126    0
10127    0
10128    0
Name: room_occupancy_count, Length: 10129, dtype: int64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [17]:
y_train

8983    0
1142    0
3680    2
9889    0
8334    2
       ..
9225    0
4859    0
3264    1
9845    0
2732    0
Name: room_occupancy_count, Length: 7090, dtype: int64

In [18]:
y_test

8549    0
6003    0
3449    2
1666    0
8833    0
       ..
4347    0
785     3
7709    0
8645    0
8193    3
Name: room_occupancy_count, Length: 3039, dtype: int64

In [19]:
X_train

Unnamed: 0,hour,minute,s1_temp,s2_temp,s3_temp,s4_temp,s1_light,s2_light,s3_light,s4_light,s1_sound,s2_sound,s3_sound,s4_sound,s5_co2,s5_co2_slope,s6_pir,s7_pir
8983,23,9,25.25,25.31,24.94,25.44,0,0,0,0,0.07,0.04,0.06,0.09,345,0.000000,0,0
1142,21,15,25.69,25.75,25.38,25.94,0,0,0,0,0.06,0.06,0.07,0.07,675,-2.153846,0,0
3680,19,10,26.25,26.19,26.00,26.38,114,19,180,7,0.08,0.05,0.10,0.07,1040,-0.446154,0,0
9889,6,56,25.06,25.06,24.56,25.25,0,0,1,0,0.08,0.04,0.05,0.09,345,0.000000,0,0
8334,17,34,25.94,26.19,25.88,25.94,146,231,170,19,0.48,0.60,0.71,0.17,835,1.419231,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9225,1,14,25.19,25.25,24.75,25.38,0,0,0,0,0.07,0.05,0.06,0.08,345,0.000000,0,0
4859,5,20,25.19,25.19,24.63,25.56,0,0,0,0,0.07,0.06,0.06,0.06,360,0.000000,0,0
3264,15,33,25.88,26.81,25.50,26.25,120,26,48,35,0.22,0.08,0.07,0.06,420,-1.623077,0,0
9845,6,34,25.06,25.06,24.56,25.19,0,0,0,0,0.08,0.05,0.05,0.08,345,0.000000,0,0


In [20]:
X_test

Unnamed: 0,hour,minute,s1_temp,s2_temp,s3_temp,s4_temp,s1_light,s2_light,s3_light,s4_light,s1_sound,s2_sound,s3_sound,s4_sound,s5_co2,s5_co2_slope,s6_pir,s7_pir
8549,19,25,25.56,25.63,25.25,25.69,0,0,0,0,0.07,0.06,0.06,0.09,535,-2.530769,0,0
6003,15,13,25.44,25.44,25.38,26.19,12,15,62,45,0.08,0.05,0.06,0.09,355,0.000000,0,0
3449,17,10,26.19,25.88,25.69,26.31,117,23,189,12,0.58,0.41,1.56,0.50,680,2.757692,0,1
1666,1,46,25.31,25.31,24.81,25.69,0,0,0,0,0.07,0.05,0.07,0.06,370,0.000000,0,0
8833,21,52,25.38,25.38,25.00,25.50,0,0,0,0,0.08,0.05,0.05,0.08,360,-0.253846,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4347,0,56,25.44,25.44,24.94,25.69,0,0,0,0,0.06,0.04,0.06,0.06,365,-0.165385,0,0
785,18,4,26.31,26.31,25.94,26.38,148,234,178,10,0.61,0.19,0.16,0.11,970,4.173077,1,0
7709,5,56,25.19,25.19,24.69,25.50,0,0,0,0,0.07,0.05,0.06,0.09,355,0.000000,0,0
8645,20,15,25.50,25.50,25.13,25.63,0,0,0,0,0.08,0.05,0.06,0.09,390,-1.707692,0,0


In [21]:
X_train.shape

(7090, 18)

In [22]:
X_test.shape

(3039, 18)

# 1- Logostic Regression

In [23]:
#Regularization
# Dummies
# scaler
classifier_lreg = Pipeline([('scalre', StandardScaler())   #First: StandardScaler
                       #,('',),                         #Second:dummies
                       ,('lreg', LogisticRegression(max_iter=10000))])

In [24]:
from sklearn.model_selection import cross_val_score, cross_validate

### Cross Validation on Logistic regression

In [27]:
accuracies = cross_val_score(classifier_lreg, X = X_train, y=y_train, cv=5)

In [30]:
accuracies

array([0.98589563, 0.99365303, 0.99788434, 0.99294781, 0.99365303])

In [29]:
accuracies.mean()

0.9928067700987306

#### Now we are sure that our accuracy was not based on chance, therfore we trian the model again but this on whole train data.

In [None]:
classifier_lreg.fit(X_train,y_train)

In [None]:
y_lreg_pred = classifier_lreg.predict(X_test)

In [None]:
# Confusion Matrix and accuracy
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_lreg_pred)
print('Confusion Matrix => \n',cm,'\n')
print('Accuracy Score =>',accuracy_score(y_test, y_lreg_pred),'\n')

# Precision and recall
from sklearn.metrics import classification_report
print('Classification Report =>\n',classification_report(y_test,y_lreg_pred)) 

# 2- KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [33]:
#Regularization
# Dummies
# scaler
from sklearn.neighbors import KNeighborsClassifier
classifier_knn = Pipeline([('scalre', StandardScaler())   #First: StandardScaler
                       #,('',),                         #Second:dummies
                       ,('knn', KNeighborsClassifier(n_neighbors=5))])

### Cross Validation on Logistic KNN

In [35]:
accuracies = cross_val_score(classifier_knn, X = X_train, y=y_train, cv=5)
print(accuracies)
accuracies.mean()

[0.98660085 0.99083216 0.99576869 0.99153738 0.99083216]


0.9911142454160788

#### Now we are sure that our accuracy was not based on chance, therfore we trian the model again but this on whole train data.

In [37]:
classifier_knn.fit(X_train,y_train)

y_knn_pred = classifier_knn.predict(X_test)

# Confusion Matrix and accuracy
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_knn_pred)
print('Confusion Matrix => \n',cm,'\n')
print('Accuracy Score =>',accuracy_score(y_test, y_knn_pred),'\n')

# Precision and recall
from sklearn.metrics import classification_report
print('Classification Report =>\n',classification_report(y_test,y_knn_pred)) 

Confusion Matrix => 
 [[2466    4    1    1]
 [   1  123    5    2]
 [   0    1  230    8]
 [   0    1    6  190]] 

Accuracy Score => 0.9901283316880553 

Classification Report =>
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2472
           1       0.95      0.94      0.95       131
           2       0.95      0.96      0.96       239
           3       0.95      0.96      0.95       197

    accuracy                           0.99      3039
   macro avg       0.96      0.97      0.96      3039
weighted avg       0.99      0.99      0.99      3039



In [None]:
# # Template:
# # 2- KNN

# from sklearn.neighbors import KNeighborsClassifier

# #Regularization
# # Dummies
# # scaler
# from sklearn.neighbors import KNeighborsClassifier
# classifier_knn = Pipeline([('scalre', StandardScaler())   #First: StandardScaler
#                        #,('',),                         #Second:dummies
#                        ,('knn', KNeighborsClassifier(n_neighbors=5))])

# ### Cross Validation on Logistic KNN

# accuracies = cross_val_score(classifier_knn, X = X_train, y=y_train, cv=5)
# print(accuracies)
# accuracies.mean()

# #### Now we are sure that our accuracy was not based on chance, therfore we trian the model again but this on whole train data.

# classifier_knn.fit(X_train,y_train)

# y_knn_pred = classifier_knn.predict(X_test)

# # Confusion Matrix and accuracy
# from sklearn.metrics import confusion_matrix, accuracy_score
# cm = confusion_matrix(y_test, y_knn_pred)
# print('Confusion Matrix => \n',cm,'\n')
# print('Accuracy Score =>',accuracy_score(y_test, y_knn_pred),'\n')

# # Precision and recall
# from sklearn.metrics import classification_report
# print('Classification Report =>\n',classification_report(y_test,y_knn_pred)) 

