In [None]:
!pip install sklearn

In [75]:
from sklearn.model_selection import train_test_split
from datetime import datetime
from keras.layers import Dense
from keras.models import Sequential
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from keras.utils.np_utils import to_categorical  

In [98]:
def getXY(filename):
    training_data_filename = "occupancy_data/datatraining.csv"
    data = pd.read_csv(training_data_filename)
    data = data.drop(labels=['id'], axis=1)
    data['date'] = [datetime.strptime(d, "%Y-%m-%d %H:%M:%S").timestamp() for d in data['date']] #change to date to seconds
    X = data.drop(labels=["Occupancy"], axis=1)
    Y = data['Occupancy']
    Y = to_categorical(Y,num_classes=2) #convert labels into one hot encoded arrays
    #normalize the dataset 
    X['date'] = tf.keras.utils.normalize(np.array(X['date']))[0]
    X['Temperature'] = tf.keras.utils.normalize(np.array(X['Temperature']))[0]
    X['Humidity'] = tf.keras.utils.normalize(np.array(X['Humidity']))[0]
    X['Light'] = tf.keras.utils.normalize(np.array(X['Light']))[0]
    X['CO2'] = tf.keras.utils.normalize(np.array(X['CO2']))[0]
    X['HumidityRatio'] = tf.keras.utils.normalize(np.array(X['HumidityRatio']))[0]
    return X, Y

training_data_filename = "occupancy_data/datatraining.csv"
X, Y = getXY(training_data_filename)

In [100]:
#split the dataset into training and validation
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.05, random_state=0)

In [101]:
x_train.shape, x_val.shape, y_train.shape, y_val.shape #training and validation data shapes

((7735, 6), (408, 6), (7735, 2), (408, 2))

In [102]:
x_train, y_train #training data final output

(          date  Temperature  Humidity     Light       CO2  HumidityRatio
 4355  0.011082     0.010892  0.009558  0.000000  0.007089       0.009346
 7447  0.011083     0.011004  0.014084  0.000000  0.010341       0.013988
 487   0.011080     0.011300  0.010547  0.000000  0.007097       0.010817
 771   0.011080     0.011187  0.009661  0.000000  0.007284       0.009775
 779   0.011080     0.011136  0.009617  0.000000  0.007365       0.009673
 ...        ...          ...       ...       ...       ...            ...
 4931  0.011082     0.010409  0.013137  0.000000  0.007024       0.012167
 3264  0.011081     0.010736  0.007893  0.000000  0.007056       0.007572
 1653  0.011081     0.011214  0.008421  0.000000  0.007619       0.008540
 2607  0.011081     0.011428  0.008295  0.021339  0.010706       0.008622
 2732  0.011081     0.011863  0.009196  0.024201  0.014827       0.010052
 
 [7735 rows x 6 columns],
 array([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],

In [103]:
#creating the model
model = Sequential()
#hidden layer1
model.add(Dense(128, activation='relu', input_dim=6))
#output layer
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [104]:
model.summary() #output the model structure

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 128)               896       
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 258       
Total params: 1,154
Trainable params: 1,154
Non-trainable params: 0
_________________________________________________________________


In [105]:
model.fit(x_train, y_train, epochs=10) #train the model

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x17cc6348e20>

In [107]:
model.evaluate(x_val, y_val) #output how the model is performing with the validation dataset



[0.09265478700399399, 0.9852941036224365]

In [108]:
test1_filename = "occupancy_data/datatest.csv"
test2_filename = "occupancy_data/datatest2.csv"

In [109]:
X_test1, Y_test1 = getXY(test1_filename)

In [111]:
#testing the model performance with datatest.csv
model.evaluate(X_test1, Y_test1)



[0.07661452889442444, 0.988333523273468]

In [112]:
X_test2, Y_test2 = getXY(test2_filename)

In [114]:
#testing the model performance with datatest2.csv
model.evaluate(X_test2, Y_test2)



[0.07661452889442444, 0.988333523273468]

In [None]:
#I got same level of accuracy with my keras model and my Neural Network model. I was experiencing overfitting initially
#but tweaking the parameters got rid of that problem. I think maybe the dataset is not too big or that complex to lear so my Neural
#Network model didn't have any trouble fitting to it. But I can definitely see that this couldn't work that well with large datasets
#where I would have multiple classes to clasify and numerous parameters to consider. I think the main problem that could restrict or 
#constrain my neural network model is of overfitting due to the lack of any dropout layers, and also the abscense of other 
#kinds of optimizers and error functions