# Occupancy Detection

Create a classification model to determine if a room is occupied or unoccupied based on environmental data. 

In-class demo on May 5, 2018

In [1]:
%matplotlib notebook

import os
import csv
import pickle
import numpy as np
import pandas as pd

from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer 
from sklearn.base import BaseEstimator, TransformerMixin 

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier 

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split as tts 

## Data Loading

Load data in two ways: "raw" form as dictionaries to use with the `DictVectorizer` and as a Pandas DataFrame for data exploration.

In [2]:
# Default location of the dataset and the timestamp layout used in its
# 'datetime' column.
DATA = os.path.join("data", "occupancy.csv")
DTFMT = '%Y-%m-%d %H:%M:%S'

def load_raw(path=DATA):
    """
    Lazily read the occupancy CSV and yield one (features, target) tuple per row.

    Parameters
    ----------
    path : str
        Location of a CSV file with a header row. It must contain the columns
        'occupancy', 'datetime', 'temperature', 'relative humidity', 'C02',
        'humidity' and 'light'. Any other columns are passed through untouched
        as strings.

    Yields
    ------
    (row, target) : tuple
        row is the dictionary produced by csv.DictReader with the 'occupancy'
        key removed, the five sensor fields converted to float and 'datetime'
        parsed into a datetime object. target is the raw 'occupancy' string.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    ValueError
        If a sensor value is not a valid float or the timestamp does not
        match DTFMT.
    """
    # The csv module expects files opened with newline='' so that it can do
    # its own newline handling; otherwise line breaks embedded in quoted
    # fields are silently rewritten by universal-newline translation.
    with open(path, 'r', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Pop target off of features dictionary
            target = row.pop('occupancy')

            # Convert fields to floats
            for field in ('temperature', 'relative humidity', 'C02', 'humidity', 'light'):
                row[field] = float(row[field])

            # Parse datetime
            row['datetime'] = datetime.strptime(row['datetime'], DTFMT)

            yield row, target
            
            
def load_df(path=DATA):
    """
    Read the CSV file at ``path`` with pandas and return the resulting
    DataFrame. No parsing options are supplied, so pandas defaults apply.
    """
    frame = pd.read_csv(path)
    return frame

In [3]:
# Load the dataset from the default DATA path into a DataFrame for exploration
df = load_df()

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column
df.describe()

Unnamed: 0,temperature,relative humidity,light,C02,humidity
count,20560.0,20560.0,20560.0,20560.0,20560.0
mean,20.906212,27.655925,130.756622,690.553276,0.004228
std,1.055315,4.982154,210.430875,311.201281,0.000768
min,19.0,16.745,0.0,412.75,0.002674
25%,20.2,24.5,0.0,460.0,0.003719
50%,20.7,27.29,0.0,565.416667,0.004292
75%,21.525,31.29,301.0,804.666667,0.004832
max,24.408333,39.5,1697.25,2076.5,0.006476


## Transformation

1. Convert datetime into hour of day and day of week (numeric)
2. Label Encode our Class 
3. Transform dictionaries into numpy array

In [5]:
class DateEncode(BaseEstimator, TransformerMixin):
    """
    Custom transformers extend sklearn.base.BaseEstimator and TransformerMixin
    to add helper methods like fit_transform(). It is up to you to add the
    following methods:

        1. fit(X, y=None)
        2. transform(X)

    This transformer encodes the datetime into hour of day and day of week features.
    """

    def fit(self, X, y=None):
        """
        Expects X to be an iterable of dictionaries.

        Loops through all dictionaries to find all unique dictionary keys
        whose values are datetimes, in order to "learn" what fields to
        encode date time as.

        For this data, this will only be the "datetime" field, but this
        method is added here as an example of fitting to data.

        y is accepted for API compatibility and ignored.
        """
        # NOTE: properties suffixed with an underscore are internal
        # attributes that are learned during fit
        self.date_columns_ = {
            key
            for Xi in X
            for key, val in Xi.items()
            if isinstance(val, datetime)
        }

        # NOTE: fit must always return self
        return self

    def transform(self, X):
        """
        Expects X to be an iterable of dictionaries.

        For every datetime field discovered during fit, removes the field
        and replaces it with the following features:

            1. field_hour : the hour of day
            2. field_dow : the day of the week

        The input dictionaries are NOT modified: each row is copied before
        it is edited, so the same X can be transformed repeatedly (for
        example once directly and again inside cross validation).

        Returns a new list of new dictionaries.

        Raises KeyError if a row is missing one of the fitted date columns.
        """
        Xprime = []
        for Xi in X:
            # Work on a shallow copy so the caller's data is left intact;
            # popping from the original would make a second call fail.
            encoded = dict(Xi)
            for col in self.date_columns_:
                dt = encoded.pop(col)
                encoded[col + "_hour"] = dt.hour
                encoded[col + "_dow"] = dt.weekday()
            Xprime.append(encoded)
        return Xprime

In [6]:
# Load Raw Data - load_raw() yields (features, target) tuples, collected here
# into a list, then split into the feature dictionaries X and the targets y
data = list(load_raw())
X = [row_features for row_features, _row_target in data]
y = [row_target for _row_features, row_target in data]

# Feature extraction pipeline: encode datetimes first, then vectorize the
# dictionaries
features = Pipeline(steps=[
    ('date_encode', DateEncode()),
    ('vec', DictVectorizer()),
])

# Fit and transform the features, which should now be a 2D array
Xp = features.fit_transform(X)

In [7]:
# Label Encode the target, which should now be a 1D vector.
# The fitted encoder is kept so encoded labels can be mapped back to names.
label_encoder = LabelEncoder().fit(y)
yp = label_encoder.transform(y)

In [8]:
# Example of getting the class name back from the encoder.
# Per the output below, code 0 decodes to 'occupied' and code 1 to 'unoccupied'.
label_encoder.inverse_transform([0,1,1,0,0])

  if diff:


array(['occupied', 'unoccupied', 'unoccupied', 'occupied', 'occupied'],
      dtype='<U10')

In [9]:
# Sanity check: X and y should agree on the number of rows
shape_report = "X shape is {} y shape is {}".format(Xp.shape, yp.shape)
print(shape_report)

X shape is (20560, 7) y shape is (20560,)


## Fit a Classifier

In [10]:
from yellowbrick.classifier import ClassBalance, ConfusionMatrix, ClassificationReport
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [11]:
def simple_evaluate_model(model, X=None, y=None, encoder=label_encoder, random_state=None):
    """
    Fit ``model`` on a shuffled 80% split and print the weighted F1 score
    obtained on the remaining 20%.

    Parameters
    ----------
    model : estimator
        Object exposing fit(X, y) and predict(X). It is fitted in place.
    X : array-like, optional
        Feature matrix. When omitted, the notebook-level ``Xp`` is converted
        to a dense ndarray at call time (not at definition time, so re-running
        the feature cell is picked up). An ``np.matrix`` argument is converted
        to a plain ndarray because scikit-learn no longer accepts np.matrix.
    y : array-like, optional
        Target vector. When omitted, the notebook-level ``yp`` is used.
    encoder : LabelEncoder
        Currently unused; kept so existing calls keep working.
    random_state : int or None
        Seed forwarded to the train/test split. The default (None) keeps the
        original behaviour of a different random split on every call; pass an
        integer to make the reported score reproducible.

    Returns
    -------
    None. The score is printed as ``f1: <value>``.
    """
    if X is None:
        X = Xp.toarray()
    elif isinstance(X, np.matrix):
        X = np.asarray(X)
    if y is None:
        y = yp

    X_train, X_test, y_train, y_test = tts(
        X, y, train_size=0.80, shuffle=True, random_state=random_state
    )
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    print("f1: {}".format(f1_score(y_test, y_hat, average='weighted')))

In [12]:
# Simple Evaluation: a single 80/20 split of the default (already vectorized)
# features with a gradient boosting classifier using its default settings
clf = GradientBoostingClassifier()
simple_evaluate_model(clf)



f1: 0.9908079074134394


In [13]:
# Complete Evaluation
# The pipeline bundles date encoding, vectorization and the classifier, so
# cross validation is given the feature dictionaries X and string labels y.
model = Pipeline(steps=[
    ('date_encode', DateEncode()),
    ('vec', DictVectorizer()),
    ('clf', GradientBoostingClassifier()),
])

# Macro-averaged F1 for each of the 12 folds, then the average across folds
cv_scores = cross_val_score(model, X, y, cv=12, scoring='f1_macro')
cv_scores.mean()

0.9332792814271315

In [14]:
# Simpler Model
# Simple Evaluation with only 5 boosting stages.
# toarray() yields a plain ndarray; todense() would return an np.matrix,
# which scikit-learn no longer accepts as input.
clf = GradientBoostingClassifier(n_estimators=5)
simple_evaluate_model(clf, Xp.toarray(), yp)

f1: 0.9876606195595065




In [15]:
# Mean macro F1 over 12 folds; toarray() avoids the unsupported np.matrix type
cross_val_score(clf, Xp.toarray(), yp, cv=12, scoring='f1_macro').mean()

0.9811118919383479

In [16]:
# Logistic regression on a single 80/20 split.
# toarray() yields a plain ndarray instead of the np.matrix from todense().
clf = LogisticRegression()
simple_evaluate_model(clf, Xp.toarray(), yp)

f1: 0.9903461571949985




In [17]:
# Mean macro F1 over 12 folds; toarray() avoids the unsupported np.matrix type
cross_val_score(clf, Xp.toarray(), yp, cv=12, scoring='f1_macro').mean()

0.9842681993775103

In [18]:
# Gaussian naive Bayes on a single 80/20 split.
# toarray() yields a plain ndarray instead of the np.matrix from todense().
clf = GaussianNB()
simple_evaluate_model(clf, Xp.toarray(), yp)

f1: 0.9684081908555003




In [19]:
# Mean macro F1 over 12 folds; toarray() avoids the unsupported np.matrix type
cross_val_score(clf, Xp.toarray(), yp, cv=12, scoring='f1_macro').mean()

0.9262619405255568

## Model Management

In [21]:
def internal_params(estimator):
    """
    Generate the names reported by dir(estimator) that end with an
    underscore but do not begin with one, in the order dir() lists them.
    """
    yield from (
        name
        for name in dir(estimator)
        if name.endswith("_") and not name.startswith("_")
    )

In [22]:
def save_model(model, path=None):
    """
    Pickle ``model`` to disk.

    When ``path`` is None the file is named after the model's class with a
    ".pkl" suffix and written relative to the current working directory;
    otherwise ``path`` is used as given. An existing file is overwritten.
    """
    destination = path if path is not None else model.__class__.__name__ + ".pkl"
    with open(destination, 'wb') as handle:
        pickle.dump(model, handle)

In [24]:
# Show the trailing-underscore attributes exposed by the current clf
list(internal_params(clf))
# Uncomment to pickle clf to "<ClassName>.pkl" in the working directory:
#save_model(clf)

['class_count_', 'class_prior_', 'classes_', 'sigma_', 'theta_']