In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import category_encoders as cs
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import roc_auc_score

In [29]:
OccupancyData = pd.read_csv("occupancy_sensor_data.csv",index_col=0)

w_features=OccupancyData.loc[:,["Temperature","Humidity","Light","CO2","HumidityRatio"]].copy()
w_target=OccupancyData.loc[:,"Occupancy"].copy()

w_features.columns

Index([u'Temperature', u'Humidity', u'Light', u'CO2', u'HumidityRatio'], dtype='object')

In [30]:
# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames in this wise manner yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

def feature_process(features):

    w_features_num = features;

    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(list(w_features_num))),
            ('imputer', Imputer(strategy="median")),
            ('std_scaler', StandardScaler(with_mean = False)),
        ])

    full_pipeline = num_pipeline
    return full_pipeline

In [31]:
full_pipeline = feature_process(w_features)

feature_prepared = pd.DataFrame(data=full_pipeline.fit_transform(w_features),index=np.arange(1,len(w_features) + 1))

train_data = feature_prepared[:int(0.8 * len(feature_prepared))]
test_data = feature_prepared[int(0.8 * len(feature_prepared)):]
train_results = w_target[:int(0.8 * len(w_target))]
test_results = w_target[int(0.8 * len(w_target)):]

In [44]:
# Decision Trees Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


params = {'max_leaf_nodes': list(range(2, 5)), 'min_samples_split': [2,3,4,5]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1, verbose=1)
grid_search_cv.fit(train_data, train_results)
print grid_search_cv.best_estimator_

T_predict = grid_search_cv.predict(test_data)
T_predict_prop = grid_search_cv.predict_proba(test_data)

print('_________________###################____________________')
print("The prediction accuracy using the decision tree is : {:.2f}%.".format(100*accuracy_score(test_results, T_predict)))
print('******************************************************* ')
# Area under the ROC score: 1 is perfect prediction
print("The area under the ROC score using the decision tree is : {:.2f}.".format(roc_auc_score(test_results,T_predict_prop[:,1]))) 
print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& ')
fi1 = zip(w_features.columns, grid_search_cv.best_estimator_.feature_importances_)
fi1.sort(key = lambda x:-x[1])
pd.DataFrame(fi1, columns=["Feature","Importance"])


Fitting 3 folds for each of 12 candidates, totalling 36 fits
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=4, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best')
[[0.00752351 0.99247649]
 [0.00752351 0.99247649]
 [0.00752351 0.99247649]
 ...
 [0.99801607 0.00198393]
 [0.99801607 0.00198393]
 [0.99801607 0.00198393]]
_________________###################____________________
The prediction accuracy using the decision tree is : 99.56%.
******************************************************* 
The area under the ROC score using the decision tree is : 1.00.
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& 


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    0.6s finished


Unnamed: 0,Feature,Importance
0,Light,0.995019
1,CO2,0.003514
2,Humidity,0.001467
3,Temperature,0.0
4,HumidityRatio,0.0


In [39]:
#Random Forest

from sklearn.ensemble import RandomForestClassifier

params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
grid_search_cv = GridSearchCV(RandomForestClassifier(random_state=42), params, n_jobs=-1, verbose=1)
grid_search_cv.fit(train_data, train_results)
print grid_search_cv.best_estimator_

T_predict = grid_search_cv.predict(test_data)
T_predict_prop = grid_search_cv.predict_proba(test_data)
print('_________________###################____________________')
print("The prediction accuracy using the Random Forest is : {:.2f}%.".format(100*accuracy_score(test_results, T_predict)))
print('******************************************************* ')
# Area under the ROC score: 1 is perfect prediction
print("The area under the ROC score using the Random Forest is : {:.2f}.".format(roc_auc_score(test_results,T_predict_prop[:,1]))) 
print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& ')
fi1 = zip(w_features.columns, grid_search_cv.best_estimator_.feature_importances_)
fi1.sort(key = lambda x:-x[1])
pd.DataFrame(fi1, columns=["Feature","Importance"])

Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   17.3s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=7,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
_________________###################____________________
The prediction accuracy using the Random Forest is : 99.56%.
******************************************************* 
The area under the ROC score using the Random Forest is : 0.99.
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& 


[Parallel(n_jobs=-1)]: Done 882 out of 882 | elapsed:   44.2s finished


Unnamed: 0,Feature,Importance
0,Light,0.620348
1,Temperature,0.19615
2,CO2,0.166454
3,Humidity,0.011278
4,HumidityRatio,0.00577


In [40]:
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(train_data, train_results)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

T_predict = model.predict(test_data)
T_predict_prop = grid_search_cv.predict_proba(test_data)
print model
print('_________________###################____________________')
print("The prediction accuracy using the Naive Bayes is : {:.2f}%.".format(100*accuracy_score(test_results, T_predict)))
print('******************************************************* ')
# Area under the ROC score: 1 is perfect prediction
print("The area under the ROC score using the Navie Bayes is : {:.2f}.".format(roc_auc_score(test_results,T_predict_prop[:,1]))) 
print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& ')

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
_________________###################____________________
The prediction accuracy using the Naive Bayes is : 97.14%.
******************************************************* 
The area under the ROC score using the Navie Bayes is : 0.99.
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& 


In [41]:
# Perception 
from sklearn.linear_model import Perceptron

model = Perceptron(alpha=1)
model.fit(train_data, train_results)

T_predict = model.predict(test_data)
T_predict_prop = grid_search_cv.predict_proba(test_data)
print model
print('_________________###################____________________')
print("The prediction accuracy using the Perception is : {:.2f}%.".format(100*accuracy_score(test_results, T_predict)))
print('******************************************************* ')
# Area under the ROC score: 1 is perfect prediction
print("The area under the ROC score using the Perception is : {:.2f}.".format(roc_auc_score(test_results,T_predict_prop[:,1]))) 
print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& ')

Perceptron(alpha=1, class_weight=None, eta0=1.0, fit_intercept=True,
      max_iter=None, n_iter=None, n_jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=None, verbose=0, warm_start=False)
_________________###################____________________
The prediction accuracy using the Perception is : 99.65%.
******************************************************* 
The area under the ROC score using the Perception is : 0.99.
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& 


In [42]:
# Nearest Neighbour Classifer

from sklearn.neighbors import KNeighborsClassifier


params = {'leaf_size': list(range(10, 50)), 'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]}
grid_search_cv = GridSearchCV(KNeighborsClassifier(), params, n_jobs=-1, verbose=1)
grid_search_cv.fit(train_data, train_results)
print grid_search_cv.best_estimator_


T_predict = grid_search_cv.predict(test_data)
T_predict_prop = grid_search_cv.predict_proba(test_data)
print('_________________###################____________________')
print("The prediction accuracy using the Nearest Neighbour Classifer is : {:.2f}%.".format(100*accuracy_score(test_results, T_predict)))
print('******************************************************* ')
# Area under the ROC score: 1 is perfect prediction
print("The area under the ROC score using Nearest Neighbour Classifer is : {:.2f}.".format(roc_auc_score(test_results,T_predict_prop[:,1]))) 
print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& ')

Fitting 3 folds for each of 320 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   49.6s
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  1.0min finished


KNeighborsClassifier(algorithm='auto', leaf_size=10, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='uniform')
_________________###################____________________
The prediction accuracy using the Nearest Neighbour Classifer is : 95.30%.
******************************************************* 
The area under the ROC score using Nearest Neighbour Classifer is : 0.98.
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& 


In [43]:
# SVC

from sklearn.svm import SVC
from sklearn.model_selection import KFold

param_grid = [
        {'kernel': ['rbf'], 'C': [ 2**x for x in range(0,6) ]},
    ]
inner_cv = KFold(n_splits=3, shuffle=True, random_state=1)
grid_search = GridSearchCV(SVC(), param_grid, cv=inner_cv,  n_jobs=1, scoring='accuracy',verbose=3)
grid_search.fit(train_data, train_results)
clf=grid_search.best_estimator_


T_predict = grid_search.predict(test_data)
T_predict_prop = grid_search_cv.predict_proba(test_data)
print grid_search
print('_________________###################____________________')
print("The prediction accuracy using the SVC is : {:.2f}%.".format(100*accuracy_score(test_results, T_predict)))
print('******************************************************* ')
print("The area under the ROC score using the SVC is : {:.2f}.".format(roc_auc_score(test_results,T_predict_prop[:,1]))) 
print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& ')

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] kernel=rbf, C=1 .................................................
[CV] ............ kernel=rbf, C=1, score=0.985983355234, total=   0.2s
[CV] kernel=rbf, C=1 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ............ kernel=rbf, C=1, score=0.986199342826, total=   0.2s
[CV] kernel=rbf, C=1 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] ............ kernel=rbf, C=1, score=0.989047097481, total=   0.2s
[CV] kernel=rbf, C=2 .................................................
[CV] ............ kernel=rbf, C=2, score=0.985983355234, total=   0.2s
[CV] kernel=rbf, C=2 .................................................
[CV] ............ kernel=rbf, C=2, score=0.986199342826, total=   0.1s
[CV] kernel=rbf, C=2 .................................................
[CV] ............ kernel=rbf, C=2, score=0.989047097481, total=   0.1s
[CV] kernel=rbf, C=4 .................................................
[CV] ............ kernel=rbf, C=4, score=0.986202365309, total=   0.1s
[CV] kernel=rbf, C=4 .................................................
[CV] ............ kernel=rbf, C=4, score=0.986199342826, total=   0.2s
[CV] kernel=rbf, C=4 .................................................
[CV] ............ kernel=rbf, C=4, score=0.989266155531, total=   0.2s
[CV] kernel=rbf, C=8 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    3.8s finished


GridSearchCV(cv=KFold(n_splits=3, random_state=1, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['rbf'], 'C': [1, 2, 4, 8, 16, 32]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=3)
_________________###################____________________
The prediction accuracy using the SVC is : 95.06%.
******************************************************* 
The area under the ROC score using the SVC is : 0.98.
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& 
