In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, LeaveOneOut, KFold
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score

import pandas as pd
import numpy as np

In [35]:
df = pd.read_csv("E:\\research\\Spacematch\\pilot study\\feedback_form20210903.csv") #import the 34 data-points

In [36]:
df.head() 

Unnamed: 0,num,Time,Clothing_Level,indoor_temperature,indoor_humidity,weather,outdoor_temperature,outdoor_humidity,Thermal_comfort
0,1,6:12:00 PM,1,70.47,46.77,sunny,78,45,2
1,2,6:16:00 PM,1,70.49,46.78,sunny,78,45,3
2,3,6:18:00 PM,1,70.6,46.65,sunny,78,45,3
3,4,6:19:00 PM,1,70.64,46.6,sunny,78,45,3
4,5,6:21:00 PM,1,70.71,46.61,sunny,78,45,3


In [37]:
y = df.Thermal_comfort
X = df.drop(["num","Thermal_comfort", "Time", "weather"], axis = 1)  #Get X and y

In [38]:
# "min_samples_leaf":range(1,10,1), 
#"min_samples_split":range(2,10,1),   
#"criterion"        :["gini", "entropy"]

In [22]:
param_test = {"n_estimators"     :range(1,50,1),
              "max_features"     :range(1,6,1), 
              "max_depth"        :range(1,6,1),
              } # Optimize the parameters
gsearch = GridSearchCV(estimator = RandomForestClassifier(random_state = 10, class_weight = "balanced", 
                    min_samples_split = 2, min_samples_leaf = 1), 
            param_grid = param_test, cv = 2)
gsearch.fit(X, y)

GridSearchCV(cv=2,
             estimator=RandomForestClassifier(class_weight='balanced',
                                              random_state=10),
             param_grid={'max_depth': range(1, 6), 'max_features': range(1, 6),
                         'n_estimators': range(1, 50)})

In [39]:
estimator     = gsearch.best_params_["n_estimators"]
feature       = gsearch.best_params_["max_features"]
depth         = gsearch.best_params_["max_depth"]
para = (estimator, feature, depth, gsearch.best_score_)
print(para)

(6, 3, 5, 0.7647058823529411)


In [46]:
loo = LeaveOneOut() # test the accuracy with leaveoneout
y_true = []
y_pred = []
for train_index, test_index in loo.split(X): # Split in X
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    rfc = RandomForestClassifier(n_estimators = estimator, 
                                max_features = feature, 
                                min_samples_split = 2, 
                                max_depth = depth,
                                min_samples_leaf = 1,
                                criterion = "gini",
                                random_state = 10)
    rfc = rfc.fit(X_train, y_train) # train model
    predict_y = rfc.predict(X_test)
    y_true.append(y_test[0])
    y_pred.append(predict_y[0])
print(y_true)
print(y_pred)
accuracy = accuracy_score(y_true, y_pred)
print("accuracy is:", accuracy)

[2, 3, 3, 3, 3, 2, 2, 4, 4, 4, 3, 3, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 2, 3, 3, 2, 3, 3, 3, 4, 5, 5, 5, 5]
[3, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 5, 5, 5, 5, 5]
accuracy is: 0.5


In [48]:
kf = KFold(n_splits = 10, random_state = 10, shuffle = True) # test the accuracy with Kfold
accuracy = []
for train_index, test_index in kf.split(X): # Split in X
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    rfc = RandomForestClassifier(n_estimators = estimator, 
                                max_features = feature, 
                                min_samples_split = 2, 
                                max_depth = depth,
                                min_samples_leaf = 1,
                                criterion = "gini",
                                random_state = 10)
    rfc = rfc.fit(X_train, y_train)
    predict_y = rfc.predict(X_test)
    accuracy.append(accuracy_score(y_test, predict_y))
print(np.mean(accuracy))

0.575


In [49]:
# import the realtime room data
rt_data = pd.read_csv("E:\\research\\Spacematch\\pilot study\\real_time_room_data.csv")
rt_data = rt_data.drop(["num", "Time", "weather"], axis = 1)
rt_data.head()

Unnamed: 0,Clothing_Level,indoor_temperature,indoor_humidity,outdoor_temperature,outdoor_humidity
0,1,69.03,49.55,88,55
1,1,70.08,48.02,88,55
2,1,73.71,46.4,88,55
3,1,76.6,43.16,88,55


In [52]:
rfc = RandomForestClassifier(n_estimators = estimator,  #predict the probability of different feels in different rooms
                                max_features = feature, 
                                min_samples_split = 2, 
                                max_depth = depth,
                                min_samples_leaf = 1,
                                criterion = "gini",
                                random_state = 10)
rfc = rfc.fit(X, y)
rt_prepro = rfc.predict_proba(rt_data)
print("probability : \n", rt_prepro)
print("feature importance:", rfc.feature_importances_)
print("feature name:", rt_data.columns.values.tolist())

room1_comfort1 = rt_prepro[0][0] + rt_prepro[0][4]
room1_comfort2 = rt_prepro[0][1] + rt_prepro[0][3] 
room1_comfort3 = rt_prepro[0][2]

room2_comfort1 = rt_prepro[1][0] + rt_prepro[1][4]
room2_comfort2 = rt_prepro[1][1] + rt_prepro[1][3]
room2_comfort3 = rt_prepro[1][2]

room3_comfort1 = rt_prepro[2][0] + rt_prepro[2][4]
room3_comfort2 = rt_prepro[2][1] + rt_prepro[2][3]
room3_comfort3 = rt_prepro[2][2] 

room4_comfort1 = rt_prepro[3][0] + rt_prepro[3][4]
room4_comfort2 = rt_prepro[3][1] + rt_prepro[3][3]
room4_comfort3 = rt_prepro[3][2]

print("the probability of not comfort in ROOM 1:", room1_comfort1)
print("the probability of a bit comfort in ROOM 1:", room1_comfort2)
print("the probability of very comfort in ROOM 1:", room1_comfort3)
print("the probability of not comfort in ROOM 2:", room2_comfort1)
print("the probability of a bit comfort in ROOM 2:", room2_comfort2)
print("the probability of very comfort in ROOM 2:", room2_comfort3)
print("the probability of not comfort in ROOM 3:", room3_comfort1)
print("the probability of a bit comfort in ROOM 3:", room3_comfort2)
print("the probability of very comfort in ROOM 3:", room3_comfort3)
print("the probability of not comfort in ROOM 4:", room4_comfort1)
print("the probability of a bit comfort in ROOM 4:", room4_comfort2)
print("the probability of very comfort in ROOM 4:", room4_comfort3)

probability : 
 [[0.77777778 0.05555556 0.         0.16666667 0.        ]
 [0.         0.66666667 0.         0.16666667 0.16666667]
 [0.         0.         0.         0.16666667 0.83333333]
 [0.         0.         0.         0.16666667 0.83333333]]
feature importance: [0.         0.69069871 0.16222597 0.         0.14707532]
feature name: ['Clothing_Level', 'indoor_temperature', 'indoor_humidity', 'outdoor_temperature', 'outdoor_humidity']
the probability of not comfort in ROOM 1: 0.7777777777777777
the probability of a bit comfort in ROOM 1: 0.2222222222222222
the probability of very comfort in ROOM 1: 0.0
the probability of not comfort in ROOM 2: 0.16666666666666666
the probability of a bit comfort in ROOM 2: 0.8333333333333333
the probability of very comfort in ROOM 2: 0.0
the probability of not comfort in ROOM 3: 0.8333333333333334
the probability of a bit comfort in ROOM 3: 0.16666666666666666
the probability of very comfort in ROOM 3: 0.0
the probability of not comfort in ROOM 4: 