In [666]:
import pandas as pd

In [667]:
df = pd.read_csv("automation-protocol.csv")

In [668]:
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,id,instanceid,type,lastseen,onoff,color,dimmer,protocoltime
0,270,65537,2,1605867202,1,efd275,51,2020-11-20T21:58:04.000Z
1,271,65538,2,1605868423,1,efd275,48,2020-11-20T21:58:04.000Z
2,272,65540,2,1605882795,0,efd275,100,2020-11-20T21:58:04.000Z
3,273,65545,2,1605831076,0,efd275,50,2020-11-20T21:58:04.000Z
4,274,65546,2,1605892658,0,f1e0b5,50,2020-11-20T21:58:04.000Z


## Data cleaning

In [669]:
df.protocoltime = pd.to_datetime(df.protocoltime)
df.head()

Unnamed: 0,id,instanceid,type,lastseen,onoff,color,dimmer,protocoltime
0,270,65537,2,1605867202,1,efd275,51,2020-11-20 21:58:04+00:00
1,271,65538,2,1605868423,1,efd275,48,2020-11-20 21:58:04+00:00
2,272,65540,2,1605882795,0,efd275,100,2020-11-20 21:58:04+00:00
3,273,65545,2,1605831076,0,efd275,50,2020-11-20 21:58:04+00:00
4,274,65546,2,1605892658,0,f1e0b5,50,2020-11-20 21:58:04+00:00


In [670]:
df.protocoltime = df.protocoltime.round('15min')
df.head()

Unnamed: 0,id,instanceid,type,lastseen,onoff,color,dimmer,protocoltime
0,270,65537,2,1605867202,1,efd275,51,2020-11-20 22:00:00+00:00
1,271,65538,2,1605868423,1,efd275,48,2020-11-20 22:00:00+00:00
2,272,65540,2,1605882795,0,efd275,100,2020-11-20 22:00:00+00:00
3,273,65545,2,1605831076,0,efd275,50,2020-11-20 22:00:00+00:00
4,274,65546,2,1605892658,0,f1e0b5,50,2020-11-20 22:00:00+00:00


### Data enriching

In [671]:
devices = df.instanceid.unique()
devices

array([65537, 65538, 65540, 65545, 65546, 65547, 65548, 65549, 65551,
       65552, 65553, 65554, 65555, 65556, 65557])

In [672]:
absolute_min_date = df.protocoltime.min(axis=0)
absolute_max_date = df.protocoltime.max(axis=0)
print("absolute_min_date=%s, absolute_max_date=%s" % (absolute_min_date, absolute_max_date))

complete_time_data = pd.date_range(absolute_min_date, absolute_max_date, freq="15min")
complete_time_data = complete_time_data.tz_localize(None)

complete_arr = []

for device in devices:
    single_device = df[df["instanceid"] == device]
    single_device["protocoltime"] = pd.to_datetime(single_device["protocoltime"]).apply(lambda x: x.replace(tzinfo=None))
    last_state = single_device.iloc[0]
    last_state["onoff"] = 0
    for time_data in complete_time_data.values:
        if time_data in single_device.protocoltime.values:
            # TODO edge case for multiple events in one time slot
            device_row = single_device[single_device["protocoltime"] == time_data]
            complete_arr.append(device_row.values[0])
            last_state = device_row.iloc[0]
        else:
            current_state = last_state.copy()
            current_state["protocoltime"] = time_data
            complete_arr.append(current_state.values)

df = pd.DataFrame(complete_arr, columns=df.columns)
df = df.sort_values(by=["instanceid", "protocoltime"])
df.head()

absolute_min_date=2020-11-20 22:00:00+00:00, absolute_max_date=2020-11-22 21:30:00+00:00


Unnamed: 0,id,instanceid,type,lastseen,onoff,color,dimmer,protocoltime
0,270,65537,2,1605867202,1,efd275,51,2020-11-20 22:00:00
1,270,65537,2,1605867202,1,efd275,51,2020-11-20 22:15:00
2,285,65537,2,1605867202,1,efd275,51,2020-11-20 22:30:00
3,285,65537,2,1605867202,1,efd275,51,2020-11-20 22:45:00
4,304,65537,2,1605867202,0,f1e0b5,51,2020-11-20 23:00:00


In [673]:
df.color[df.color == 0] = "f1e0b5" 

In [674]:
df["weekday"] = df.protocoltime.dt.day_name()
df["weekofyear"] = df.protocoltime.dt.weekofyear
df["year"] = df.protocoltime.dt.year
df["month"] = df.protocoltime.dt.month
df["day"] = df.protocoltime.dt.day
df["time"] = df.protocoltime.dt.strftime("%H:%M")
df.head()

Unnamed: 0,id,instanceid,type,lastseen,onoff,color,dimmer,protocoltime,weekday,weekofyear,year,month,day,time
0,270,65537,2,1605867202,1,efd275,51,2020-11-20 22:00:00,Friday,47,2020,11,20,22:00
1,270,65537,2,1605867202,1,efd275,51,2020-11-20 22:15:00,Friday,47,2020,11,20,22:15
2,285,65537,2,1605867202,1,efd275,51,2020-11-20 22:30:00,Friday,47,2020,11,20,22:30
3,285,65537,2,1605867202,1,efd275,51,2020-11-20 22:45:00,Friday,47,2020,11,20,22:45
4,304,65537,2,1605867202,0,f1e0b5,51,2020-11-20 23:00:00,Friday,47,2020,11,20,23:00


In [675]:
df.columns

Index(['id', 'instanceid', 'type', 'lastseen', 'onoff', 'color', 'dimmer',
       'protocoltime', 'weekday', 'weekofyear', 'year', 'month', 'day',
       'time'],
      dtype='object')

In [676]:
df.color.unique()

array(['efd275', 'f1e0b5', 'f5faf6', '0'], dtype=object)

In [677]:
df.color.value_counts()

f1e0b5    2354
f5faf6     296
efd275     207
0            8
Name: color, dtype: int64

In [678]:
if "id" in df.columns:
    del df["id"]
if "lastseen" in df.columns:
    del df["lastseen"]
if "protocoltime" in df.columns:
    del df["protocoltime"]
if "color" in df.columns:
    del df["color"]
if "dimmer" in df.columns:
    del df["dimmer"]
df.head()

Unnamed: 0,instanceid,type,onoff,weekday,weekofyear,year,month,day,time
0,65537,2,1,Friday,47,2020,11,20,22:00
1,65537,2,1,Friday,47,2020,11,20,22:15
2,65537,2,1,Friday,47,2020,11,20,22:30
3,65537,2,1,Friday,47,2020,11,20,22:45
4,65537,2,0,Friday,47,2020,11,20,23:00


In [679]:
df.describe()

Unnamed: 0,instanceid,type,onoff,weekofyear,year,month,day
count,2865.0,2865.0,2865.0,2865.0,2865.0,2865.0,2865.0
mean,65548.533333,2.0,0.518674,47.0,2020.0,11.0,21.413613
std,6.19642,0.0,0.499738,0.0,0.0,0.0,0.571332
min,65537.0,2.0,0.0,47.0,2020.0,11.0,20.0
25%,65545.0,2.0,0.0,47.0,2020.0,11.0,21.0
50%,65549.0,2.0,1.0,47.0,2020.0,11.0,21.0
75%,65554.0,2.0,1.0,47.0,2020.0,11.0,22.0
max,65557.0,2.0,1.0,47.0,2020.0,11.0,22.0


In [680]:
df.isnull().sum()

instanceid    0
type          0
onoff         0
weekday       0
weekofyear    0
year          0
month         0
day           0
time          0
dtype: int64

In [681]:
df.time.unique()

array(['22:00', '22:15', '22:30', '22:45', '23:00', '23:15', '23:30',
       '23:45', '00:00', '00:15', '00:30', '00:45', '01:00', '01:15',
       '01:30', '01:45', '02:00', '02:15', '02:30', '02:45', '03:00',
       '03:15', '03:30', '03:45', '04:00', '04:15', '04:30', '04:45',
       '05:00', '05:15', '05:30', '05:45', '06:00', '06:15', '06:30',
       '06:45', '07:00', '07:15', '07:30', '07:45', '08:00', '08:15',
       '08:30', '08:45', '09:00', '09:15', '09:30', '09:45', '10:00',
       '10:15', '10:30', '10:45', '11:00', '11:15', '11:30', '11:45',
       '12:00', '12:15', '12:30', '12:45', '13:00', '13:15', '13:30',
       '13:45', '14:00', '14:15', '14:30', '14:45', '15:00', '15:15',
       '15:30', '15:45', '16:00', '16:15', '16:30', '16:45', '17:00',
       '17:15', '17:30', '17:45', '18:00', '18:15', '18:30', '18:45',
       '19:00', '19:15', '19:30', '19:45', '20:00', '20:15', '20:30',
       '20:45', '21:00', '21:15', '21:30', '21:45'], dtype=object)

## Data preparation

In [682]:
from sklearn.model_selection import train_test_split

In [683]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=11)

In [684]:
y_train_full_onoff = (df_train_full.onoff == 1).values
y_train_onoff = (df_train.onoff == 1).values
y_val_onoff = (df_val.onoff == 1).values
y_test_onoff = (df_test.onoff == 1).values

In [685]:
del df_train_full["onoff"]
del df_train["onoff"]
del df_val["onoff"]
del df_test["onoff"]

### One-Hot encoding

In [686]:
from sklearn.feature_extraction import DictVectorizer

In [687]:
dv = DictVectorizer(sparse=False)
dv_full = DictVectorizer(sparse=False)

In [688]:
dict_train_full_onoff = df_train_full.to_dict(orient="records")
dict_train_onoff = df_train.to_dict(orient="records")
dict_val_onoff = df_val.to_dict(orient="records")
dict_test_onoff = df_test.to_dict(orient="records")

In [689]:
X_train_full_onoff = dv_full.fit_transform(dict_train_full_onoff)
X_train_onoff = dv.fit_transform(dict_train_onoff)
X_val_onoff = dv.transform(dict_val_onoff)
X_test_onoff = dv_full.transform(dict_test_onoff)

## Training

### Logistic Regression

In [690]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

#### Optimization

In [691]:
for C in [0.001, 0.01, 0.1, 0.5, 1, 10]:
    lr = LogisticRegression(solver='liblinear', C=C, random_state=1)
    lr.fit(X_train_onoff, y_train_onoff)
    
    y_pred = lr.predict_proba(X_val_onoff)[:, 1]
    
    auc = roc_auc_score(y_val_onoff, y_pred)
    print('C=%s, auc = %0.3f' % (C, auc))


C=0.001, auc = 0.778
C=0.01, auc = 0.786
C=0.1, auc = 0.786
C=0.5, auc = 0.786
C=1, auc = 0.786
C=10, auc = 0.786


In [692]:
#### Final model

In [693]:
lr = LogisticRegression(solver='liblinear', C=0.1, random_state=1)
lr.fit(X_train_full_onoff, y_train_full_onoff)

LogisticRegression(C=0.1, random_state=1, solver='liblinear')

In [694]:
# dict(zip(dv.get_feature_names(), lr.coef_[0].round(5)))

### Decision trees

In [695]:
from sklearn.tree import DecisionTreeClassifier

#### Final model

In [696]:
dt = DecisionTreeClassifier()
dt.fit(X_train_full_onoff, y_train_full_onoff)

DecisionTreeClassifier()

### Random forest

In [697]:
from sklearn.ensemble import RandomForestClassifier

#### Final model

In [698]:
rf = RandomForestClassifier(n_estimators=10, random_state=3)
rf.fit(X_train_full_onoff, y_train_full_onoff)

RandomForestClassifier(n_estimators=10, random_state=3)

## Evaluation

### Logistic regression

In [699]:
y_pred = lr.predict_proba(X_test_onoff)[:, 1]
accuracy = (y_test_onoff == (y_pred >= 0.5)).mean()

auc = roc_auc_score(y_test_onoff, y_pred)

print("LR accuracy:", accuracy)
print("LR auc:", auc)

LR accuracy: 0.6684118673647469
LR auc: 0.7692627257398402


### Decision trees

In [700]:
y_pred = dt.predict_proba(X_test_onoff)[:, 1]
accuracy = (y_test_onoff == (y_pred >= 0.5)).mean()

auc = roc_auc_score(y_test_onoff, y_pred)

print("LR accuracy:", accuracy)
print("LR auc:", auc)

LR accuracy: 0.9354275741710296
LR auc: 0.937984164609858


### Random forest

In [701]:
y_pred = rf.predict_proba(X_test_onoff)[:, 1]
accuracy = (y_test_onoff == (y_pred >= 0.5)).mean()

auc = roc_auc_score(y_test_onoff, y_pred)

print("LR accuracy:", accuracy)
print("LR auc:", auc)

LR accuracy: 0.8795811518324608
LR auc: 0.9275982991617996


## Testing the model(s)

In [702]:
example = {
    "instanceid": 65540,
    "type": 2,
    "weekday": "Sunday",
    "weekofyear": 47,
    "year": 2020,
    "month": 11,
    "day": 22,
    "time": "13:30"
}

In [703]:
X_example = dv_full.transform([example])
pd.DataFrame(X_example, columns=dv.get_feature_names())

Unnamed: 0,day,instanceid,month,time=00:00,time=00:15,time=00:30,time=00:45,time=01:00,time=01:15,time=01:30,...,time=23:00,time=23:15,time=23:30,time=23:45,type,weekday=Friday,weekday=Saturday,weekday=Sunday,weekofyear,year
0,22.0,65540.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,47.0,2020.0


In [704]:
lr.predict_proba(X_example)[0, 1]

0.7458990197197287