In [109]:
# Make division default to floating-point, saving confusion
from __future__ import division
from __future__ import print_function

# Allowed libraries
import numpy as np
import pandas as pd
import scipy as sp
import scipy.special
import heapq as pq
import matplotlib as mp
import matplotlib.pyplot as plt
import math
from itertools import product, combinations
from collections import OrderedDict as odict
import collections
from graphviz import Digraph, Graph
from tabulate import tabulate
import copy
import sys
import os
import datetime
import sklearn
import ast
from pathlib import Path
from copy import deepcopy

In [2]:

actions_dict = {'lights1': 'off', 'lights2': 'on', 'lights3': 'off', 'lights4': 'off', 'lights5': 'off',
                'lights6': 'off', 'lights7': 'off', 'lights8': 'off', 'lights9': 'off', 'lights10': 'off',
                'lights11': 'off', 'lights12': 'off', 'lights13': 'off', 'lights14': 'off', 'lights15': 'off',
                'lights16': 'off', 'lights17': 'off', 'lights18': 'off', 'lights19': 'off', 'lights20': 'off',
                'lights21': 'off', 'lights22': 'off', 'lights23': 'off', 'lights24': 'off', 'lights25': 'off',
                'lights26': 'off', 'lights27': 'off', 'lights28': 'off', 'lights29': 'off', 'lights30': 'off',
                'lights31': 'off', 'lights32': 'off', 'lights33': 'off', 'lights34': 'off', 'lights35': 'on'}


# this global state variable demonstrates how to keep track of information over multiple
# calls to get_action
state = {}


In [7]:
training_data_path = Path.cwd().parent / "spec" /"data.csv"
train_df = pd.read_csv(training_data_path, index_col=0)
train_df

Unnamed: 0,reliable_sensor1,reliable_sensor2,reliable_sensor3,reliable_sensor4,unreliable_sensor1,unreliable_sensor2,unreliable_sensor3,unreliable_sensor4,robot1,robot2,...,r32,r33,r34,r35,c1,c2,c3,c4,o1,outside
0,no motion,no motion,no motion,no motion,no motion,motion,no motion,no motion,"('r1', 0)","('r19', 0)",...,0,0,0,0,0,0,0,0,0,17
1,no motion,no motion,motion,no motion,no motion,no motion,no motion,no motion,"('r1', 0)","('r19', 0)",...,0,0,0,0,1,0,0,0,0,15
2,no motion,no motion,motion,no motion,no motion,no motion,no motion,no motion,"('r1', 0)","('r19', 0)",...,0,0,0,0,1,0,0,0,0,0
3,no motion,no motion,motion,no motion,no motion,motion,no motion,no motion,"('r1', 0)","('c3', 0)",...,0,0,0,0,5,0,0,0,0,0
4,no motion,no motion,motion,no motion,no motion,no motion,no motion,no motion,"('r1', 0)","('r20', 0)",...,0,0,0,0,11,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2396,no motion,no motion,no motion,no motion,motion,motion,no motion,motion,"('c4', 0)","('o1', 0)",...,0,0,0,0,0,0,0,0,0,23
2397,no motion,no motion,no motion,no motion,no motion,no motion,no motion,no motion,"('c4', 0)","('c4', 0)",...,0,0,0,0,0,0,0,0,0,23
2398,no motion,no motion,no motion,motion,motion,no motion,no motion,no motion,"('r35', 0)","('c4', 0)",...,0,0,0,0,0,0,0,0,0,23
2399,no motion,no motion,no motion,no motion,no motion,no motion,no motion,no motion,"('c4', 0)","('c4', 0)",...,0,0,0,0,0,0,0,0,0,23


In [3]:
# From Tutorial 2
def transposeGraph(G):
    GT = dict((v, []) for v in G)
    for v in G:
        for w in G[v]:
            GT[w].append(v)

    return GT


# From tutorial 3

def allEqualThisIndex(dict_of_arrays, **fixed_vars):
    """
    Helper function to create a boolean index vector into a tabular data structure,
    such that we return True only for rows of the table where, e.g.
    column_a=fixed_vars['column_a'] and column_b=fixed_vars['column_b'].

    This is a simple task, but it's not *quite* obvious
    for various obscure technical reasons.

    It is perhaps best explained by an example.

    >>> all_equal_this_index(
    ...    {'X': [1, 1, 0], Y: [1, 0, 1]},
    ...    X=1,
    ...    Y=1
    ... )
    [True, False, False]
    """
    # base index is a boolean vector, everywhere true
    first_array = dict_of_arrays[list(dict_of_arrays.keys())[0]]
    index = np.ones_like(first_array, dtype=np.bool_)
    for var_name, var_val in fixed_vars.items():
        index = index & (np.asarray(dict_of_arrays[var_name]) == var_val)
    return index


def estProbTable(data, var_name, parent_names, outcomeSpace, add_smooth=False, alpha=1):
    """
    Calculate a dictionary probability table by ML given
    `data`, a dictionary or dataframe of observations
    `var_name`, the column of the data to be used for the conditioned variable and
    `parent_names`, a tuple of columns to be used for the parents and
    `outcomeSpace`, a dict that maps variable names to a tuple of possible outcomes
    Return a dictionary containing an estimated conditional probability table.
    """
    var_outcomes = outcomeSpace[var_name]
    parent_outcomes = [outcomeSpace[var] for var in (parent_names)]
    # cartesian product to generate a table of all possible outcomes
    all_parent_combinations = product(*parent_outcomes)

    prob_table = odict()

    num_combs = 0
    if not add_smooth:
        alpha = 0
    else:
        num_combs = np.prod([len(x) for x in parent_outcomes]) * len(var_outcomes)
        all_parent_combinations = product(*parent_outcomes)

    for i, parent_combination in enumerate(all_parent_combinations):
        parent_vars = dict(zip(parent_names, parent_combination))
        parent_index = allEqualThisIndex(data, **parent_vars)
        counts = (parent_index.sum() + alpha * num_combs)

        for var_outcome in var_outcomes:
            if not counts:
                prob_table[tuple(list(parent_combination) + [var_outcome])] = 0
                continue
            var_index = (np.asarray(data[var_name]) == var_outcome)
            prob_table[tuple(list(parent_combination) + [var_outcome])] = \
                ((var_index & parent_index).sum() + alpha) / counts

    return {'dom': tuple(list(parent_names) + [var_name]), 'table': prob_table}


## Develop your code for learn_bayes_net(G, data, outcomeSpace) in one or more cells here

def learn_bayes_net(G, data, outcomeSpace, add_smooth=False, alpha=1) -> dict:
    cond_tables_ml = odict()
    G_T = transposeGraph(G)
    for node, parents in G_T.items():
        cond_tables_ml[node] = estProbTable(data, node, parents, outcomeSpace, add_smooth=add_smooth, alpha=alpha)
    return cond_tables_ml

In [8]:
def learn_outcome_space(data:pd.DataFrame) -> dict:
    return {var: tuple(data[var].unique()) for var in data.columns.values}

In [14]:
os_sensors = learn_outcome_space(train_df)


In [31]:
train_df["robot1"]

0        ('r1', 0)
1        ('r1', 0)
2        ('r1', 0)
3        ('r1', 0)
4        ('r1', 0)
           ...    
2396     ('c4', 0)
2397     ('c4', 0)
2398    ('r35', 0)
2399     ('c4', 0)
2400    ('r35', 0)
Name: robot1, Length: 2401, dtype: object

In [43]:
pd.DataFrame([tuple(str(k.strip("\(\) ''")) for k in x.split(',')) for x in binary_train["robot1"].tolist()], index=binary_train.index)

Unnamed: 0,0,1
0,r1,0
1,r1,0
2,r1,0
3,r1,0
4,r1,0
...,...,...
2396,c4,0
2397,c4,0
2398,r35,0
2399,c4,0


In [57]:
binary_train = train_df.copy()
binary_train.drop(columns=["robot1", "robot2", "time", "electricity_price"], inplace=True)
for col in binary_train.columns:
    if binary_train[col].dtype == "int64":
        binary_train[col] = binary_train[col] > 0
    elif "door" in col:
        binary_train[col] = binary_train[col] > 0
binary_train

Unnamed: 0,reliable_sensor1,reliable_sensor2,reliable_sensor3,reliable_sensor4,unreliable_sensor1,unreliable_sensor2,unreliable_sensor3,unreliable_sensor4,door_sensor1,door_sensor2,...,r32,r33,r34,r35,c1,c2,c3,c4,o1,outside
0,no motion,no motion,no motion,no motion,no motion,motion,no motion,no motion,False,False,...,False,False,False,False,False,False,False,False,False,True
1,no motion,no motion,motion,no motion,no motion,no motion,no motion,no motion,False,False,...,False,False,False,False,True,False,False,False,False,True
2,no motion,no motion,motion,no motion,no motion,no motion,no motion,no motion,False,False,...,False,False,False,False,True,False,False,False,False,False
3,no motion,no motion,motion,no motion,no motion,motion,no motion,no motion,False,False,...,False,False,False,False,True,False,False,False,False,False
4,no motion,no motion,motion,no motion,no motion,no motion,no motion,no motion,False,True,...,False,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2396,no motion,no motion,no motion,no motion,motion,motion,no motion,motion,False,False,...,False,False,False,False,False,False,False,False,False,True
2397,no motion,no motion,no motion,no motion,no motion,no motion,no motion,no motion,False,False,...,False,False,False,False,False,False,False,False,False,True
2398,no motion,no motion,no motion,motion,motion,no motion,no motion,no motion,False,False,...,False,False,False,False,False,False,False,False,False,True
2399,no motion,no motion,no motion,no motion,no motion,no motion,no motion,no motion,False,False,...,False,False,False,False,False,False,False,False,False,True


In [92]:
rooms = [l.replace("lights", "r") for l in lights]
rooms.extend(["outside", "o1", "c1", "c2", "c3", "c4"])
rooms

['r1',
 'r2',
 'r3',
 'r4',
 'r5',
 'r6',
 'r7',
 'r8',
 'r9',
 'r10',
 'r11',
 'r12',
 'r13',
 'r14',
 'r15',
 'r16',
 'r17',
 'r18',
 'r19',
 'r20',
 'r21',
 'r22',
 'r23',
 'r24',
 'r25',
 'r26',
 'r27',
 'r28',
 'r29',
 'r30',
 'r31',
 'r32',
 'r33',
 'r34',
 'r35',
 'outside',
 'o1',
 'c1',
 'c2',
 'c3',
 'c4']

In [87]:
sensors = list(os_sensors.keys())
lights = list(actions_dict.keys())
outcome_space_lights = {k: ['on', 'off'] for k in lights}
outcome_space_rooms = {k: [True, False] for k in rooms}
outcome_space_sensors = {k.replace("lights", "r"): ['motion', 'no motion'] for k in sensors if "reliable" in k}

outcome_space = {**outcome_space_sensors, **outcome_space_rooms}

In [59]:
outcome_space = learn_outcome_space(binary_train)
outcome_space

{'reliable_sensor1': ('no motion', 'motion'),
 'reliable_sensor2': ('no motion', 'motion'),
 'reliable_sensor3': ('no motion', 'motion'),
 'reliable_sensor4': ('no motion', 'motion'),
 'unreliable_sensor1': ('no motion', 'motion'),
 'unreliable_sensor2': ('motion', 'no motion'),
 'unreliable_sensor3': ('no motion', 'motion'),
 'unreliable_sensor4': ('no motion', 'motion'),
 'door_sensor1': (False, True),
 'door_sensor2': (False, True),
 'door_sensor3': (False, True),
 'door_sensor4': (False, True),
 'r1': (False, True),
 'r2': (False, True),
 'r3': (False, True),
 'r4': (False, True),
 'r5': (False, True),
 'r6': (False, True),
 'r7': (False, True),
 'r8': (False, True),
 'r9': (False, True),
 'r10': (False, True),
 'r11': (False, True),
 'r12': (True, False),
 'r13': (False, True),
 'r14': (False, True),
 'r15': (False, True),
 'r16': (False, True),
 'r17': (False, True),
 'r18': (False, True),
 'r19': (False, True),
 'r20': (False, True),
 'r21': (False, True),
 'r22': (True, False),

In [4]:

us3 = {
    "unreliable_sensor3": ["lights1"],
}


In [60]:
estProbTable(binary_train[["unreliable_sensor3","r1"]], "r1", list(us3.keys()), outcome_space, )

{'dom': ('unreliable_sensor3', 'r1'),
 'table': OrderedDict([(('no motion', False), 0.93125),
              (('no motion', True), 0.06875),
              (('motion', False), 0.10131852879944483),
              (('motion', True), 0.8986814712005552)])}

In [81]:
networks = [
    # {"unreliable_sensor1": ["r"]},
    # {"unreliable_sensor2": ["r"]},
    {"unreliable_sensor3": ["r1"]},
    {"unreliable_sensor4": ["r24"]},
    {"reliable_sensor1": ["r16"]},
    {"reliable_sensor2": ["r5"]},
    {"reliable_sensor3": ["r35"]},
    {"reliable_sensor4": ["r31"]},
    {"door_sensor1": ["r8", "r9"]},
    # {"door_sensor2": ["r"]},
    {"door_sensor3": ["r26", "r27"]},
    {"door_sensor4": ["r35"]},
]
networks

[{'unreliable_sensor3': ['r1']},
 {'unreliable_sensor4': ['r24']},
 {'reliable_sensor1': ['r16']},
 {'reliable_sensor2': ['r5']},
 {'reliable_sensor3': ['r35']},
 {'reliable_sensor4': ['r31']},
 {'door_sensor1': ['r8', 'r9']},
 {'door_sensor3': ['r26', 'r27']},
 {'door_sensor4': ['r35']}]

In [82]:
def transposeGraph(G):
#     GT = dict((v, []) for v in G)
    GT = collections.defaultdict(list)
    for v in G:
        for w in G[v]:
            GT[w].append(v)

    return GT

In [83]:
transposeGraph(networks[0])

defaultdict(list, {'r1': ['unreliable_sensor3']})

In [85]:
prob_tables = {}
for net in networks:
    prob_tables[list(net.keys())[0]] = learn_bayes_net(net, binary_train, outcome_space, add_smooth=True, alpha=1)
prob_tables

{'unreliable_sensor3': OrderedDict([('r1',
               {'dom': ('unreliable_sensor3', 'r1'),
                'table': OrderedDict([(('no motion', False),
                              0.9284232365145229),
                             (('no motion', True), 0.06950207468879668),
                             (('motion', False), 0.10173010380622838),
                             (('motion', True), 0.8968858131487889)])})]),
 'unreliable_sensor4': OrderedDict([('r24',
               {'dom': ('unreliable_sensor4', 'r24'),
                'table': OrderedDict([(('no motion', False),
                              0.9944048830111902),
                             (('no motion', True), 0.004577822990844354),
                             (('motion', False), 0.7426636568848759),
                             (('motion', True), 0.2528216704288939)])})]),
 'reliable_sensor1': OrderedDict([('r16',
               {'dom': ('reliable_sensor1', 'r16'),
                'table': OrderedDict([(('no motion

In [105]:
def prob_to_df(prob_table, single=False) -> dict:
    p_t = deepcopy(prob_table)
    col_name = "P"
    if single:
        return pd.DataFrame.from_dict(p_t['table'], orient='index', columns=[col_name])
    else:
        for node in p_t.keys():
            p_t[node]['table'] = pd.DataFrame.from_dict(p_t[node]['table'], orient='index', columns=[col_name])
    return p_t

def markov_blanket(G, node):
    children = G[node]

    GT = transposeGraph(G)
    parents = GT[node]

    spouse = []
    for child in children:
        spouse.extend(GT[child])

    blanket_nodes = [*children, *parents, *spouse]
    return list(set(blanket_nodes))

In [115]:

def assess_bayes_net(G, prob_tables, data, outcomeSpace, class_var, smoothing=True) -> float:
    G_T = transposeGraph(G)
    children = []
    if class_var in G:
        children = G[class_var]
    elif class_var in G_T:
        children = G_T[class_var]
    pred = np.ones([data.shape[0], len(outcomeSpace[class_var])])
    
#     children = G_T[class_var]

    index_map = {i: outcome for i, outcome in enumerate(outcomeSpace[class_var])}

    prob_t = prob_to_df(prob_tables)
    for i, outcome in index_map.items():
        query_params = [*G_T[class_var], class_var]
        test_data = data[query_params].copy(deep=True)
        test_data[class_var] = outcome
        probs = prob_t[class_var]['table'].loc[test_data.to_records(index=False)].values.flatten()
        if smoothing:
            pred[:, i] = np.log(probs)
        else:
            pred[:, i] = probs
        for child in children:
            query_params = [*G_T[child], child]
            test_data = data[query_params].copy(deep=True)
            test_data[class_var] = outcome
            probs = prob_t[child]['table'].loc[test_data.to_records(index=False)].values.flatten()
            if smoothing:
                pred[:, i] += np.log(probs)
            else:
                pred[:, i] = np.multiply(pred[:, i], probs)

    predits = np.argmax(pred, axis=1)

    df_map = pd.DataFrame.from_dict(index_map, orient='index', columns=[class_var])
    df_map.reset_index(inplace=True)
    df_map.set_index([class_var], inplace=True)
    print(df_map)
    acc = np.sum(df_map.loc[data[class_var].values].values.flatten() == predits)/data.shape[0]


    return acc

In [116]:
def k_fold_split(num_samples, k=10):
    # Yeilds the test index to be excluded
    indices = np.arange(num_samples, dtype=int)
    np.random.shuffle(indices)
    indices = np.array_split(indices, k)
    while indices:
        yield indices.pop()

def cv_bayes_net(G, data, class_var, k=10, add_smooth=True, alpha=1):

    k_fold_idx = k_fold_split(data.shape[0], k)
    accuracy = np.zeros(k)
    for i in range(k):
        test_idx = next(k_fold_idx)
        train_idx = data.index.difference(test_idx)

        test_set = data.loc[test_idx]
        train_set = data.loc[train_idx]

        outcomeSpace = learn_outcome_space(train_set)
        prob_tables = learn_bayes_net(G, train_set, outcomeSpace, add_smooth=add_smooth, alpha=alpha)

        accuracy[i] = assess_bayes_net(G, prob_tables, test_set, outcomeSpace, class_var, smoothing=add_smooth)

    return accuracy.mean(), accuracy.std()
############
## TEST CODE

In [117]:
test_data = binary_train[binary_train.columns.difference(list(set(rooms).difference({"r1"})))].copy()

acc, stddev = cv_bayes_net(networks[0], test_data, "r1", add_smooth=True)
acc, stddev

       index
r1          
False      0
True       1


IndexError: Boolean index has wrong length: 240 instead of 2

In [118]:
networks[0]

{'unreliable_sensor3': ['r1']}