In [10]:
import pandas as pd
from numpy import mean, std
from scipy.stats import norm

def fit_distribution(X, min_std=1e-9):
    """Fit a normal distribution to the samples of a single variable.

    Parameters
    ----------
    X : array-like
        Observed values of one variable (e.g. one feature column).
    min_std : float, optional
        Lower bound applied to the fitted standard deviation. A constant
        column has a standard deviation of 0, and a normal distribution with
        scale 0 evaluates its pdf to nan, which makes every later score
        comparison False. Flooring the scale keeps the pdf finite.

    Returns
    -------
    scipy.stats frozen normal distribution
        Located at ``mean(X)`` with scale ``max(std(X), min_std)``; ``std`` is
        called with numpy's default arguments.
    """
    from numpy import maximum  # local import: the file only imports mean/std from numpy

    m = mean(X)
    # maximum() propagates nan, so an all-nan/empty input still yields nan as before.
    std_dev = maximum(std(X), min_std)
    return norm(m, std_dev)

def probability(columns, X, prob, dist):
    """Return the prior times the product of per-column densities for X.

    Parameters
    ----------
    columns : iterable
        Keys used to index both ``X`` and ``dist``.
    X : mapping-like
        Object supporting ``X[column]`` (a dict, a DataFrame row/frame, ...).
    prob : float
        Prior weight multiplied into the final result.
    dist : mapping
        ``dist[column]`` must expose a ``pdf`` method.

    Returns
    -------
    ``prob * prod(dist[c].pdf(X[c]) for c in columns)``; this is just ``prob``
    when ``columns`` is empty. The value is an unnormalised score, so it is
    only meaningful when compared against the score of another class.
    """
    likelihood = 1
    densities = (dist[name].pdf(X[name]) for name in columns)
    for density in densities:
        likelihood *= density
    return prob * likelihood

In [11]:
def _load_inputs_and_targets(filename):
    """Read a CSV and return (feature columns, 'Occupancy' target column)."""
    frame = pd.read_csv(filename)
    # 'id' and 'date' are removed when present; errors='ignore' skips absent ones.
    frame = frame.drop(columns=['id', 'date'], errors='ignore')
    targets = frame['Occupancy']
    inputs = frame.drop(columns='Occupancy')
    return inputs, targets


def _split_by_label(inputs, targets):
    """Return (rows whose target is 0, rows whose target is 1)."""
    return inputs[targets == 0], inputs[targets == 1]


def _count_preferred(rows, columns, own_prior, own_dists, other_prior, other_dists):
    """Count rows scoring strictly higher under their own class than the other."""
    if len(rows) == 0:
        return 0
    # One pdf call per column over the whole subset instead of one per row.
    own_score = probability(columns, rows, own_prior, own_dists)
    other_score = probability(columns, rows, other_prior, other_dists)
    # With no feature columns the comparison is a single bool; building a
    # Series over the row index broadcasts it to one entry per row.
    return int(pd.Series(own_score > other_score, index=rows.index).sum())


def _accuracy(inputs, targets, label0_rows, label1_rows, label0_dists, label1_dists):
    """Fraction of the label-0/label-1 rows assigned to their own class.

    Class priors are the label frequencies of the data being scored. Rows
    whose target is neither 0 nor 1 are excluded from the denominator.
    """
    label0_prior = len(label0_rows) / len(inputs)
    label1_prior = len(label1_rows) / len(targets)
    columns = inputs.columns
    correct = (
        _count_preferred(label0_rows, columns, label0_prior, label0_dists,
                         label1_prior, label1_dists)
        + _count_preferred(label1_rows, columns, label1_prior, label1_dists,
                           label0_prior, label0_dists)
    )
    return correct / (len(label0_rows) + len(label1_rows))


def train(filename):
    """Fit per-class, per-column normal distributions and score the same data.

    The CSV must contain an 'Occupancy' column; 'id' and 'date' columns are
    dropped if present and every remaining column is used as a feature.

    Returns
    -------
    (accuracy, label0_distributions, label1_distributions)
        ``accuracy`` is measured on the training file itself. The two dicts
        map column name -> fitted distribution, the first fitted on rows with
        ``Occupancy == 0`` and the second on rows with ``Occupancy == 1``.
        NOTE(review): earlier revisions called the first dict "occupied";
        if 1 means occupied in this dataset that name was inverted - confirm.
        The positional order is kept so it still matches ``evaluate``.

    Raises
    ------
    KeyError
        If the file has no 'Occupancy' column.
    ZeroDivisionError
        If the file has no rows, or no rows labelled 0 or 1.
    """
    inputs, targets = _load_inputs_and_targets(filename)
    label0_rows, label1_rows = _split_by_label(inputs, targets)

    label0_dists = {
        column: fit_distribution(label0_rows[column]) for column in inputs.columns
    }
    label1_dists = {
        column: fit_distribution(label1_rows[column]) for column in inputs.columns
    }

    accuracy = _accuracy(inputs, targets, label0_rows, label1_rows,
                         label0_dists, label1_dists)
    return accuracy, label0_dists, label1_dists


def evaluate(test_filename, occupied_distributions, not_occupied_distributions):
    """Score a CSV with previously fitted distributions and return accuracy.

    Parameters
    ----------
    test_filename : str
        CSV with an 'Occupancy' column; 'id'/'date' are dropped if present.
    occupied_distributions : dict
        Column name -> distribution applied to rows with ``Occupancy == 0``
        (i.e. the first dict returned by ``train``, despite the name).
    not_occupied_distributions : dict
        Column name -> distribution applied to rows with ``Occupancy == 1``
        (the second dict returned by ``train``).

    Returns
    -------
    float
        Correctly classified rows divided by the number of rows labelled 0 or 1.

    Notes
    -----
    NOTE(review): the class priors are computed from the labels of the file
    being evaluated, not from the training data, so label information from
    the evaluated file enters the prediction. Kept as-is to preserve the
    existing interface and results; passing training priors would need an
    interface change.

    Raises
    ------
    KeyError
        If the file has no 'Occupancy' column.
    ZeroDivisionError
        If the file has no rows, or no rows labelled 0 or 1.
    """
    inputs, targets = _load_inputs_and_targets(test_filename)
    label0_rows, label1_rows = _split_by_label(inputs, targets)
    return _accuracy(inputs, targets, label0_rows, label1_rows,
                     occupied_distributions, not_occupied_distributions)

In [12]:
# Fit the per-class distributions on the training file; train() also returns
# its accuracy measured on that same file.
# Naming caveat: train() builds its first returned dict from rows with
# Occupancy == 0 and its second from rows with Occupancy == 1, so
# `occupied_dist` holds the Occupancy == 0 model.
# NOTE(review): if Occupancy == 1 means "occupied", these two names are
# swapped - confirm against the dataset documentation.
acc, occupied_dist, not_occupied_dist = train("./occupancy_data/datatraining.csv")
print("Training data result, Accuracy: ", acc)
# Score the two held-out files with the distributions fitted above.
print("Test 1 data result,   Accuracy: ", evaluate("./occupancy_data/datatest.csv", occupied_dist, not_occupied_dist))
print("Test 2 data result,   Accuracy: ", evaluate("./occupancy_data/datatest2.csv", occupied_dist, not_occupied_dist))

Training data result, Accuracy:  0.9788775635515167
Test 1 data result,   Accuracy:  0.9774859287054409
Test 2 data result,   Accuracy:  0.9869770303527482
