In [3]:
import pandas as pd
from numpy import mean, std
from scipy.stats import norm

def calculate_mean(X):
    """Return the arithmetic mean of ``X`` as computed by ``numpy.mean``."""
    average = mean(X)
    return average

def get_votes(columns, X, occupied_means, not_occupied_means):
    """Classify one sample by a per-column nearest-mean majority vote.

    For every name in ``columns`` the sample value ``X[name]`` is compared
    with the two reference values; the column votes for whichever reference
    is strictly closer in absolute distance.

    Args:
        columns: Iterable of keys present in ``X`` and in both mean mappings.
        X: Mapping-like sample (dict, pandas Series, ...) indexed by column.
        occupied_means: Mapping from column to the first reference value.
        not_occupied_means: Mapping from column to the second reference value.

    Returns:
        1 when strictly fewer columns are closer to ``occupied_means`` than
        to ``not_occupied_means``; otherwise 0.  A column that is equally
        distant from both references counts for ``not_occupied_means``, and
        an overall tie (including an empty ``columns``) yields 0.
    """
    # One flag per column: True when the sample sits strictly nearer to the
    # "occupied" reference value than to the "not occupied" one.
    nearer_to_occupied = [
        abs(occupied_means[name] - X[name]) < abs(not_occupied_means[name] - X[name])
        for name in columns
    ]

    occupied_votes = sum(1 for flag in nearer_to_occupied if flag)
    not_occupied_votes = len(nearer_to_occupied) - occupied_votes

    if occupied_votes < not_occupied_votes:
        return 1
    return 0

In [4]:
def train(filename):
    """Compute per-label feature means from a CSV and score them on that file.

    The CSV must contain an ``Occupancy`` column.  ``id`` and ``date``
    columns are discarded when present; every other column is treated as a
    feature.  Accuracy is measured on the same rows the means were computed
    from, by calling ``get_votes`` on each row.

    Args:
        filename: Path (or any source ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A tuple ``(accuracy, occupied_means, not_occupied_means)``:

        * ``accuracy`` - fraction of rows labelled 0 or 1 for which
          ``get_votes`` returns the row's own label.
        * ``occupied_means`` - dict mapping each feature column to its mean
          over the rows where ``Occupancy == 0``.
        * ``not_occupied_means`` - dict mapping each feature column to its
          mean over the rows where ``Occupancy == 1``.

    Raises:
        KeyError: If the CSV has no ``Occupancy`` column.
        ZeroDivisionError: If no row has an ``Occupancy`` of 0 or 1.
    """
    df = pd.read_csv(filename)
    # Bookkeeping columns are not features; tolerate files that lack them.
    df = df.drop(columns=['id', 'date'], errors='ignore')
    targets = df['Occupancy']
    inputs = df.drop('Occupancy', axis='columns')

    # NOTE(review): "occupied" selects label 0 and "not occupied" selects
    # label 1.  The names look inverted relative to the labels, but the
    # pairing is kept because get_votes and evaluate rely on exactly this
    # order - confirm against the dataset's label definition before renaming.
    occupied_inputs = inputs[targets == 0]
    not_occupied_inputs = inputs[targets == 1]

    occupied_means = {
        column: calculate_mean(occupied_inputs[column])
        for column in inputs.columns
    }
    not_occupied_means = {
        column: calculate_mean(not_occupied_inputs[column])
        for column in inputs.columns
    }

    # A row counts as correct when the vote reproduces the label of the
    # subset the row was taken from.
    correct_targets = 0
    for expected_label, rows in ((0, occupied_inputs), (1, not_occupied_inputs)):
        for _, row in rows.iterrows():
            prediction = get_votes(inputs.columns, row, occupied_means, not_occupied_means)
            if prediction == expected_label:
                correct_targets += 1

    total_rows = len(occupied_inputs) + len(not_occupied_inputs)
    return correct_targets / total_rows, occupied_means, not_occupied_means

def evaluate(test_filename, occupied_means, not_occupied_means):
    """Score previously computed reference means against a labelled CSV.

    The CSV must contain an ``Occupancy`` column.  ``id`` and ``date``
    columns are discarded when present; every other column is passed to
    ``get_votes`` as a feature, so both mean mappings need an entry for each
    of them.

    Args:
        test_filename: Path (or any source ``pandas.read_csv`` accepts) of
            the CSV to score.
        occupied_means: Mapping from feature column to reference value,
            forwarded unchanged to ``get_votes``.  Rows with
            ``Occupancy == 0`` are counted correct when ``get_votes``
            returns 0.
        not_occupied_means: Mapping from feature column to reference value,
            forwarded unchanged to ``get_votes``.  Rows with
            ``Occupancy == 1`` are counted correct when ``get_votes``
            returns 1.

    Returns:
        The fraction of rows labelled 0 or 1 for which ``get_votes`` returns
        the row's own label.

    Raises:
        KeyError: If the CSV has no ``Occupancy`` column.
        ZeroDivisionError: If no row has an ``Occupancy`` of 0 or 1.
    """
    df = pd.read_csv(test_filename)
    # Bookkeeping columns are not features; tolerate files that lack them.
    df = df.drop(columns=['id', 'date'], errors='ignore')
    targets = df['Occupancy']
    inputs = df.drop('Occupancy', axis='columns')

    # NOTE(review): "occupied" selects label 0 and "not occupied" selects
    # label 1; the naming looks inverted but the pairing must match the
    # order in which the means were produced - confirm before renaming.
    occupied_inputs = inputs[targets == 0]
    not_occupied_inputs = inputs[targets == 1]

    # A row counts as correct when the vote reproduces the label of the
    # subset the row was taken from.
    correct_targets = 0
    for expected_label, rows in ((0, occupied_inputs), (1, not_occupied_inputs)):
        for _, row in rows.iterrows():
            prediction = get_votes(inputs.columns, row, occupied_means, not_occupied_means)
            if prediction == expected_label:
                correct_targets += 1

    total_rows = len(occupied_inputs) + len(not_occupied_inputs)
    return correct_targets / total_rows

In [5]:
# Compute the per-label means from the training file and report how well
# they reproduce that file's own labels.
acc, occupied_means, not_occupied_means = train("./occupancy_data/datatraining.csv")
print("Training data result, Accuracy: ", acc)

# Re-use the same means to score each of the two test files.
test1_accuracy = evaluate("./occupancy_data/datatest.csv", occupied_means, not_occupied_means)
print("Test 1 data result,   Accuracy: ", test1_accuracy)
test2_accuracy = evaluate("./occupancy_data/datatest2.csv", occupied_means, not_occupied_means)
print("Test 2 data result,   Accuracy: ", test2_accuracy)

Training data result, Accuracy:  0.9070367186540587
Test 1 data result,   Accuracy:  0.8431519699812383
Test 2 data result,   Accuracy:  0.7898892534864643
