# Analysis of toilet visit time distribution

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import ast
import numpy as np
import scipy.stats as stats
from math import ceil

### Import data

In [None]:
# Read the processed visit log; the start/end columns are parsed as timestamps.
filename = Path('./data/processed/20210818_evening_toilet_visits.csv')
df = pd.read_csv(filename, parse_dates=['start_time', 'end_time'])
# Keep only the rows whose 'visit_time' entry is not null.
visit_times = df.loc[df['visit_time'].notnull(), 'visit_time']

### Plot histogram for several bin counts

In [None]:
# Overlay density histograms of the visit times for several bin counts.
plt.rcParams['figure.figsize'] = [22.5,12]

bin_counts = [10, 20, 40]
# Opacity falls linearly from 1.0 (first histogram) to 0.4 (last histogram).
# max(..., 1) keeps a single-entry bin_counts from dividing by zero.
alpha_step = 0.6/max(len(bin_counts) - 1, 1)
for ii, bin_count in enumerate(bin_counts):
    hist = plt.hist(visit_times, bins=bin_count, density=True, label=f'Bin count = {bin_count}', alpha=1-ii*alpha_step)

# Annotate the sample size in the centre of the axes (axes-relative coordinates).
plt.text(0.5,0.5, f'Sample size = {len(visit_times)}', horizontalalignment='center', verticalalignment='center', transform = plt.gca().transAxes, fontsize=14)
leg = plt.legend(fontsize=14)

### Determine the goodness-of-fit of the different fitted distributions
- ks = Kolmogorov–Smirnov test (p-value)
- cvm = Cramér–von Mises test (p-value) (generally more powerful than ks, because it accumulates the deviation over the whole distribution, whereas ks only looks at the single largest deviation)
- p = Parameter count of the distribution
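- pear = Pearson correlation between the histogram densities and the fitted pdf, reported as coefficient `pear_r` and p-value `pear_p` (used by Yufei)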


In [None]:
%%capture --no-stdout

def process_line(line):
    """Parse one record of the fit-results file.

    A record has the form ``name, number, number, params``, where ``params``
    is a Python literal (for example a tuple) that may itself contain commas.

    Parameters
    ----------
    line : str
        Raw text line; surrounding whitespace is ignored.

    Returns
    -------
    list
        ``[name, float, float, params]`` with ``name`` as a stripped string
        and ``params`` evaluated by ``ast.literal_eval``.

    Raises
    ------
    ValueError
        If the line contains fewer than three commas, or if one of the two
        numeric fields cannot be converted to ``float``. Errors raised by
        ``ast.literal_eval`` for the trailing literal propagate unchanged.
    """
    # Split on the first three commas only, so commas inside the trailing
    # literal stay intact.
    fields = line.strip().split(',', 3)
    if len(fields) != 4:
        # Fail loudly: a missing comma must not yield a silently mangled record.
        raise ValueError(
            f'Expected 4 comma-separated fields, got {len(fields)}: {line!r}'
        )

    name, first_number, second_number, params = (field.strip() for field in fields)
    return [name, float(first_number), float(second_number), ast.literal_eval(params)]

# Load the previously fitted distributions. Each usable line is turned by
# process_line into [name, float, float, params].
with open(Path('./data/stats/toilet_fits.txt'), 'r') as f:
    # Skip comment lines (anything containing '#') and blank lines, which
    # process_line cannot parse.
    lines = [process_line(line) for line in f if line.strip() and '#' not in line]

min_value = min(visit_times)
max_value = max(visit_times)

# Empirical density over 10 equal-width bins; np.histogram returns the
# densities and the 11 bin edges.
y, x = np.histogram(visit_times, bins=10, density=True)
bin_size = x[1] - x[0]
# Evaluate the fitted pdf at the bin centres: each histogram value is the
# average density over its bin, so the centre (not the left edge) is the
# matching abscissa for the Pearson comparison below.
x = x[:-1] + bin_size / 2

distrs = []
for distr_data in lines:
    # The third field is reported below as the KS p-value; the second field
    # is not used in this cell.
    name, _unused, ks_p_value, params = distr_data
    res = stats.cramervonmises(visit_times, name, args=params)
    dist = getattr(stats, name)(*params)
    y_dist = dist.pdf(x)
    try:
        r, p_pearson = stats.pearsonr(y, y_dist)
    except Exception:
        # Best effort: if pearsonr fails for this distribution (e.g. on
        # non-finite pdf values), report NaN instead of aborting the table.
        # Catching Exception rather than using a bare except keeps Ctrl-C
        # able to interrupt the cell.
        r = np.nan
        p_pearson = np.nan
    distrs.append((name, ks_p_value, res.pvalue, params, r, p_pearson))

# Width of the name column in the printed table.
name_length_max = max((len(entry[0]) for entry in distrs), default=0)

# Rank by Cramér–von Mises p-value, best fit first.
distrs.sort(key=lambda el: el[2], reverse=True)

print(f'{"name":>{name_length_max}s} - {"ks":>6s} - {"cvm":>6s} - p - {"pear_r":>6s} - {"pear_p":>6s}')
for name, ks_p_value, cvm_p_value, params, r, p_pearson in distrs:
    print(f'{name:>{name_length_max}s} - {ks_p_value:.4f} - {cvm_p_value:.4f} - {len(params)} - {r:.4f} - {p_pearson:.4f}')

### Plots showing the fit of the distributions, truncated using a minimum and maximum value based on the data

In [None]:
# Draw one subplot per ranked distribution: the data histograms with the
# fitted pdf on top.
plt.rcParams['figure.figsize'] = [28.5,180]

col_count = 5
# Size the grid from the list that is actually plotted; at least one row so
# plt.subplots is never asked for an empty grid.
row_count = max(1, int(ceil(len(distrs)/col_count)))
_, axes = plt.subplots(row_count, col_count)

axes = axes.flatten()

# Fixed x-window for the pdf curves (used instead of the data's min/max).
min_value = 10
max_value = 450

# Tail probability cut off on each side when choosing a pdf's x-range.
x_per_lim = 0.0001

for ax, (name, ks_p_value, cvm_p_value, params, _r, _p) in zip(axes, distrs):
    for jj, bin_count in enumerate(bin_counts):
        ax.hist(visit_times, bins=bin_count, density=True, label=f'Bin count = {bin_count}', alpha=1-jj*alpha_step)

    dist = getattr(stats, name)(*params)
    # Clip the pdf's central range to the fixed window and sample it in steps of 10.
    x = np.arange(max(min_value, dist.ppf(x_per_lim)), min(max_value,dist.ppf(1-x_per_lim)), 10)
    ax.plot(x, dist.pdf(x), linewidth=4, label='Pdf')
    ax.set_title(f'{name} - p = {cvm_p_value:.4f} : {ks_p_value:.4f}')
    ax.legend(fontsize=14)

# Hide the surplus axes in the last row so they do not show as empty frames.
for ax in axes[len(distrs):]:
    ax.set_axis_off()
