In [2]:
# importing the three main libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# importing parts of scipy
from scipy.special import logsumexp
from scipy.optimize import minimize
from scipy.stats import norm

# Reading the data file
df = pd.read_csv('data/building_1298.csv')
df.set_index(pd.to_datetime(df['datetime']), inplace=True, drop=True)
df.fillna(method='ffill', inplace=True)
df

  df.fillna(method='ffill', inplace=True)


Unnamed: 0_level_0,datetime,m0,m1,m2,m3,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-01-01 00:00:00,2016-01-01 00:00:00,416.169,1994.63,2334.99,0.0000,5.6,0.0,-0.6,0.0,1019.3,300.0,2.6
2016-01-01 01:00:00,2016-01-01 01:00:00,408.616,2101.56,2755.43,79.5127,5.6,0.0,-0.6,0.0,1019.3,300.0,2.6
2016-01-01 02:00:00,2016-01-01 02:00:00,412.072,1885.37,2564.32,0.0000,5.6,4.0,-0.6,0.0,1019.4,300.0,2.6
2016-01-01 03:00:00,2016-01-01 03:00:00,393.053,1909.73,2804.94,0.0000,5.6,4.0,-1.1,0.0,1019.4,300.0,1.5
2016-01-01 04:00:00,2016-01-01 04:00:00,404.519,1882.42,2621.65,132.9570,5.0,4.0,-2.2,0.0,1019.2,290.0,3.1
...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 19:00:00,2016-12-31 19:00:00,447.916,2043.48,3037.75,787.4190,6.7,0.0,-8.3,0.0,1015.8,200.0,6.7
2016-12-31 20:00:00,2016-12-31 20:00:00,455.557,2030.27,2990.84,684.1600,5.6,4.0,-6.7,0.0,1015.3,200.0,5.7
2016-12-31 21:00:00,2016-12-31 21:00:00,461.566,2025.17,3016.03,911.1440,5.6,4.0,-6.7,0.0,1014.9,190.0,5.7
2016-12-31 22:00:00,2016-12-31 22:00:00,446.004,2098.93,2682.46,911.4310,5.6,4.0,-6.7,0.0,1014.0,190.0,6.7


In [3]:
def forward(y, a, mu, sig):
    """ Calculates the likelihood from parameters a, mu and sig
    Arguments:
        y: dependent variable [N]
        a: transition matrix [KxK]
        mu: emission means [K]
        sig: emission standard deviations [K]
    Returns:
        The total log-likelihood
    """
    N = len(y)
    logalpha = np.zeros((N,K)) # log of the forward variable defined above
    # Initialisation
    pi0 = 1/K * np.ones(K)  # initial probabilities. Supposed known here.
    logalpha[0] = np.log(pi0) + norm.logpdf(y[0], loc=mu, scale=sig)
    # Recursion
    for t in range(1, N):
        for j in range(K):
            logalpha[t,j] = logsumexp(logalpha[t-1,:] + np.log(a[:,j]) + norm.logpdf(y[t],
                            loc=mu[j], scale=sig[j]) )
    # Termination
    return logsumexp(logalpha[-1])

# Training subset
training_start = '2016-01-01'
training_end = '2016-01-31'
df_train = df.drop(df.index[(df.index < pd.to_datetime(training_start)) |(df.index > pd.to_datetime(training_end))])

# choosing meter 0 as dependent variable
df_train['y'] = df_train['m0']
# removing some outliers
#df['y'][df['m0'] < 300] = df['m0'].mean()
# normalizing y between 0 and 1
df_train['y'] = (df_train['y'] - df_train['y'].min()) / ( df_train['y'].max() - df_train['y'].min() )

def objective(x):
    # Reshaping the parameter vector x into the three variables of the forward algorithm
    a1 = np.reshape(x[:K*(K-1)], (K,K-1))   # Matrix a without the right column
    a2 = (1-a1.sum(axis=1))[:,np.newaxis]   # Right column of matrix a
    a = np.concatenate([a1, a2],axis=1)
    mu = x[K*(K-1):K*(K-1)+K]
    sig = x[K*(K-1)+K:]
    # Returns the minus log likelihood
    return -forward(df_train['y'], a, mu, sig)

# Initial parameter values to be passed to scipy.minimize()
K = 2                           # nombre d'états possibles
a_init = np.array([[0.9],[0.1]])
mu_init = [0.2, 0.6]     # valeurs moyennes des émissions
sig_init = [0.1, 0.1]      # écarts types des émissions

# Parameters are assembled into a single array x with given bounds
x0 = np.concatenate( [a_init.flatten(), mu_init, sig_init] )
bounds = (K*(K-1)*[(0,1)] + 2*K*[(0, None)])

# Training
res = minimize(objective, x0, bounds=bounds)

# Variables are recovered from the fitted x array
a1 = np.reshape(res.x[:K*(K-1)], (K,K-1))
a2 = (1-a1.sum(axis=1))[:,np.newaxis]
a = np.concatenate([a1, a2],axis=1)
mu = res.x[K*(K-1):K*(K-1)+K]
sig = res.x[K*(K-1)+K:]

y = df_train['y']
N = len(y)

z = np.zeros(N)  # hidden state to be determined
best_logp = np.zeros((N, K))  # delta in the description above
back_ptr = np.zeros((N, K))  # psi in the description above

# Initialisation
best_logp[0] = norm.logpdf(y[0], loc=mu, scale=sig)

# Recursion
for t in range(1, N):
    for k in range(K):
        logp = best_logp[t - 1] + np.log(a[:, k]) + norm.logpdf(y[t], loc=mu[k], scale=sig[k])
        best_logp[t, k] = np.max(logp)
        back_ptr[t, k] = np.argmax(logp)
        
# Backtracking
z[-1] = np.argmax(best_logp[-1])
for t in range(1, N):
    z[-1 - t] = back_ptr[-1 - t + 1, int(z[-1 - t + 1])]

  logalpha[0] = np.log(pi0) + norm.logpdf(y[0], loc=mu, scale=sig)
  logalpha[t,j] = logsumexp(logalpha[t-1,:] + np.log(a[:,j]) + norm.logpdf(y[t],
  logalpha[t,j] = logsumexp(logalpha[t-1,:] + np.log(a[:,j]) + norm.logpdf(y[t],
  x = np.asarray((x - loc)/scale, dtype=dtyp)
  x = np.asarray((x - loc)/scale, dtype=dtyp)
  best_logp[0] = norm.logpdf(y[0], loc=mu, scale=sig)
  logp = best_logp[t - 1] + np.log(a[:, k]) + norm.logpdf(y[t], loc=mu[k], scale=sig[k])


In [4]:
y_star_mean = np.zeros(N)
y_star_std = np.zeros(N)
for k in range(K):
    y_star_mean[z == k] = mu[k]
    y_star_std[z == k] = sig[k]

y_star = np.random.normal(loc=y_star_mean, scale=y_star_std)