# Assignment 1: Hidden Markov Model
### Author: Jacopo Raffi

In [28]:
import pandas as pd
import hmmlearn.hmm as hmm
import numpy as np
import matplotlib.pyplot as plt

# Read the ';'-separated CSV and keep only the three columns this assignment
# needs: 'Date', 'Time' and the single sensor column 'C6H6(GT)'.
data = pd.read_csv("./data/AirQualityUCI.csv", sep=';').loc[:, ['Date', 'Time', 'C6H6(GT)']]

## Preprocessing data

In [29]:
# The file writes decimals with ',' instead of '.'; swap the separator so the
# sensor column can be converted to float.
data = data.replace(',', '.', regex=True)
data['C6H6(GT)'] = data['C6H6(GT)'].astype(float)

# -200 is the placeholder for a missing reading. Mark it as NaN (not pd.NA) so
# the column stays float64 and the placeholder is ignored when averaging;
# pd.NA cannot be stored in a float64 column and may upcast it to object dtype.
data['C6H6(GT)'] = data['C6H6(GT)'].replace(-200, np.nan)

# Impute every missing reading with the mean of the valid readings that share
# the same 'Time' value. Mapping the per-'Time' means onto the rows is
# vectorised, unlike a row-wise DataFrame.apply, and a row with no matching
# 'Time' simply stays NaN instead of raising KeyError.
means = data.groupby('Time')['C6H6(GT)'].mean()
data['C6H6(GT)'] = data['C6H6(GT)'].fillna(data['Time'].map(means))

## HMM training

In [30]:
STATES = 3  # how many hidden states the models are built with

# hmmlearn expects a 1D series as a column of single-element rows,
# e.g. [[1], [2], [3]], so flatten to numpy and reshape in one step.
seq = data['C6H6(GT)'].to_numpy().reshape(-1, 1)
seq

array([[11.9],
       [ 9.4],
       [ 9. ],
       ...,
       [12.4],
       [ 9.5],
       [11.9]])

In [31]:
SEED = 42      # fixed seed so model initialisation, and therefore the fit, is reproducible
N_ITER = 1000  # upper bound on training iterations, shared by both models

# Gaussian-emission HMM and Gaussian-mixture-emission HMM with the same number
# of hidden states, iteration cap and seed so their scores can be compared.
gauss_model = hmm.GaussianHMM(n_components=STATES, n_iter=N_ITER, random_state=SEED)
# NOTE(review): n_mix is left at hmmlearn's default, which appears to be a
# single mixture component per state; that would make this model effectively
# the same as gauss_model. Confirm against the GMMHMM docs and raise n_mix if
# a real mixture is intended.
mix_gauss_model = hmm.GMMHMM(n_components=STATES, n_iter=N_ITER, random_state=SEED)

In [32]:
# Train the Gaussian HMM on the full series, then display the score the fitted
# model assigns to that same series (fit returns the model, so the calls chain).
gauss_model.fit(seq).score(seq)

-26189.97812912606

In [33]:
# Display the convergence flag recorded by the training monitor of the
# Gaussian HMM fitted above.
# NOTE(review): in hmmlearn this flag may also be True when training stopped
# because the n_iter cap was reached; confirm against the ConvergenceMonitor docs.
gauss_model.monitor_.converged # True if the monitor reports convergence

True

In [34]:
# Train the Gaussian-mixture HMM on the full series, then display the score the
# fitted model assigns to that same series (fit returns the model, so the calls chain).
mix_gauss_model.fit(seq).score(seq)

-26189.97406771059

In [35]:
# Display the convergence flag recorded by the training monitor of the
# Gaussian-mixture HMM fitted above.
# NOTE(review): in hmmlearn this flag may also be True when training stopped
# because the n_iter cap was reached; confirm against the ConvergenceMonitor docs.
mix_gauss_model.monitor_.converged # True if the monitor reports convergence

True

In [36]:
# TODO: predict the hidden states for the last 25% of the time series and plot
# the points coloured by state; repeat for different numbers of states.

TAIL_FRACTION = 0.25  # share of the series, taken from its end, to decode and plot

subseq_size = int(len(seq) * TAIL_FRACTION)
# Slice from an explicit start index. seq[-subseq_size:] would return the WHOLE
# series when subseq_size is 0 (a very short input), because -0 == 0.
subseq = seq[len(seq) - subseq_size:]

len(subseq)

2339

In [37]:
# Decode the tail of the series with the fitted Gaussian HMM using the Viterbi
# algorithm. decode returns two values, unpacked here as `log_prob` and `states`
# (presumably the log-probability of the best path and the hidden-state index
# assigned to each sample; confirm against the hmmlearn docs).
log_prob, states = gauss_model.decode(subseq, algorithm='viterbi')