# Assignment 1: Hidden Markov Model
### Author: Jacopo Raffi

In [41]:
import pandas as pd
import hmmlearn.hmm as hmm
import numpy as np
import matplotlib.pyplot as plt

# Read the ';'-separated CSV and keep only the date, the time and the single
# sensor column ('C6H6(GT)') that the assignment needs.
data = pd.read_csv("./data/AirQualityUCI.csv", sep=';').loc[:, ['Date', 'Time', 'C6H6(GT)']]

## Preprocessing data

In [42]:
# Values are written with ',' as the decimal separator; swap it for '.' so the
# sensor column can be cast to float. (The replacement runs over every string
# column of `data`, not only the sensor one.)
data = data.replace(',', '.', regex=True)
data['C6H6(GT)'] = data['C6H6(GT)'].astype(float)

# -200 is treated as the missing-value marker. Turn it into NaN so it does not
# bias the means computed below; np.nan (rather than pd.NA) keeps the column
# as a plain float64 column.
data['C6H6(GT)'] = data['C6H6(GT)'].replace(-200, np.nan)

# Mean of the valid readings for each 'Time' value (i.e. each hour of the day).
means = data.groupby('Time')['C6H6(GT)'].mean()

# Fill every missing reading with the mean of its own 'Time' slot. This is the
# vectorized equivalent of looking up means[row['Time']] row by row.
data['C6H6(GT)'] = data['C6H6(GT)'].fillna(data['Time'].map(means))

## HMM training
### Gaussian vs Mixture of Gaussians

In [43]:
# hmmlearn wants the observations as a 2-D column, so the 1-D sensor series is
# reshaped into the form [[1], [2], [3]].
seq = data['C6H6(GT)'].to_numpy().reshape(-1, 1)

# One (initially empty) list per reported metric; the training loop appends one
# entry to each list for every tested number of states.
results = {
    column: []
    for column in (
        'Gaussian Log-Likelihood',
        'Gaussian Converged',
        'Mix Gaussian Log-Likelihood',
        'Mix Gaussian Converged',
    )
}

In [44]:
# Fit a Gaussian HMM and a Gaussian-mixture HMM on the full sequence for each
# candidate number of hidden states, and record for both models the
# log-likelihood of the training sequence and whether EM converged.
state_counts = [2, 3, 5, 7]

# Start from empty lists so that re-running this cell does not append a second
# batch of rows (which would no longer match the 4-entry index below).
for metric_values in results.values():
    metric_values.clear()

for state in state_counts:
    gauss_model = hmm.GaussianHMM(n_components=state, n_iter=1000)
    mix_gauss_model = hmm.GMMHMM(n_components=state, n_iter=1000)

    gauss_model.fit(seq)
    gauss_converged = gauss_model.monitor_.converged
    gauss_score = gauss_model.score(seq)

    mix_gauss_model.fit(seq)
    # Read the convergence flag from the mixture model's own monitor; reading
    # it from gauss_model would just repeat the Gaussian model's flag.
    mix_gauss_converged = mix_gauss_model.monitor_.converged
    mix_gauss_score = mix_gauss_model.score(seq)

    results['Gaussian Log-Likelihood'].append(gauss_score)
    results['Gaussian Converged'].append(gauss_converged)
    results['Mix Gaussian Log-Likelihood'].append(mix_gauss_score)
    results['Mix Gaussian Converged'].append(mix_gauss_converged)

# After the loop, gauss_model / mix_gauss_model are the models fitted with the
# last entry of state_counts.
df = pd.DataFrame(results, index=state_counts)
df.index.name = "Number of States"

In [45]:
# Intended reading of the hidden states as pollution levels:
#   2 states: high, low
#   3 states: high, medium, low
#   5 states: very high, high, medium, low, very low
#   7 states: very high, high, medium-high, medium, medium-low, low, very low
#
# Observation: the mixture-of-Gaussians HMM seems slightly better than the
# plain Gaussian HMM.
df

Unnamed: 0_level_0,Gaussian Log-Likelihood,Gaussian Converged,Mix Gaussian Log-Likelihood,Mix Gaussian Converged
Number of States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,-27959.609086,True,-27959.611827,True
3,-26189.975249,True,-26189.974009,True
5,-24835.555644,True,-24790.618877,True
7,-23798.334942,True,-23651.62558,True


In [46]:
# Decode the hidden states for the last 25 % of the observation sequence.
# `gauss_model` is the Gaussian HMM left over from the final iteration of the
# training loop, i.e. the one fitted with the largest tested number of states.
subseq_size = len(seq) // 4
subseq = seq[-subseq_size:]

# Decode the same window with both of the requested decoders: 'viterbi' and 'map'.
v_log_prob, v_states = gauss_model.decode(subseq, algorithm='viterbi')
mps_log_prob, mps_states = gauss_model.decode(subseq, algorithm='map')

In [None]:
# TODO: for the plots, use a scatter plot with one colour per state, and also draw a single dashed line (- - - -) for each state, in the same colour as that state's points.