# Lichen Likelihood Project: Expectation Maximization

In [121]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import os

In [122]:
# Load the dataframe stored in element_analysis.pkl.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
unpickled_df = pd.read_pickle('element_analysis.pkl') 

In [123]:
# Show the columns whose name ends in "binned"; these are the column names
# the model class below refers to.
unpickled_df.columns[unpickled_df.columns.str.endswith('binned')]

Index(['Nitrogen (% dw)_binned', 'Sulfur (% dw)_binned',
       'Phosphorous (ppm dw)_binned', 'Lead (ppm dw)_binned',
       'Copper (ppm dw)_binned', 'Chromium (ppm dw)_binned',
       'Year of tissue collection_binned', 'Air pollution score_binned',
       'Region_binned',
       'Code for scientific name and authority in lookup table_binned'],
      dtype='object')

In [124]:
# Number of rows per distinct value of the Region_binned column.
unpickled_df["Region_binned"].value_counts()

Region_binned
6     6860
10     318
Name: count, dtype: int64

In [125]:
# Helpers
# Work on a copy so the loaded dataframe itself is never modified.
df = unpickled_df.copy()
def idx_map_from_series(s):
    """
    Build an integer encoding for the distinct values of a series.

    Returns a tuple ``(mapping, cats)``: ``cats`` is the list of categories
    that ``pd.Categorical(s)`` derives from ``s`` and ``mapping`` sends each
    category to its position in ``cats``.
    """
    cats = [category for category in pd.Categorical(s).categories]
    mapping = dict(zip(cats, range(len(cats))))
    return mapping, cats

def one_hot_index_from_value(mapping, val):
    """Return the entry stored under ``val`` in ``mapping`` (plain ``mapping[val]`` lookup)."""
    index = mapping[val]
    return index

def normalize_rows(mat, axis=-1, alpha=1e-2):
    """
    Additively smooth ``mat`` and normalize it along ``axis``.

    ``alpha`` is added to every entry of a float copy of ``mat``; each slice
    along ``axis`` is then divided by its sum. Slices whose sum is exactly
    zero are divided by 1 instead, so they stay all-zero rather than
    becoming NaN. ``mat`` itself is not modified.
    """
    smoothed = mat.astype(float) + float(alpha)
    totals = smoothed.sum(axis=axis, keepdims=True)
    safe_totals = np.where(totals == 0, 1.0, totals)
    return smoothed / safe_totals

def logsumexp(a, axis=None):
    """
    Numerically stable ``log(sum(exp(a)))`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating so large
    magnitudes do not overflow. With an integer ``axis`` that axis is
    removed from the result; with ``axis=None`` the reduction runs over all
    entries and the result keeps one length-1 dimension per input
    dimension (the shape the original implementation returned).
    """
    a = np.asarray(a)
    a_max = np.max(a, axis=axis, keepdims=True)
    # If the maximum is +/-inf, `a - a_max` would evaluate inf - inf = NaN.
    # Shifting by 0 in that case yields the correct limit (-inf or +inf).
    shift = np.where(np.isfinite(a_max), a_max, 0.0)
    with np.errstate(divide="ignore"):  # log(0) -> -inf is the intended result
        res = shift + np.log(np.sum(np.exp(a - shift), axis=axis, keepdims=True))
    if axis is not None:
        return np.squeeze(res, axis=axis)
    return res

In [126]:
# Small constant added to probabilities before np.log so that log(0) is
# never evaluated in the class below.
EPS = 1e-10
class BayesianNetworkEM:
    def __init__(self, df, hidden_states=2, seed=0):
        """
        Set up the model from a dataframe of binned columns.

        Seeds NumPy's global RNG with ``seed``, stores a re-indexed copy of
        ``df``, builds a value->index dict (``self.maps``) and an
        index->value list (``self.rev``) for every model column, records the
        number of categories of each variable, draws random CPTs and
        allocates the ``(N, hidden_states)`` responsibility matrix
        ``self.gamma``.
        """
        np.random.seed(seed)
        self.df = df.copy().reset_index(drop=True)
        self.N = len(self.df)
        self.H = hidden_states

        # Column names of the non-element nodes.
        self.year_col = 'Year of tissue collection_binned'
        self.pollution_col = 'Air pollution score_binned'
        self.region_col = 'Region_binned'
        self.species_col = 'Code for scientific name and authority in lookup table_binned'
        # Column names of the element nodes.
        self.elements = [
            'Nitrogen (% dw)_binned',
            'Sulfur (% dw)_binned',
            'Phosphorous (ppm dw)_binned',
            'Lead (ppm dw)_binned',
            'Copper (ppm dw)_binned',
            'Chromium (ppm dw)_binned'
        ]

        # Encode every model column once; keep both directions of the mapping.
        context_cols = [self.year_col, self.pollution_col, self.region_col, self.species_col]
        encodings = {
            col: idx_map_from_series(self.df[col])
            for col in self.elements + context_cols
        }
        self.maps = {col: pair[0] for col, pair in encodings.items()}
        self.rev = {col: pair[1] for col, pair in encodings.items()}

        # Cardinality of each variable.
        self.K_region = len(self.rev[self.region_col])
        self.K_year = len(self.rev[self.year_col])
        self.K_poll = len(self.rev[self.pollution_col])
        self.K_species = len(self.rev[self.species_col])
        self.K_elem = {col: len(self.rev[col]) for col in self.elements}

        self.initialize_random_CPTs()
        self.gamma = np.zeros((self.N, self.H))

    def initialize_random_CPTs(self):
        """
        Draw every CPT from ``np.random.rand`` and normalize it over the
        child variable's axis (the last axis of each table).

        Tables created:
            P_H_given_RY   -- shape (K_region, K_year, H)
            P_Pe_given_H   -- shape (H, K_poll)
            P_Sp_given_HPe -- shape (H, K_poll, K_species)
            P_elem[col]    -- shape (K_poll, K_species, K_elem[col])
        """
        draw = np.random.rand
        # The draw order below determines the values obtained under a fixed
        # seed, so it is kept as: H, Pe, Sp, then the elements in list order.
        hidden_shape = (self.K_region, self.K_year, self.H)
        self.P_H_given_RY = normalize_rows(draw(*hidden_shape), axis=2)

        pollution_shape = (self.H, self.K_poll)
        self.P_Pe_given_H = normalize_rows(draw(*pollution_shape), axis=1)

        species_shape = (self.H, self.K_poll, self.K_species)
        self.P_Sp_given_HPe = normalize_rows(draw(*species_shape), axis=2)

        # Elements are conditioned on (Pe, Sp) only.
        self.P_elem = {
            col: normalize_rows(
                draw(self.K_poll, self.K_species, self.K_elem[col]), axis=2
            )
            for col in self.elements
        }

    def e_step(self):
        """
        E-step: compute the posterior responsibilities of the hidden state.

        For every training row,
            gamma[i, h]  proportional to  P(h | region, year) * P(pe | h) * P(sp | h, pe)
        normalized over h. The element terms are left out because they only
        depend on (Pe, Sp), which are observed, so they cancel in the
        normalization.

        Side effects: overwrites ``self.gamma`` in place (shape (N, H)) and
        stores the empirical marginals ``self.P_region`` and ``self.P_year``.

        Returns:
            ``self.gamma``.
        """
        def encode(col):
            # Integer category index of every row of `col`.
            return np.asarray(self.df[col].map(self.maps[col]), dtype=np.intp)

        idx_region = encode(self.region_col)
        idx_year = encode(self.year_col)
        idx_species = encode(self.species_col)
        idx_poll = encode(self.pollution_col)

        # minlength keeps one entry per known category even when the
        # highest-indexed categories have no rows (e.g. unused categories of
        # a categorical column); otherwise these vectors could be shorter
        # than K_region / K_year.
        self.P_region = np.bincount(idx_region, minlength=self.K_region) / self.N
        self.P_year = np.bincount(idx_year, minlength=self.K_year) / self.N

        # Each term below has shape (N, H).
        log_ph = np.log(self.P_H_given_RY[idx_region, idx_year, :] + EPS)
        log_ppe = np.log(self.P_Pe_given_H[:, idx_poll].T + EPS)
        log_psp = np.log(self.P_Sp_given_HPe[:, idx_poll, idx_species].T + EPS)

        log_unnorm = log_ph + log_ppe + log_psp
        log_denom = logsumexp(log_unnorm, axis=1)
        # Write in place so existing references to self.gamma stay valid.
        self.gamma[:, :] = np.exp(log_unnorm - log_denom[:, None])

        return self.gamma

    def compute_posterior_hidden_probs(self, sample_row):
        """
        Posterior over the hidden state for a single sample.

        ``sample_row`` must contain the region, year and species columns.
        If the pollution column is present and not NaN it is used as
        evidence, and the result is proportional to
            P(h | r, y) * P(pe | h) * P(sp | h, pe).
        Otherwise pollution is summed out; in that case every element
        column must be present and enters through P(elem | pe, sp).

        Returns a length-H vector normalized over the hidden states.
        """
        region_i = self.maps[self.region_col][sample_row[self.region_col]]
        year_i = self.maps[self.year_col][sample_row[self.year_col]]
        species_i = self.maps[self.species_col][sample_row[self.species_col]]

        log_prior = np.log(self.P_H_given_RY[region_i, year_i, :] + EPS)

        has_pollution = (
            self.pollution_col in sample_row.index
            and pd.notna(sample_row[self.pollution_col])
        )
        if has_pollution:
            pe_i = self.maps[self.pollution_col][sample_row[self.pollution_col]]
            log_joint = (
                log_prior
                + np.log(self.P_Pe_given_H[:, pe_i] + EPS)
                + np.log(self.P_Sp_given_HPe[:, pe_i, species_i] + EPS)
            )
            return np.exp(log_joint - logsumexp(log_joint, axis=0))

        # Pollution unobserved: build, for every candidate bucket pe, the
        # per-hidden-state log term
        #   log P(pe | h) + log P(sp | h, pe) + sum_k log P(elem_k | pe, sp)
        elem_idx = {col: self.maps[col][sample_row[col]] for col in self.elements}
        per_bucket = np.zeros((self.K_poll, self.H))
        for pe_i in range(self.K_poll):
            pollution_term = np.log(self.P_Pe_given_H[:, pe_i] + EPS)
            species_term = np.log(self.P_Sp_given_HPe[:, pe_i, species_i] + EPS)
            elem_term = 0.0
            for col, value_i in elem_idx.items():
                elem_term += np.log(self.P_elem[col][pe_i, species_i, value_i] + EPS)
            per_bucket[pe_i, :] = pollution_term + species_term + elem_term

        # Sum the buckets out (in log space), then add the prior on h.
        log_post = log_prior + logsumexp(per_bucket, axis=0)
        return np.exp(log_post - logsumexp(log_post, axis=0))

    def m_step(self):
        """
        M-step: re-estimate every CPT from the current responsibilities.

        Updates, in this order:
            - P(H | Region, Year)              via update_hidden_CPT
            - P(Pollution | H)                 via update_pollution_CPT
            - P(Species | H, Pollution)        via update_species_CPT
            - P(Element_i | Pollution, Species) via update_element_bucket_CPTs

        The first three use the expected counts stored in ``self.gamma``;
        the element tables use plain counts because their parents are
        observed. Delegating to the per-table methods keeps a single
        implementation of each update instead of a duplicated copy here.
        """
        self.update_hidden_CPT()
        self.update_pollution_CPT()
        self.update_species_CPT()
        self.update_element_bucket_CPTs()

    def update_hidden_CPT(self):
        """
        Re-estimate P(H | Region, Year).

        Each row adds its responsibility vector to the cell of its
        (region, year) pair; the accumulated table is then passed through
        ``normalize_rows`` over the hidden-state axis.
        """
        region_idx = self.df[self.region_col].map(self.maps[self.region_col]).values
        year_idx = self.df[self.year_col].map(self.maps[self.year_col]).values

        expected = np.zeros((self.K_region, self.K_year, self.H))
        for r, y, resp in zip(region_idx, year_idx, self.gamma):
            expected[r, y, :] += resp

        self.P_H_given_RY = normalize_rows(expected, axis=2)

    def update_pollution_CPT(self):
        """
        Re-estimate P(Pollution | H).

        Each row adds its responsibility vector to the column of its
        observed pollution bucket; the table is then passed through
        ``normalize_rows`` over the pollution axis.
        """
        poll_idx = self.df[self.pollution_col].map(self.maps[self.pollution_col]).values

        expected = np.zeros((self.H, self.K_poll))
        for p, resp in zip(poll_idx, self.gamma):
            expected[:, p] += resp

        self.P_Pe_given_H = normalize_rows(expected, axis=1)

    def update_species_CPT(self):
        """
        Re-estimate P(Species | H, Pollution).

        Each row adds its responsibility vector to the cell of its observed
        (pollution, species) pair; the table is then passed through
        ``normalize_rows`` over the species axis.
        """
        species_idx = self.df[self.species_col].map(self.maps[self.species_col]).values
        poll_idx = self.df[self.pollution_col].map(self.maps[self.pollution_col]).values

        expected = np.zeros((self.H, self.K_poll, self.K_species))
        for p, s, resp in zip(poll_idx, species_idx, self.gamma):
            expected[:, p, s] += resp

        self.P_Sp_given_HPe = normalize_rows(expected, axis=2)

    def update_element_bucket_CPTs(self):
        """
        Re-estimate P(ElementBucket | Pollution, Species) for every element.

        The parents (pollution, species) are observed, so these tables are
        built from plain occurrence counts and do not use ``self.gamma``.
        Each count table is passed through ``normalize_rows`` over the
        element-bucket axis and stored in ``self.P_elem[col]``.
        """
        def encode(col):
            # Integer category index of every row of `col`, computed once
            # per column instead of one .iloc + dict lookup per row.
            return np.asarray(self.df[col].map(self.maps[col]), dtype=np.intp)

        idx_poll = encode(self.pollution_col)
        idx_species = encode(self.species_col)

        for col in self.elements:
            counts_elem = np.zeros((self.K_poll, self.K_species, self.K_elem[col]))
            # np.add.at accumulates repeated index triples (plain fancy-index
            # "+=" would count each distinct triple only once).
            np.add.at(counts_elem, (idx_poll, idx_species, encode(col)), 1.0)
            self.P_elem[col] = normalize_rows(counts_elem, axis=2)

    def log_likelihood(self):
        """
        Observed-data log-likelihood of the training rows.

        For a row with pollution observed:
            log sum_h P(h | r, y) P(pe | h) P(sp | h, pe) prod_k P(elem_k | pe, sp)
        For a row whose pollution value is missing, the same joint is
        additionally summed over every pollution bucket pe. Both cases use
        the same per-(pe, h) term, so the missing-pollution case includes
        the species factor P(sp | h, pe) as well.

        Returns:
            The total log-likelihood as a float.
        """
        idx_region = self.df[self.region_col].map(self.maps[self.region_col]).values
        idx_year = self.df[self.year_col].map(self.maps[self.year_col]).values
        idx_species = self.df[self.species_col].map(self.maps[self.species_col]).values
        idx_poll = self.df[self.pollution_col].map(self.maps[self.pollution_col]).values
        poll_observed = self.df[self.pollution_col].notna().values
        # Element indices are encoded once per column rather than once per row.
        idx_elem = [
            (col, self.df[col].map(self.maps[col]).values) for col in self.elements
        ]

        total = 0.0
        for i in range(self.N):
            r = int(idx_region[i])
            y = int(idx_year[i])
            s = int(idx_species[i])
            elem_vals = [(col, int(vals[i])) for col, vals in idx_elem]

            log_ph = np.log(self.P_H_given_RY[r, y, :] + EPS)  # (H,)

            # Pollution buckets to sum over: the observed one, or all of
            # them when the value is missing. int() is applied only to
            # observed values because a missing one maps to NaN.
            if poll_observed[i]:
                buckets = [int(idx_poll[i])]
            else:
                buckets = range(self.K_poll)

            log_terms = np.empty((len(buckets), self.H))
            for k, pe in enumerate(buckets):
                log_elems = 0.0  # identical for every h
                for col, val in elem_vals:
                    log_elems += np.log(self.P_elem[col][pe, s, val] + EPS)
                log_terms[k, :] = (
                    log_ph
                    + np.log(self.P_Pe_given_H[:, pe] + EPS)
                    + np.log(self.P_Sp_given_HPe[:, pe, s] + EPS)
                    + log_elems
                )
            # Sum over both pe and h in log space.
            total += float(logsumexp(log_terms.ravel(), axis=0))
        return total

    def run(self, max_iters=100, tol=1e-6, verbose=True):
        """
        Run EM starting from the CPTs currently stored on the model.

        Each iteration performs ``e_step()``, ``m_step()`` and evaluates
        ``log_likelihood()``. Iteration stops early once the absolute change
        in log-likelihood between two consecutive iterations is below
        ``tol``. The CPTs are NOT re-initialized here; the random
        initialization happens in ``__init__``.

        Side effect: writes the log-likelihood curve to
        ``plots/log_likelihood_curve.png`` (the directory is created if
        needed).

        Args:
            max_iters: maximum number of EM iterations.
            tol: convergence threshold on the log-likelihood change.
            verbose: print the log-likelihood of every iteration.

        Returns:
            Dict of the learned tables (the model's own arrays, not copies).
            ``'P_Sp_given_HPe'`` holds P(Species | H, Pollution);
            ``'P_Sp_given_H'`` is the same table under its historical key,
            kept for backward compatibility.
        """
        lls = []
        prev_ll = -np.inf
        for it in range(1, max_iters + 1):
            self.e_step()
            self.m_step()
            ll = self.log_likelihood()
            lls.append(ll)
            if verbose:
                print(f"Iter {it:3d}  ll = {ll:.6f}")
            if np.isfinite(prev_ll) and abs(ll - prev_ll) < tol:
                if verbose:
                    print("Converged (tol).")
                break
            prev_ll = ll
        else:
            # Loop ended without hitting the tolerance check.
            if verbose:
                print("Stopped at max_iters without reaching tol.")

        plt.figure(figsize=(8, 5))
        plt.plot(lls, marker='o')
        plt.title("EM Training Log-Likelihood")
        plt.xlabel("Iteration")
        plt.ylabel("Log-Likelihood")
        plt.grid(True)
        os.makedirs("plots", exist_ok=True)
        save_path = "plots/log_likelihood_curve.png"
        plt.savefig(save_path, dpi=300, bbox_inches="tight")
        plt.close()

        return {
            'P_H_given_RY': self.P_H_given_RY,
            'P_Pe_given_H': self.P_Pe_given_H,
            'P_Sp_given_H': self.P_Sp_given_HPe,
            'P_Sp_given_HPe': self.P_Sp_given_HPe,
            'P_elem': self.P_elem
        }

    def predict_pollution_distribution(self, row):
        """
        Compute P(Pollution | region, year, species, observed elements).

        Exact inference in the network:
            P(pe | row)  proportional to
                sum_h P(h | r, y) * P(pe | h) * P(sp | h, pe)
                * prod_k P(elem_k | pe, sp)

        Why not ``sum_h P(pe | h) * P(h | row)``: species and the elements
        are children of Pe, so given the row Pe is not independent of them
        conditional on H, and that shortcut drops their evidence about Pe.
        Any pollution value present in ``row`` is deliberately ignored,
        since it is the quantity being predicted.

        Args:
            row: Series with the region, year and species columns. Element
                columns that are absent or NaN are treated as unobserved
                and marginalized out (their factor sums to 1).

        Returns:
            Length-K_poll vector normalized over the pollution buckets.
        """
        r = self.maps[self.region_col][row[self.region_col]]
        y = self.maps[self.year_col][row[self.year_col]]
        s = self.maps[self.species_col][row[self.species_col]]

        log_ph = np.log(self.P_H_given_RY[r, y, :] + EPS)            # (H,)
        log_ppe = np.log(self.P_Pe_given_H.T + EPS)                  # (Kp, H)
        log_psp = np.log(self.P_Sp_given_HPe[:, :, s].T + EPS)       # (Kp, H)

        # Element evidence depends on pe only (species is fixed).  (Kp,)
        log_elems = np.zeros(self.K_poll)
        for col in self.elements:
            if col in row.index and pd.notna(row[col]):
                val = self.maps[col][row[col]]
                log_elems += np.log(self.P_elem[col][:, s, val] + EPS)

        log_joint = log_ph[None, :] + log_ppe + log_psp + log_elems[:, None]
        log_pe = logsumexp(log_joint, axis=1)  # sum the hidden state out
        return np.exp(log_pe - logsumexp(log_pe, axis=0))
    def compute_global_hidden_prior(self):
        """
        Compute prior P(H) using dataset
        """
        idx_region = self.df[self.region_col].map(self.maps[self.region_col]).values
        idx_year   = self.df[self.year_col].map(self.maps[self.year_col]).values
        counts_RF = np.zeros((self.K_region, self.K_year))
        for i in range(self.N):
            counts_RF[idx_region[i], idx_year[i]] += 1
        p_RF = counts_RF / counts_RF.sum()

        # prior on H=sum_{R,F} P(H|R,F) * P(R,F)
        P_H = (self.P_H_given_RY * p_RF[...,None]).sum(axis=(0,1))
        P_H = P_H / P_H.sum()
        return P_H
    def predict_pollution_from_elem_species(self, row, P_H=None):
        """
        Compute P(Pe | element bins, species) without using region or year.

        The hidden state is summed out under a prior P(H):
            P(pe, sp, elems) = sum_h P(h) * P(pe | h) * P(sp | h, pe)
                               * prod_k P(elem_k | pe, sp)
        The species factor P(sp | h, pe) belongs inside the sum over h
        because species depends on both H and Pe in this network.

        Args:
            row: mapping/Series with the species column and every element
                column.
            P_H: optional length-H prior over the hidden state; defaults to
                ``compute_global_hidden_prior()``.

        Returns:
            Length-K_poll vector normalized over the pollution buckets.
        """
        if P_H is None:
            P_H = self.compute_global_hidden_prior()
        sp = self.maps[self.species_col][row[self.species_col]]
        # Encode the element values once; they do not depend on pe.
        elem_vals = [(col, self.maps[col][row[col]]) for col in self.elements]
        log_prior = np.log(np.asarray(P_H) + EPS)

        Kp = self.K_poll
        scores = np.zeros(Kp)
        for pe in range(Kp):
            # log [ P(h) P(pe | h) P(sp | h, pe) ] for every h
            log_h_terms = (
                log_prior
                + np.log(self.P_Pe_given_H[:, pe] + EPS)
                + np.log(self.P_Sp_given_HPe[:, pe, sp] + EPS)
            )
            log_el = 0.0
            for col, val in elem_vals:
                log_el += np.log(self.P_elem[col][pe, sp, val] + EPS)
            scores[pe] = logsumexp(log_h_terms, axis=0) + log_el
        # Normalize over pollution buckets in log space.
        return np.exp(scores - logsumexp(scores, axis=0))
    def print_CPTs(self, precision=4):
        """
        Print all CPTs with actual category labels instead of numeric indices.
        Structured so mapping to LaTeX tables is easy.
        """

        def fmt_prob(p):
            return f"{p:.{precision}f}"
        print("\n P(Region) ")
        for r in range(self.K_region):
            name = self.rev[self.region_col][r]
            print(f"{name}: {self.P_region[r]:.{precision}f}")

        # Prior P(Year)
        print("\n P(Year) ")
        for y in range(self.K_year):
            name = self.rev[self.year_col][y]
            print(f"{name}: {self.P_year[y]:.{precision}f}")
        # P(H | Region, Year)
        print("\n CPT: P(H | Region, Year)")    
        for r_idx, r in enumerate(self.rev[self.region_col]):
            for y_idx, y in enumerate(self.rev[self.year_col]):
                row = self.P_H_given_RY[r_idx, y_idx]
                probs = "  ".join([f"H={h}: {fmt_prob(row[h])}" for h in range(self.H)])
                print(f"Region={r:20s}  Year={y:20s}  ->  {probs}")

        #  P(Pollution | H) 
        print("\n CPT: P(Pollution | H) ")
        for h in range(self.H):
            row = self.P_Pe_given_H[h]
            for p_idx, p in enumerate(self.rev[self.pollution_col]):
                print(f"H={h}  Pollution={p:25s}  ->  {fmt_prob(row[p_idx])}")
            print()

        #  P(Species | H, Pollution) 
        print("\n CPT: P(Species | H, Pollution) ")
        for h in range(self.H):
            for p_idx, p in enumerate(self.rev[self.pollution_col]):
                row = self.P_Sp_given_HPe[h, p_idx]
                probs = "  ".join(
                    f"{self.rev[self.species_col][s_idx]}: {fmt_prob(row[s_idx])}"
                    for s_idx in range(self.K_species)
                )
                print(f"H={h}  Pollution={p:25s}  ->  {probs}")
            print()

        #  P(Element | Pollution, Species) 
        for col in self.elements:
            print(f"\n CPT: P({col} | Pollution, Species) ")
            for p_idx, p in enumerate(self.rev[self.pollution_col]):
                for s_idx, s in enumerate(self.rev[self.species_col]):
                    row = self.P_elem[col][p_idx, s_idx]
                    probs = "  ".join(
                        f"{self.rev[col][e_idx]}: {fmt_prob(row[e_idx])}"
                        for e_idx in range(self.K_elem[col])
                    )
                    print(f"Pollution={p:25s}  Species={s:25s}  ->  {probs}")
                print()


In [127]:
# Columns the model uses: six binned element columns plus year, pollution
# score, region and species code.
cols_we_keep = [
    'Nitrogen (% dw)_binned',
    'Sulfur (% dw)_binned',
    'Phosphorous (ppm dw)_binned',
    'Lead (ppm dw)_binned',
    'Copper (ppm dw)_binned',
    'Chromium (ppm dw)_binned',
    'Year of tissue collection_binned',
    'Air pollution score_binned',
    'Region_binned',
    'Code for scientific name and authority in lookup table_binned'
]
# Keep only complete rows so every model column has a value.
df_clean = df[cols_we_keep].dropna().reset_index(drop=True)

# 80/20 train/test split with a fixed random_state for repeatability.
train_df, test_df = train_test_split(df_clean, test_size=0.2, random_state=42)
# train em
train_df = train_df.reset_index(drop=True)
model = BayesianNetworkEM(train_df,hidden_states=2, seed=0) 

# The dict returned by run() is not captured here; the learned tables stay
# available as attributes of `model`.
model.run(max_iters=200, tol=1e-6)

# eval on test set
true_test_labels = test_df['Air pollution score_binned'].values
# NOTE(review): poll_mapping is not used again within this cell.
poll_mapping = model.maps[model.pollution_col]
predicted_labels = []

# Predict each test row's pollution bucket from its species and element bins
# (this predictor does not read the row's region or year).
# NOTE(review): a test row holding a category value absent from train_df
# would raise KeyError inside the model's lookup tables -- confirm the split
# cannot produce that.
for _, row in test_df.iterrows():
    probs = model.predict_pollution_from_elem_species(row)
    # Pick most likely bucket
    pred_idx = np.argmax(probs)
    pred_label = model.rev[model.pollution_col][pred_idx]
    predicted_labels.append(pred_label)

predicted_labels = np.array(predicted_labels)

# accuracy score
accuracy = np.mean(predicted_labels == true_test_labels)
print(f"Pollution bucket prediction accuracy: {accuracy:.3f}")


Iter   1  ll = -45155.837438
Iter   2  ll = -45114.606336
Iter   3  ll = -45101.784938
Iter   4  ll = -45095.777810
Iter   5  ll = -45091.430278
Iter   6  ll = -45087.350140
Iter   7  ll = -45083.176200
Iter   8  ll = -45078.846012
Iter   9  ll = -45074.370626
Iter  10  ll = -45069.762442
Iter  11  ll = -45065.027561
Iter  12  ll = -45060.182063
Iter  13  ll = -45055.265722
Iter  14  ll = -45050.344138
Iter  15  ll = -45045.500704
Iter  16  ll = -45040.822670
Iter  17  ll = -45036.386428
Iter  18  ll = -45032.246837
Iter  19  ll = -45028.433153
Iter  20  ll = -45024.951045
Iter  21  ll = -45021.788143
Iter  22  ll = -45018.920453
Iter  23  ll = -45016.317915
Iter  24  ll = -45013.948448
Iter  25  ll = -45011.780539
Iter  26  ll = -45009.784699
Iter  27  ll = -45007.934152
Iter  28  ll = -45006.205059
Iter  29  ll = -45004.576470
Iter  30  ll = -45003.030147
Iter  31  ll = -45001.550330
Iter  32  ll = -45000.123491
Iter  33  ll = -44998.738087
Iter  34  ll = -44997.384346
Iter  35  ll =

In [128]:
model.print_CPTs(precision=4)


 P(Region) 
10: 0.0446
6: 0.9554

 P(Year) 
before 1995: 0.2201
1995-2005: 0.6141
2005-present: 0.1658

 CPT: P(H | Region, Year)
Region=10                    Year=before 1995           ->  H=0: 0.9998  H=1: 0.0002
Region=10                    Year=1995-2005             ->  H=0: 0.9995  H=1: 0.0005
Region=10                    Year=2005-present          ->  H=0: 0.9809  H=1: 0.0191
Region=6                     Year=before 1995           ->  H=0: 0.2061  H=1: 0.7939
Region=6                     Year=1995-2005             ->  H=0: 0.1377  H=1: 0.8623
Region=6                     Year=2005-present          ->  H=0: 0.1095  H=1: 0.8905

 CPT: P(Pollution | H) 
H=0  Pollution=low                        ->  0.8047
H=0  Pollution=medium                     ->  0.1276
H=0  Pollution=high                       ->  0.0677

H=1  Pollution=low                        ->  0.2222
H=1  Pollution=medium                     ->  0.3884
H=1  Pollution=high                       ->  0.3894


 CPT: P(Speci