In [2]:
import numpy as np
import pandas as pd
import pickle as pkl
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [3]:
# Load the lichen element-analysis table (columns carry a `_binned` suffix).
# NOTE(review): read_pickle can execute arbitrary code while unpickling —
# only load this file from a trusted source.
data = pd.read_pickle('element_analysis.pkl')

# Preview the first rows to confirm the expected columns are present.
data.head()

Unnamed: 0,Nitrogen (% dw)_binned,Sulfur (% dw)_binned,Phosphorous (ppm dw)_binned,Lead (ppm dw)_binned,Copper (ppm dw)_binned,Chromium (ppm dw)_binned,Year of tissue collection_binned,Air pollution score_binned,Region_binned,Code for scientific name and authority in lookup table_binned
1,high,medium,medium,low,low,medium,before 1995,high,6,Species 4
3,medium,high,medium,high,high,high,before 1995,high,6,Species 3
5,medium,high,medium,high,medium,medium,1995-2005,medium,6,Species 5
8,high,medium,high,low,low,high,1995-2005,high,6,Other
10,low,high,high,low,high,high,1995-2005,high,6,Species 3


In [4]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [5]:
def count_occurrences(data):
    """
    Tally, in one sweep over the rows, how often each node value occurs.

    Region and Date are keyed by their own value. Pollution is keyed by
    (region, date, pollution), Species by (region, date, pollution, species),
    and every tissue node by (species, pollution, tissue value).
    ``parent_counts`` holds the matching tallies with the node's own value
    (the last key element) left out.

    :param data: pandas df of raw data containing the ``*_binned`` columns
    :return: tuple ``(counts, parent_counts)``; each maps a node name to a
             ``defaultdict(int)`` of tallies
    """
    # Tissue node name -> dataframe column that stores its binned value.
    tissue_columns = {
        'Nitrogen': 'Nitrogen (% dw)_binned',
        'Sulfur': 'Sulfur (% dw)_binned',
        'Phosphorus': 'Phosphorous (ppm dw)_binned',
        'Lead': 'Lead (ppm dw)_binned',
        'Copper': 'Copper (ppm dw)_binned',
        'Chromium': 'Chromium (ppm dw)_binned',
    }
    conditional_nodes = ['Pollution', 'Species', *tissue_columns]

    counts = {
        node: defaultdict(int)
        for node in ['Region', 'Date', *conditional_nodes]
    }
    parent_counts = {node: defaultdict(int) for node in conditional_nodes}

    for _, record in data.iterrows():
        region = record['Region_binned']
        period = record['Year of tissue collection_binned']
        pollution = record['Air pollution score_binned']
        species = record['Code for scientific name and authority in lookup table_binned']
        tissue_values = {
            node: record[column] for node, column in tissue_columns.items()
        }

        # Root nodes have no parents.
        counts['Region'][region] += 1
        counts['Date'][period] += 1

        # Pollution depends on (region, date).
        region_period = (region, period)
        counts['Pollution'][region_period + (pollution,)] += 1
        parent_counts['Pollution'][region_period] += 1

        # Species depends on (region, date, pollution).
        region_period_pollution = region_period + (pollution,)
        counts['Species'][region_period_pollution + (species,)] += 1
        parent_counts['Species'][region_period_pollution] += 1

        # Every tissue node depends on (species, pollution).
        species_pollution = (species, pollution)
        for node, value in tissue_values.items():
            counts[node][species_pollution + (value,)] += 1
            parent_counts[node][species_pollution] += 1

    return counts, parent_counts

# Tally node and parent occurrences on the training split only.
counts, parent_counts = count_occurrences(train_data)

In [6]:
def compute_CPTs(counts, parent_counts, N):
    """
    Turn raw tallies into conditional probability tables.

    Root nodes (Region, Date) are normalised by the dataset size. Every other
    node is normalised by the tally of its parent configuration, which is the
    node's own key with its final element (the node's value) removed.

    :param counts: counts from count_occurrences
    :param parent_counts: parent_counts from count_occurrences
    :param N: size of dataset
    :return: CPTs as a dict of dicts containing the probabilities
    """
    root_nodes = ('Region', 'Date')
    conditional_nodes = (
        'Pollution', 'Species',
        'Nitrogen', 'Sulfur', 'Phosphorus', 'Lead', 'Copper', 'Chromium',
    )

    CPTs = {}

    # Unconditional nodes: relative frequency over the whole dataset.
    for node in root_nodes:
        CPTs[node] = {
            value: tally / N for value, tally in counts[node].items()
        }

    # Conditional nodes: frequency relative to the parent configuration.
    for node in conditional_nodes:
        table = {}
        for key, tally in counts[node].items():
            table[key] = tally / parent_counts[node][key[:-1]]
        CPTs[node] = table

    return CPTs

# Normalise the training tallies into CPTs; N is the number of training rows.
CPTS = compute_CPTs(counts, parent_counts, len(train_data))

In [7]:
# Inference functions
def infer_pol_from_lichen(species, tissue_data, CPTs):
    """
    Infer pollution level from given observed species, and tissue data with Bayes rule and marginalization

    For each pollution level the joint probability is summed over every
    (region, date) pair and then normalised across levels. If every level
    scores zero (e.g. the species was never seen), a uniform distribution is
    returned.

    :param species: Species data
    :param tissue_data: dict of {tissue node name: observed bin}; every key
                        must be a node present in ``CPTs``
    :param CPTs: CPTs from computeCPTs
    :return: dict of {pollution level: probability} for P(Pe | Sp, T);
             an empty dict when the pollution CPT contains no levels
    """
    # dict.fromkeys keeps first-seen order, so the order of the result (and
    # therefore tie-breaking in callers that take the argmax) is reproducible.
    # A set of strings would iterate in a hash-dependent order.
    pollution_levels = list(dict.fromkeys(key[2] for key in CPTs['Pollution']))

    # Nothing to infer over; callers treat an empty posterior as "skip".
    # (Without this guard the uniform fallback below would divide by zero.)
    if not pollution_levels:
        return {}

    posterior = {}

    for pe in pollution_levels:
        # P(Tissues | Sp, Pe) does not depend on region or date, so compute
        # it once per pollution level instead of once per (region, date).
        # Unseen (species, pe, value) combinations get a tiny floor
        # probability rather than zeroing out the whole product.
        p_tissues_given_sp_pe = 1.0
        for tissue_var, tissue_val in tissue_data.items():
            p_tissues_given_sp_pe *= CPTs[tissue_var].get(
                (species, pe, tissue_val), 1e-10
            )

        total_prob = 0.0
        for r, p_r in CPTs['Region'].items():
            for f, p_f in CPTs['Date'].items():
                # P(Pe | Region, Date)
                prob_pe_given_r_f = CPTs['Pollution'].get((r, f, pe), 0)

                # P(Species | Region, Date, Pe)
                prob_sp_given_r_f_pe = CPTs['Species'].get((r, f, pe, species), 0)

                total_prob += (p_r * p_f * prob_pe_given_r_f * prob_sp_given_r_f_pe * p_tissues_given_sp_pe)

        posterior[pe] = total_prob

    total = sum(posterior.values())
    if total > 0:
        return {pe: p / total for pe, p in posterior.items()}

    # No evidence supports any level: fall back to a uniform distribution.
    uniform = 1.0 / len(pollution_levels)
    return {pe: uniform for pe in pollution_levels}

In [8]:
# Evaluation
def evaluate_pol_inference(test_data, CPTs):
    """
    Evaluate pollution inference accuracy from infer_pol_from_lichen

    For every row the species and the six tissue bins are passed to
    ``infer_pol_from_lichen``; the most probable pollution level is compared
    with the recorded one. Rows for which inference returns an empty
    posterior are skipped and do not count towards the total.

    :param test_data: test dataset (needs the species, pollution and six
                      tissue ``*_binned`` columns)
    :param CPTs: CPTs from computeCPTs
    :return: accuracy on test set; 0.0 when no row could be scored
    """
    species_column = 'Code for scientific name and authority in lookup table_binned'
    pollution_column = 'Air pollution score_binned'
    # Tissue node name -> dataframe column used as evidence for that node.
    tissue_columns = {
        'Nitrogen': 'Nitrogen (% dw)_binned',
        'Sulfur': 'Sulfur (% dw)_binned',
        'Phosphorus': 'Phosphorous (ppm dw)_binned',
        'Lead': 'Lead (ppm dw)_binned',
        'Copper': 'Copper (ppm dw)_binned',
        'Chromium': 'Chromium (ppm dw)_binned',
    }

    correct = 0
    total = 0

    for _, row in test_data.iterrows():
        # tissue features for inference
        tissue_data = {node: row[column] for node, column in tissue_columns.items()}

        posterior = infer_pol_from_lichen(row[species_column], tissue_data, CPTs)

        if not posterior:
            continue

        # max() returns the first maximal key, matching a strict ">" scan.
        pred_pe = max(posterior, key=posterior.get)

        if pred_pe == row[pollution_column]:
            correct += 1
        total += 1

    if total == 0:
        return 0.0

    return correct / total

In [9]:
# Fraction of held-out rows whose most probable pollution level matches the recorded one.
evaluate_pol_inference(test_data, CPTS)

0.5222841225626741

In [10]:
# Sanity check on a single species: print the raw pollution-level counts for
# 'Species 5', then query the network with one tissue profile for comparison.
# Note: the counts below use the full `data` frame (train and test rows),
# whereas CPTS was fitted on train_data only.
sanity_check_data = data[data['Code for scientific name and authority in lookup table_binned']=='Species 5']
print(sanity_check_data['Air pollution score_binned'].value_counts())

# Posterior over pollution levels given 'Species 5' and one bin per tissue element.
infer_pol_from_lichen(species='Species 5',
                       tissue_data={
                           'Nitrogen': 'medium',
                           'Sulfur': 'medium',
                           'Phosphorus': 'medium',
                           'Lead': 'high',
                           'Copper': 'medium',
                           'Chromium': 'high',
                       },
                       CPTs=CPTS)

Air pollution score_binned
medium    708
high      609
low       526
Name: count, dtype: int64


{'medium': 0.4813189016040265,
 'high': 0.22863446066094106,
 'low': 0.2900466377350325}

In [11]:
# Per-bin row counts of every tissue element within the 'Species 5' subset.
for tissue_column in (
    'Nitrogen (% dw)_binned',
    'Sulfur (% dw)_binned',
    'Phosphorous (ppm dw)_binned',
    'Lead (ppm dw)_binned',
    'Copper (ppm dw)_binned',
    'Chromium (ppm dw)_binned',
):
    print(sanity_check_data[tissue_column].value_counts())

Nitrogen (% dw)_binned
medium    796
low       763
high      284
Name: count, dtype: int64
Sulfur (% dw)_binned
medium    763
low       595
high      485
Name: count, dtype: int64
Phosphorous (ppm dw)_binned
medium    934
high      490
low       419
Name: count, dtype: int64
Lead (ppm dw)_binned
high      930
medium    672
low       241
Name: count, dtype: int64
Copper (ppm dw)_binned
medium    922
high      718
low       203
Name: count, dtype: int64
Chromium (ppm dw)_binned
high      1002
medium     727
low        114
Name: count, dtype: int64


In [12]:
# Number of rows per species bin across the full dataset.
data['Code for scientific name and authority in lookup table_binned'].value_counts()

Code for scientific name and authority in lookup table_binned
Other        2226
Species 5    1843
Species 1    1630
Species 4     946
Species 3     533
Name: count, dtype: int64

In [27]:
# Inspect the learned Chromium CPT; keys are (species, pollution level, chromium bin).
CPTS['Chromium']

{('Species 5', 'low', 'medium'): 0.4513888888888889,
 ('Species 1', 'medium', 'medium'): 0.10467289719626169,
 ('Species 1', 'high', 'medium'): 0.16326530612244897,
 ('Species 4', 'medium', 'medium'): 0.44919786096256686,
 ('Other', 'high', 'high'): 0.5706293706293706,
 ('Other', 'medium', 'low'): 0.22287968441814596,
 ('Other', 'high', 'low'): 0.10909090909090909,
 ('Species 1', 'low', 'low'): 0.882661996497373,
 ('Species 4', 'high', 'high'): 0.24175824175824176,
 ('Species 5', 'medium', 'high'): 0.5211267605633803,
 ('Other', 'high', 'medium'): 0.3202797202797203,
 ('Species 1', 'medium', 'low'): 0.8654205607476636,
 ('Species 1', 'low', 'high'): 0.03327495621716287,
 ('Species 1', 'high', 'low'): 0.7806122448979592,
 ('Other', 'medium', 'medium'): 0.4280078895463511,
 ('Species 5', 'medium', 'medium'): 0.44190140845070425,
 ('Species 3', 'medium', 'high'): 0.44516129032258067,
 ('Species 5', 'high', 'high'): 0.668724279835391,
 ('Species 5', 'high', 'low'): 0.04526748971193416,
 ('

In [16]:
# List the node names that have a CPT.
CPTS.keys()

dict_keys(['Region', 'Date', 'Pollution', 'Species', 'Nitrogen', 'Sulfur', 'Phosphorus', 'Lead', 'Copper', 'Chromium'])