In [1]:
import os 
import re 
from itertools import islice

import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display

# Directory the kernel is currently running in.
# NOTE(review): the layout below presumably expects this to be the project's source folder -- confirm.
SRC_PATH  = os.getcwd()
# Parent directory of the working directory.
MAIN_PATH = os.path.dirname(SRC_PATH)
# "data" folder next to the working directory; used as the default `path` of read_data().
DATA_PATH = f"{MAIN_PATH}/data"
# "peranto_data" sub-folder of DATA_PATH.
PER_PATH  = f"{DATA_PATH}/peranto_data"

In [11]:
def read_data(path=DATA_PATH):
    """
    Usage: data = read_data()

    Returns data from peranto/treebank data

    Input:
    -------
    path (str): directory containing ``treebank.txt`` and the ``peranto_data``
                sub-directory (you should use default)

    Output:
    -------
    dict(str, list(tuple(str))): dict mapping a name (str) to a list of s,v,o triples.
    Only corpora holding exactly 5617 triples are kept.
    Returns None (after printing the error) if either source cannot be read.

    {"Treebank" : [('he', 'eat', 'food'), ...], 'Strength = 50, Discount = .50' : [(...)]}
    """
    # Corpora of any other size are dropped from the result.
    # NOTE(review): presumably the number of triples in the treebank file -- confirm.
    expected_triples = 5617

    def read_treebank():
        # Tab-separated file; the first line is a header.
        file_path = f"{path}/treebank.txt"

        with open(file_path, 'r') as f:
            content = f.readlines()

        # skip header/get triples
        triples = [tuple(line.strip().split('\t')) for line in content[1:]]
        return {"Treebank" : triples}

    def read_peranto():
        # One corpus per file named peranto_s<strength>_d<two-digit discount>.txt
        per_path = f"{path}/peranto_data"
        pattern  = re.compile(r'peranto_s(\d+)_d(\d{2})\.txt')
        data = {}

        for file_name in os.listdir(per_path):
            # fullmatch: names with trailing junk (e.g. "....txt.bak") must not
            # be picked up and overwrite the entry of the real file.
            match = pattern.fullmatch(file_name)
            if not match:
                continue

            strength, discount = match.groups()
            # The discount stays a two-digit string so that "d05" is labelled
            # ".05"; converting it to int would print the misleading ".5".
            name = f"Strength = {int(strength)}, Discount = .{discount}"

            triples = []
            with open(os.path.join(per_path, file_name), 'r') as f:
                for line in f:
                    # get rid of # ...
                    line = line.split('#')[0].strip()
                    # split by space
                    tokens = line.split()

                    if len(tokens) >= 3:
                        s, v, o = tokens[:3]
                        triples.append((s, v, o))

            data[name] = triples
        return data

    # Each source is read exactly once; a failure is reported and signalled
    # to the caller with None.
    try:
        peranto_data = read_peranto()
    except Exception as e:
        print(f"Error with Peranto Scraping: {e}")
        return None

    try:
        treebank_data = read_treebank()
    except Exception as e:
        print(f"Error with Treebank Scraping: {e}")
        return None

    data = treebank_data | peranto_data
    return {name : lst for name, lst in data.items() if len(lst) == expected_triples}

# Load all corpora into a name -> triples dict; read_data() prints the error and returns None on a read failure.
data = read_data()

flochal
flochal


In [3]:
def compute_singletons(lst):
    """
    Builds running counts over a sequence of (s, v, o) triples.

    For every position i (1-based) the function records:
      * the number of bigrams read so far, which is simply i, and
      * how many distinct (S,V) / (V,O) pairs have appeared so far. A pair is
        counted when it is first seen and stays counted if it repeats later.

    Returns four NumPy arrays, in this order:
    bigrams_sv, bigrams_vo, singleton_sv, singleton_vo.
    """
    distinct_sv = set()
    distinct_vo = set()
    running_sv = []
    running_vo = []

    for subj, verb, obj in lst:
        # Adding to a set is a no-op for a repeated pair, so the set size is
        # exactly the number of first occurrences up to this triple.
        distinct_sv.add((subj, verb))
        distinct_vo.add((verb, obj))
        running_sv.append(len(distinct_sv))
        running_vo.append(len(distinct_vo))

    # Both bigram kinds grow by one per triple: 1, 2, ..., n
    positions = list(range(1, len(running_sv) + 1))

    return np.array(positions), np.array(positions), np.array(running_sv), np.array(running_vo)

def compute_singleton_proportion(bigrams, singletons):
    """
    Returns singletons divided by bigrams (element-wise when both are NumPy arrays).
    """
    proportion = singletons / bigrams
    return proportion

def mse(x, y):
    """
    Returns the sum of squared differences between paired elements of x and y.

    The total is not divided by the number of pairs. Pairing uses zip, so
    elements beyond the shorter input are ignored; two empty inputs give 0.
    """
    total = 0
    for left, right in zip(x, y):
        total = total + (left - right) ** 2
    return total

In [4]:
def compute_mse(data, choice):
    """
    Scores every non-Treebank corpus against the Treebank curve.

    For the selected bigram type, the singleton-proportion curve is computed for
    data["Treebank"] and for each other entry of data; every curve is compared
    with the Treebank one through mse().

    Input:
    -------
    data (dict): name -> list of (s, v, o) triples; must contain the key "Treebank"
    choice (str): '(Subject, Verb)' selects the (S,V) curve, any other value the (V,O) curve

    Output:
    -------
    dict: name -> loss rounded to 4 decimals, ordered from smallest to largest loss
    """
    use_sv = choice == '(Subject, Verb)'

    def proportion_curve(triples):
        bi_sv, bi_vo, single_sv, single_vo = compute_singletons(triples)
        if use_sv:
            return compute_singleton_proportion(bi_sv, single_sv)
        return compute_singleton_proportion(bi_vo, single_vo)

    reference_curve = proportion_curve(data["Treebank"])

    losses = {
        name: round(mse(reference_curve, proportion_curve(triples)), 4)
        for name, triples in data.items()
        if name != "Treebank"
    }

    # Sorting by the loss value; sorted() is stable, so ties keep dict order.
    return dict(sorted(losses.items(), key=lambda pair: pair[1]))

# Reload the corpora and score each non-Treebank corpus against the Treebank (Subject, Verb) curve.
data     = read_data()
mse_data = compute_mse(data, "(Subject, Verb)")

# compute_mse returns its entries sorted by increasing loss, so the closest corpus is printed first.
for name, loss in mse_data.items():
    print(f"{name} ==> {loss}")

Strength = 28, Discount = .20 ==> 2.6827
Strength = 32, Discount = .15 ==> 2.9806
Strength = 32, Discount = .20 ==> 3.116
Strength = 36, Discount = .15 ==> 3.1686
Strength = 20, Discount = .40 ==> 3.2311
Strength = 26, Discount = .30 ==> 3.2428
Strength = 34, Discount = .15 ==> 3.3006
Strength = 28, Discount = .25 ==> 3.306
Strength = 26, Discount = .20 ==> 3.6905
Strength = 24, Discount = .30 ==> 3.7273
Strength = 34, Discount = .25 ==> 3.9125
Strength = 22, Discount = .35 ==> 4.231
Strength = 30, Discount = .15 ==> 4.5519
Strength = 32, Discount = .10 ==> 4.7077
Strength = 26, Discount = .35 ==> 4.8755
Strength = 30, Discount = .20 ==> 7.0597
Strength = 24, Discount = .35 ==> 7.408
Strength = 34, Discount = .10 ==> 7.415
Strength = 28, Discount = .30 ==> 7.7919
Strength = 30, Discount = .30 ==> 8.0482
Strength = 24, Discount = .40 ==> 8.9811
Strength = 36, Discount = .20 ==> 9.9342
Strength = 28, Discount = .35 ==> 9.9705
Strength = 26, Discount = .25 ==> 10.3018
Strength = 20, Disco

In [7]:
def plot_singleton_curve(data, top_k=True, k=3, cutoff=False, mse_cutoff=5):
    """
    Plots the singleton proportion for dict mapping name to list of svo triples

    Usage:
        data = read_data()
        plot_singleton_curve(data)

    Input:
    -------
    data (dict): name -> list of (s, v, o) triples; "Treebank" is always plotted
    top_k (bool): keep only the k corpora with the smallest compute_mse loss
    k (int): number of corpora kept when top_k is True
    cutoff (bool): keep only corpora whose loss is <= mse_cutoff
    mse_cutoff (number): threshold used when cutoff is True

    When top_k and cutoff are both True the filters are combined: the cutoff is
    applied to the k best corpora. Pass top_k=False to filter by cutoff alone.

    Displays a dropdown (via ipywidgets) that switches between (Subject, Verb)
    and (Verb, Object) bigrams and redraws the figure. Returns None.
    """
    def plot_curve(choice):
        selected = data
        if top_k or cutoff:
            # A corpus' loss does not depend on which other corpora are present,
            # so one compute_mse pass serves both filters.
            losses = compute_mse(data, choice)
            if top_k:
                # compute_mse is sorted by increasing loss: first k == best k
                losses = dict(islice(losses.items(), k))
            if cutoff:
                losses = {name : loss for name, loss in losses.items() if loss <= mse_cutoff}
            selected = {name : lst for name, lst in data.items() if name == 'Treebank' or name in losses}

        use_sv = choice == '(Subject, Verb)'
        fig, ax = plt.subplots(figsize=(10, 6))

        for name, lst in selected.items():
            bigrams_sv, bigrams_vo, singleton_sv, singleton_vo = compute_singletons(lst)
            if use_sv:
                x_vals, singletons = bigrams_sv, singleton_sv
            else:
                x_vals, singletons = bigrams_vo, singleton_vo

            ax.plot(x_vals, compute_singleton_proportion(x_vals, singletons), label=name)

        ax.set_title(f"Singleton Proportion of {choice} Bigrams")
        ax.set_xlabel("Number of Bigrams")
        ax.set_ylabel("Singleton Proportion")
        # Legend sits outside the axes so it does not cover the curves.
        ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
        ax.grid(True)
        plt.show()

    # Create interactive widget
    widgets.interact(
        plot_curve,
        choice=widgets.Dropdown(options=['(Subject, Verb)', '(Verb, Object)'], value='(Subject, Verb)', description='Choice:', disabled=False))

In [9]:
# Reload the corpora before plotting; read_data() returns None if a source could not be read.
data = read_data()


In [10]:
# Defaults (top_k=True, k=3): Treebank plus the three corpora with the smallest loss for the selected bigram type.
plot_singleton_curve(data)

interactive(children=(Dropdown(description='Choice:', options=('(Subject, Verb)', '(Verb, Object)'), value='(S…

In [69]:
# top_k still defaults to True here, so the loss <= 5 cutoff is applied to the three best corpora only.
plot_singleton_curve(data, cutoff=True)

interactive(children=(Dropdown(description='Choice:', options=('(Subject, Verb)', '(Verb, Object)'), value='(S…