In [1]:
import nltk
from collections import defaultdict
import json
import string

In [2]:
# Helper functions
# Utilities
last_tree = None
last_formula = None

def parse_and_display(s, p, graphic:bool=True):
   global last_tree, last_formula
   for tree in p.parse(s.split()): 
    last_tree = tree
    if graphic: 
        display(tree) 
    else: print(tree)
    last_formula = last_tree.label()['SEM']
    if graphic:
        display(Markdown(f'${expr_to_latex(last_formula)}$'))
    else: print(last_formula)

def expr_to_latex(expr):
    """
    Recursively converts an nltk.Expression to a LaTeX string.
    """
    if isinstance(expr, nltk.sem.logic.ApplicationExpression):  # Function application (e.g., char(n, lett))
        # Extract function (predicate) and arguments
        arguments = []
        while isinstance(expr, nltk.sem.logic.ApplicationExpression):
            arguments.append(expr.argument)
            expr = expr.function  # Move up to the function name
        
        # Now expr is the function name (predicate), and arguments contains all its arguments
        function_name = rf"\mathbf{{{expr}}}"  # Boldface the predicate name
        arguments = ", ".join(expr_to_latex(arg) for arg in reversed(arguments))  # Reverse to maintain correct order
        return f"{function_name}({arguments})"
    elif isinstance(expr, nltk.sem.logic.LambdaExpression):  # Lambda abstraction (e.g., \x.P(x))
        return rf"\lambda {expr.variable} . {expr_to_latex(expr.term)}"
    elif isinstance(expr, nltk.sem.logic.QuantifiedExpression):  # Quantifiers (e.g., exists n. P(n))
        quantifier = r"\forall" if expr.getQuantifier() == "all" else r"\exists"
        return rf"{quantifier} {expr.variable} \, {expr_to_latex(expr.term)}"
    elif isinstance(expr, nltk.sem.logic.NegatedExpression):  # Negation (e.g., ¬P)
        return rf"\neg {expr_to_latex(expr.term)}"
    elif isinstance(expr, nltk.sem.logic.AndExpression):  # Conjunction (P & Q)
        return rf"({expr_to_latex(expr.first)} \wedge {expr_to_latex(expr.second)})"
    elif isinstance(expr, nltk.sem.logic.OrExpression):  # Disjunction (P | Q)
        return rf"({expr_to_latex(expr.first)} \vee {expr_to_latex(expr.second)})"
    elif isinstance(expr, nltk.sem.logic.ImpExpression):  # Implication (P -> Q)
        return rf"({expr_to_latex(expr.first)} \rightarrow {expr_to_latex(expr.second)})"
    elif isinstance(expr, nltk.sem.logic.BinaryExpression):  # Handles '<->' equivalence
        if expr.operator == '<->':
            return rf"({expr_to_latex(expr.first)} \leftrightarrow {expr_to_latex(expr.second)})"
        else:
            return rf"({expr_to_latex(expr.first)} {expr.operator} {expr_to_latex(expr.second)})"
    elif isinstance(expr, nltk.sem.logic.IndividualVariableExpression):  # Variables (e.g., x, y, n)
        return str(expr)
    elif isinstance(expr, nltk.sem.logic.ConstantExpression):  # Constants (e.g., lett, numbers)
        expr_str = str(expr)
        if expr_str.isdigit():  # If it's a number, don't boldface it
            return expr_str
        return rf"\mathbf{{{expr_str}}}"  # Boldface non-numeric constants like 'lett'
    else:
        return str(expr)  # Default case for any unhandled expression type
# Authors chatgpt4o and Mats Rooth Feb 8 2025

def emptysets(val:nltk.sem.evaluate.Valuation):
  val.update([(k,set()) for (k,v) in val.items() if v == 'set()'])

# Model construction
from typing import Callable, List, Set

def to_model_str(word: str, special_rels: List[Callable[[str], str]]=[]) -> str:
    """
    Creates the string form of the model for the input word. This string is meant to be passed to `nltk.Valuation.fromstring`.
    By default, the function will only add the relations mapping i => i for i from 1 to the length of `word` and a relation 
    mapping char => the set of tuples (i, word[i]). The `special_rels` function allows you to specify additional relations to 
    be added to the valuation string.
    
    :param word: The word to create a model string for.
    :param special_rels: A list of functions that when called return a string of the form {relation_name} => {relation_contents}. Defaults to the empty list.
    :returns: a string representing the model for word
    """
    n = len(word)
    model_str = []
    char = []
    for i in range(1, n+1):
        model_str.append(f'{i} => {i}')
        char.append((i, word[i-1]))
    model_str.append(f'char => {set(char)}'.lower())
    return '\n'.join(model_str + [rel(word) for rel in special_rels]).replace("'", "")
# Angela Liu

###This code is from CL1 2023

vowels = ['a', 'e', 'i', 'o', 'u']
fricatives = ['v', 'f', 's', 'z', 'h', 'th', 'sh', 'zh']

def capital(word, i):
    return word[i].isupper()

get_capital = lambda word: f'capital => {set([i+1 for i in range(len(word)) if capital(word,i)])}'

def less_than(i, j):
    return i<j
get_less_than = lambda word: f'le => {set([(i+1,j+1) for i in range(len(word)) for j in range(len(word)) if i!=j and less_than(i+1,j+1)])}'

def adjacent(i,j):
    return abs(i-j) == 1

get_adjacent = lambda word: f'ad => {set([(i+1,j+1) for i in range(len(word)) for j in range(len(word)) if adjacent(i+1,j+1)])}'

get_even =  lambda word: f'even => {set([i+1 for i in range(len(word)) if (i+1)%2==0])}'

get_odd = lambda word: f'odd => {set([i+1 for i in range(len(word)) if (i+1)%2!=0])}'

# @323
def voiced(word):
    word=word.lower()
    v=[]
    for i in range(len(word)):
        if i != len(word)-1:
            if (word[i] == 'n' and word[i+1]=='g') or (word[i] == 's' and word[i+1] == 'z'):
                v.append((i+1, word[i]))
                v.append((i+2, word[i+1]))
        if word[i] == 'z':
            if ((i,'s') not in v):
                v.append((i+1, 'z'))
        if word[i] == 'g':
            if ((i,'n') not in v):
                v.append((i+1, 'g'))
        if word[i] == 'n':
            if ((i+2,'g') not in v):
                v.append((i+1,'n'))
        if word[i] in ['a', 'e', 'i', 'o', 'u', 'b', 'd', 'j', 'l', 'm', 'r', 'v', 'w', 'y']:
            v.append((i+1, word[i]))
    return v
            
get_voiced= lambda w: f'voiced => {set([(i+1,w[i].lower()) for i in range(len(w)) if (i+1, w[i].lower()) in voiced(w)])}'

def centered(word,i):
    if (len(word)%2==0):
        return len(word)//2 == i or i == len(word)//2 + 1
    else:
        return len(word)//2 + 1 == i

get_centered = lambda word: f'cent => {set([i+1 for i in range(len(word)) if centered(word,i+1)])}'

get_mirrored = lambda w: f'mirrored => {set(i+1 for i in range(len(w)) if w[i].lower() == w[len(w)-1-i].lower())}'

get_glide =  lambda w: f'glide => {set(i+1 for i in range(len(w)) if w[i].lower() == "w" or w[i].lower() == "y")}'

# from @325
def fricatives(word):
    word.lower()
    frics = []
    for i in range(len(word)):
        if i != len(word)-1:
            if word[i] in ['t', 's', 'z'] and word[i+1] == 'h':
                frics.append((i+1, word[i]))
                frics.append((i+2, 'h'))
        if word[i] == 'h':
            if ((i, 't') not in frics) and ((i, 's') not in frics) and ((i, 'z') not in frics):
                frics.append((i+1, 'h'))
        if word[i] in ['s', 'z']:
            if ((i+2, 'h') not in frics):
                frics.append((i+1, word[i]))
        if word[i] in ['v', 'f']:
            frics.append((i+1, word[i]))
        return frics

get_fricative = lambda w: f'fricative => {set([(i+1,w[i].lower()) for i in range(len(w)) if (i+1, w[i].lower()) in fricatives(w)])}'


# get all the vowels in a word
get_vowel = lambda w: f'vowel => {set(re.findall(r"[AEIOUaeiou]", w))}'.lower()

# get all the consonants in a word
get_cons = lambda w: 'consonant => {}'.format(set(re.findall(r"[^AEIOUaeiouywYW\W0-9]", w))).lower()

# get all the tuple of two numbers (n,m) such that n < m, n&m < len(word) and n!=m
follows = lambda w: f'le => {set([(i+1,j+1) for i in range(len(w)) for j in range(i, len(w)) if i != j])}'

# get all the letters that are capitalized
get_capital = lambda w: 'capital => {}'.format(set(re.findall(r"[A-Z]",w))).lower()
# for A[SEM=<\n.exists c.(capital(n)& char(n,c))>] -> 'capitalized'

# get_capital = lambda w: f'capital => {set([m.span()[0] + 1 for m in re.finditer(r"[A-Z]", w)])}'

# get all the glides in a word
get_glide = lambda w: f'glide => {set(re.findall(r"[ywYW]", w))}'.lower()

# for exactly one
equals = lambda w: f'eq => {set([(i+1,i+1) for i in range(len(w))])}'

# get all alphabetical letters in a word
get_alphabet = lambda w: f'alphabet => {set(re.findall(r"[A-Za-z]", w))}'.lower()

# get all liquids
get_liquid = lambda w: f'liquid => {set(re.findall(r"[lrLR]", w))}'.lower()

# get all nasals
get_nasal = lambda w: f'nasal => {set(re.findall(r"[nmNM]", w))}'.lower()

# get all plosives
get_plosive = lambda w: f'plosive => {set(re.findall(r"[pbtdkgPBTDKG]", w))}'.lower()



letter_funcs = {
    f"let{ch}": (lambda c: lambda w: f"let{c} => {c}")(ch)
    for ch in string.ascii_lowercase
}

all_func = [
    follows, get_capital, get_vowel, equals, get_alphabet,
    get_adjacent, get_voiced, get_fricative, get_glide, get_centered,
    get_mirrored, get_less_than, get_even, get_odd,
    get_liquid, get_nasal, get_plosive, get_cons
] + list(letter_funcs.values())

In [16]:
# Problem 1
parsable_transcripts = defaultdict(list)
parsable_formulas = defaultdict(list)

parser = nltk.load_parser('ps6_grammar.fcfg', trace=0)
count = 0
with open("1000.txt", "r") as f:
    for line in f:
        count += 1
        if count % 5000 == 0:
            print(f"Processed {count} lines...")
        if line.strip():
            parts = line.strip().split(maxsplit=1)
            if len(parts) == 2:
                id, sentence = parts
                try:
                    trees = parser.parse(sentence.lower().split())
                    trees_list = list(trees)
                    if (trees is not None) and (trees_list > 0):
                        parts = id.split('-')
                        root_id = '-'.join(parts[:-1])
                        parsable_transcripts[root_id].append(id)
                        print(f"Parsing sentence: {sentence} with ID: {id}")
                        print(list(trees))
                        parsable_formulas[root_id].append(trees_list[0].label()['SEM'])
                    else:
                        continue
                except Exception as e:
                    print(f"Error parsing sentence '{sentence}': {e}")
            else:
                print("Skipping line in text file:", line)

try:
    with open("parable_transcripts.json", "r") as f:
        all_data = json.load(f)
        if not isinstance(all_data, list):
            all_data = []
except FileNotFoundError:
    all_data = []

for root_id, transcript_ids in parsable_transcripts.items():
    formulas = parsable_formulas[root_id]
    formulas = (lambda x: [str(f) for f in x])(formulas)
    
    data_entry = {
        "root_id": root_id,
        "transcript_ids": transcript_ids,
        "formulas": formulas
    }
    
    all_data.append(data_entry)
    
# Save to individual JSON file
with open("parable_transcripts.json", "w") as f:
    json.dump(all_data, f, indent=4)

Error parsing sentence 'AT LEAST ONE VOWEL IS CAPITALIZED': '>' not supported between instances of 'list' and 'int'
Error parsing sentence 'AT LEAST ONE VOWELS CAPITALIZED': '>' not supported between instances of 'list' and 'int'
Error parsing sentence 'THERE IS EXACTLY ONE THE T': '>' not supported between instances of 'list' and 'int'
Error parsing sentence 'THERE IS A EXACTLY ONE THE T': '>' not supported between instances of 'list' and 'int'
Error parsing sentence 'THERE IS THE EXACTLY ONE THE T': '>' not supported between instances of 'list' and 'int'
Error parsing sentence 'THERE IS EXACTLY ONE T': '>' not supported between instances of 'list' and 'int'
Error parsing sentence 'THERE IS EXACTLY ONE THAT T': '>' not supported between instances of 'list' and 'int'
Error parsing sentence 'THERE IS A EXACTLY ONE T': '>' not supported between instances of 'list' and 'int'
Error parsing sentence 'THERE IS THE EXACTLY ONE T': '>' not supported between instances of 'list' and 'int'
Error 

KeyboardInterrupt: 

In [None]:
# Problem 2
# Part A: Load the data from the JSON file
try:
    with open("parable_transcripts.json", "r") as f:
        all_data = json.load(f)
except FileNotFoundError:
    all_data = []
    
# Part B: Load the truth values
truth_values = {}
with open("pooled_truth", "r") as f:
    for line in f:
        if line.strip():
            parts = line.strip().lower().split()
            if len(parts) >= 2:
                id = parts[0]
                # The rest are word-value pairs
                word_values = []
                for i in range(1, len(parts), 2):
                    if i+1 < len(parts):
                        word = parts[i]
                        value = parts[i+1]
                        if value in ['t', 'f']:  # Skip 's' and 'u' values
                            word_values.append((word, value == 't'))
                truth_values[id] = word_values
  
# Part B/C: Compute the ratios
total_words = 0
correct_truths = 0

ids_over_85_percent = 0
total_ids = 0

for data_entry in all_data:
    root_id = data_entry["root_id"]
    transcript_ids = data_entry["transcript_ids"]
    formulas = data_entry["formulas"]
    
    transcript_formula_pairs = list(zip(transcript_ids, formulas))
    transcript_formula_pairs.sort(key=lambda x: x[0])  # Sort by transcript ID
    
    for transcript_id, formula in transcript_formula_pairs:
        if transcript_id in truth_values:
            word_values = truth_values[transcript_id]
            words = [w for w, v in word_values]
            truths = [v for w, v in word_values]

            vals = [nltk.Valuation.fromstring(to_model_str(w, all_func)) for w in words]
            for val in vals: emptysets(val)
            models = [nltk.Model(val.domain, val) for val in vals]
            assignments = [nltk.Assignment(val.domain) for val in vals]
            
            num_correct = 0
            total_truths = 0
            total_ids += 1

            for idx, truth in enumerate(truths):
                total_words += 1
                total_truths += 1
                try:
                    result = models[idx].evaluate(str(formula), assignments[idx])
                    if result == truth:
                        # If the evaluation matches the truth value, we can count it as correct
                        correct_truths += 1
                        num_correct += 1
                except Exception as e:
                    print(f"Error evaluating transript_id {root_id}-{transcript_id}")
            
            if total_truths > 0 and num_correct / total_truths >= 0.85:
                ids_over_85_percent += 1
                
      
# Proportion of truth value predictions that are correct
print(f"Total words evaluated: {total_words}")
print(f"Correct truth value predictions: {correct_truths}")
print(f"Proportion of correct predictions: {correct_truths / total_words if total_words > 0 else 0:.2f}")
# Proportion of IDs where all or almost all of the truth value predictions are correct. Pick an informative notion of
#   almost all, such as 80%.
print(f"Total IDs evaluated: {total_ids}")
print(f"IDs with >= 85% correct predictions: {ids_over_85_percent}")
print(f"Proportion of IDs with >= 85% correct predictions: {ids_over_85_percent / total_ids if total_ids > 0 else 0:.2f}")
# Some other statistic that you find interesting.
# 



Total words evaluated: 0
Correct truth value predictions: 0
Proportion of correct predictions: 0.00
Total IDs evaluated: 0
IDs with >= 85% correct predictions: 0
Proportion of IDs with >= 85% correct predictions: 0.00
