In [2]:
from collections import defaultdict

import json
import nltk
import os

from fcfg_to_cfg import fcfg_to_cfg

In [3]:
# FCFG -> CFG / PCFG conversion.
# The feature grammar is the input; the other two paths are the files the
# converter is asked to write.
feature_grammar_file = "ps6_grammar.fcfg"
cfg_grammar_file, pcfg_grammar_file = "cfg_grammer.cfg", "pcfg_grammar.pcfg"

# Run the conversion; the resulting CFG and PCFG grammars are saved to the
# paths given above.
fcfg_to_cfg(
    feature_grammar_file,
    cfg_grammar_file,
    pcfg_grammar_file,
)

Final sign_and_feature_to_values mapping:
Node: Node(symbol=DP, features=[{'NUM': '?b', 'STR': 'yes'}]), Index: 0, Key: STR, Values: {'yes'}
Node: Node(symbol=VP, features=[{'NUM': 'sg'}]), Index: 0, Key: NUM, Values: {'pl', 'sg'}
Node: Node(symbol=VP/NP, features=[{'NUM': 'sg'}]), Index: 0, Key: NUM, Values: {'pl', 'sg'}
Node: Node(symbol=Det, features=[{'STR': 'yes', 'NUM': 'sg'}]), Index: 0, Key: STR, Values: {'yes', 'no'}
Node: Node(symbol=Det, features=[{'STR': 'yes', 'NUM': 'sg'}]), Index: 0, Key: NUM, Values: {'pl', 'sg'}
Node: Node(symbol=Det, features=[{'STR': 'no'}]), Index: 0, Key: STR, Values: {'no'}
Node: Node(symbol=N, features=[{'NUM': 'sg'}]), Index: 0, Key: NUM, Values: {'pl', 'sg'}
Node: Node(symbol=NP, features=[{'NUM': 'sg'}]), Index: 0, Key: NUM, Values: {'pl', 'sg'}
Node: Node(symbol=TV, features=[{'NUM': 'sg'}]), Index: 0, Key: NUM, Values: {'pl', 'sg'}
Node: Node(symbol=Det, features=[{'STR': 'no', 'NUM': 'sg', 'PRED': 'no'}]), Index: 0, Key: STR, Values: {'no'}

In [None]:
# Year: 2021
transcripts_file = "21/transcription.c2"
data_json_file = "21/parsed_transcripts.json"

# `data` collects every transcript the parser accepts, grouped by root id.
# Each transcript line is "<id> <sentence>"; the id is split on its last "-"
# into a root id and a transcript id, e.g.
# "root_id": {
#     "1": [tree1, tree2, etc...],
#     "2": [tree1, tree2, etc...],
#     ...
#     "50": [tree1, tree2, etc...],
# }
data = {}

# Raw grammar text. The parser below loads the grammar file by itself, so this
# is not used in this cell; kept in case other cells reference `grammar`.
with open(pcfg_grammar_file, "r") as f:
    grammar = f.read()

with open(transcripts_file, "r") as f:
    transcripts = f.readlines()

parser = nltk.load_parser(pcfg_grammar_file, trace=0)

# Parse the transcripts and store every sentence with at least one parse.
for count, line in enumerate(transcripts, start=1):
    if count % 5000 == 0:
        print(f"Processed {count} lines...")

    stripped = line.strip()
    if not stripped:
        continue

    fields = stripped.split(maxsplit=1)
    if len(fields) != 2:
        print("Skipping line in text file:", line)
        continue

    # Named `utterance_id` rather than `id` so the builtin is not shadowed
    # for the remainder of the kernel session.
    utterance_id, sentence = fields
    try:
        # parse() is lazy, so materialise it inside the try: errors raised
        # while enumerating parses are reported here as well.
        trees_list = list(parser.parse(sentence.lower().split()))
    except Exception as e:
        # Best effort: report the failing sentence and keep going.
        print(f"Error parsing sentence '{sentence}': {e}")
        continue

    if not trees_list:
        # No parse for this sentence; nothing to record.
        continue

    # Everything before the last "-" is the root id, the rest the transcript id.
    root_id, _sep, transcript_id = utterance_id.rpartition('-')
    data.setdefault(root_id, {})[transcript_id] = trees_list

# Save the data to a JSON file.
# NOTE: nltk trees subclass `list`, so passing them to json.dump directly
# writes nested arrays of children and silently drops every node label.
# Write each tree's bracketed string form instead; `data` itself still holds
# the tree objects for use in the notebook.
serializable_data = {
    root: {
        transcript: [str(tree) for tree in tree_group]
        for transcript, tree_group in by_transcript.items()
    }
    for root, by_transcript in data.items()
}
with open(data_json_file, "w") as f:
    json.dump(serializable_data, f, indent=4)

Error parsing sentence 'AT LEAST ONE VOWEL IS CAPITAL A AT': tuple index out of range
Error parsing sentence 'AT LEAST ONE VOWEL IS CAPITAL A AND': tuple index out of range
Error parsing sentence 'AT LEAST ONE VOWEL IS CAPITAL A IS': tuple index out of range
Error parsing sentence 'AT LEAST ONE VOWEL IS CAPITALIZED': tuple index out of range
Error parsing sentence 'AT LEAST ONE VOWELS CAPITAL A AT': tuple index out of range
Error parsing sentence 'A AT LEAST ONE VOWEL IS CAPITAL A AT': tuple index out of range
Error parsing sentence 'AT LEAST ONE VOWEL IS CAPITAL AND': tuple index out of range
Error parsing sentence 'AT LEAST ONE VOWELS IS CAPITAL A AT': tuple index out of range
Error parsing sentence 'AT LEAST ONE VOWELS CAPITAL A AND': tuple index out of range
Error parsing sentence 'A AT LEAST ONE VOWEL IS CAPITAL A AND': tuple index out of range
Error parsing sentence 'I AT LEAST ONE VOWEL IS CAPITAL A AT': tuple index out of range
Error parsing sentence 'AT LEAST ONE VOWEL IS CAPI

In [1]:
import nltk

# Parse one example sentence with the feature grammar and show each tree.
s = "SOME LETTER THAT PRECEDES LETTER FIVE IS VOCALIC"
parser = nltk.load_parser("ps6_grammar copy.fcfg", trace=0)

# The sentence is lowercased and split on whitespace before parsing.
tokens = s.lower().split()
trees = parser.parse(tokens)

# Print every parse the grammar yields for the sentence.
for tree in trees:
    print(tree)

(S[SEM=<exists n.(exists c.char(n,c) & le(n,5) & exists c.(vowel(c) & char(n,c)))>]
  (DP[NUM='sg', PRED=?d, SEM=<\Q.exists n.(exists c.char(n,c) & le(n,5) & Q(n))>, STR='no']
    (Det[SEM=<\P Q.exists n.(P(n) & Q(n))>, STR='no'] some)
    (NP[NUM='sg', SEM=<\x.(exists c.char(x,c) & le(x,5))>]
      (N[NUM='sg', SEM=<\x.(exists c.char(x,c) & le(x,5))>]
        (N[NUM='sg', SEM=<\n.exists c.char(n,c)>] letter)
        (CP[NUM='sg', SEM=<\y.le(y,5)>]
          that
          (S[NUM='sg', SEM=<\y.le(y,5)>]/NP[NUM=?n]
            (NP[]/NP[] )
            (VP[NUM='sg', SEM=<\n.le(n,5)>]
              (TV[NUM='sg', SEM=<\m n.le(n,m)>] precedes)
              (NP[NUM='sg', SEM=<5>] letter five)))))))
  (VP[NUM='sg', SEM=<\n.exists c.(vowel(c) & char(n,c))>]
    is
    (A[SEM=<\n.exists c.(vowel(c) & char(n,c))>] vocalic)))
