In [5]:
from conllu import parse_incr

from pathlib import Path

# Path to the UD English-EWT training split. It is a relative path, so it is
# resolved against the notebook's current working directory.
train_file = "../data/ud_ewt/en_ewt-ud-train.conllu"

# Fail early with the fully resolved location: a bare relative path in the
# traceback does not tell the reader which directory was actually searched.
train_path = Path(train_file)
if not train_path.is_file():
    raise FileNotFoundError(
        f"CoNLL-U file not found: {train_path.resolve()} "
        f"(train_file={train_file!r} is resolved against the current working directory)"
    )

# One {"words": [...], "pos_tags": [...]} record per parsed sentence
sentences = []

# Stream the .conllu file sentence by sentence
with train_path.open("r", encoding="utf-8") as f:
    for tokenlist in parse_incr(f):
        words = []
        pos_tags = []

        for token in tokenlist:
            # Keep only tokens whose id is a plain integer; this drops
            # multi-word token ranges (e.g., 2-3).
            if isinstance(token['id'], int):
                words.append(token['form'])
                pos_tags.append(token['upostag'])

        sentences.append({
            "words": words,
            "pos_tags": pos_tags
        })

print(f"Parsed {len(sentences)} sentences.")
# Guard the preview so an empty parse reports "Parsed 0 sentences." instead of
# dying with an IndexError.
if sentences:
    print("Example:")
    print(sentences[0])


FileNotFoundError: [Errno 2] No such file or directory: '../data/ud_ewt/en_ewt-ud-train.conllu'

In [None]:
# Universal POS tag groups; each group backs one binary feature below.
function_words = {"ADP", "AUX", "CCONJ", "DET", "PART", "PRON", "SCONJ"}
noun_pos = {"NOUN", "PROPN"}
verb_pos = {"VERB", "AUX"}
# Closed class = every function-word tag plus punctuation and symbols.
# NOTE(review): UD's own closed-class list also contains NUM and files
# PUNCT/SYM under "other" -- confirm this grouping is intentional.
closed_class = function_words | {"PUNCT", "SYM"}


def get_labels(pos_tags):
    """Turn a sequence of POS tags into four parallel binary label lists.

    Args:
        pos_tags: iterable of POS tag values, one per word.

    Returns:
        dict with keys "function_content", "noun_nonnoun", "verb_nonverb"
        and "closed_open" (in that order). Each value is a list with one
        entry per tag: 1 when the tag belongs to the matching tag group,
        0 otherwise.
    """
    # (feature name, tag group that maps to label 1)
    tag_groups = (
        ("function_content", function_words),
        ("noun_nonnoun", noun_pos),
        ("verb_nonverb", verb_pos),
        ("closed_open", closed_class),
    )

    # Materialise once so one-shot iterables can be scanned per feature.
    tags = list(pos_tags)

    return {
        feature: [int(tag in group) for tag in tags]
        for feature, group in tag_groups
    }


In [1]:
from transformers import GPT2Tokenizer
from tqdm import tqdm

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Reuse EOS as padding; important for batching later

# Binary features produced by get_labels(); each becomes a "labels_<key>" stream.
LABEL_KEYS = ("function_content", "noun_nonnoun", "verb_nonverb", "closed_open")
# Label assigned to non-initial sub-tokens so they can be masked out downstream.
IGNORE_INDEX = -100

# One record per sentence: input_ids, attention_mask and one label stream per feature
tokenized_sentences = []

for sent in tqdm(sentences):
    words = sent["words"]

    # Per-word binary labels for each linguistic feature
    feature_labels = get_labels(sent["pos_tags"])

    input_ids = []
    label_streams = {key: [] for key in LABEL_KEYS}

    for word_idx, word in enumerate(words):
        # GPT-2's byte-level BPE folds the preceding space into the token, so a
        # word tokenized without it gets the sub-tokens of a sentence-initial
        # word. Prefix a space on every word after the first emitted one so the
        # ids match what the model sees in running, space-separated text.
        text = word if not input_ids else " " + word
        word_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

        if not word_ids:
            continue  # Skip words that produce no sub-tokens

        input_ids.extend(word_ids)

        # Label the first sub-token; mask the remaining ones
        masked_tail = [IGNORE_INDEX] * (len(word_ids) - 1)
        for key in LABEL_KEYS:
            label_streams[key].append(feature_labels[key][word_idx])
            label_streams[key].extend(masked_tail)

    # Every label stream must stay aligned with the sub-token sequence.
    assert all(len(stream) == len(input_ids) for stream in label_streams.values())

    # Store everything (key order: input_ids, attention_mask, then the labels)
    record = {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
    }
    for key in LABEL_KEYS:
        record[f"labels_{key}"] = label_streams[key]
    tokenized_sentences.append(record)

print(f"Tokenized {len(tokenized_sentences)} sentences.")
# Guard the preview so an empty input list does not raise IndexError.
if tokenized_sentences:
    print("Example:")
    example = tokenized_sentences[0]
    print(tokenizer.convert_ids_to_tokens(example["input_ids"]))
    print(example["labels_function_content"])


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'sentences' is not defined