In [2]:
import pandas as pd

# Dependency parsing and coreference resolution 
import spacy
from spacy import displacy

# Sentiment analysis 
from textblob import TextBlob
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# csv parsing 
import csv





[nltk_data] Downloading package punkt to
[nltk_data]     /Users/manuelnunezmartinez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/manuelnunezmartinez/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


The function below is used to visualaize a depensncy tree using an nlp doc.

In [3]:
# Uses displacy to visualize a sentences dependency tree 

def visualize_dependency_tree(doc):
    
    # Visualize the dependency tree
    displacy.render(doc, style="dep")

The Function below is used to recoginize multi-word entities as one particular entity when dependency parsing. 

In [4]:
def retokenize_entities(doc):

    # Step 1: Identify Multi-Word Named Entities
    multi_word_entities = []
    for ent in doc.ents:
        if len(ent) > 1:
            multi_word_entities.append(ent)

    # Step 2: Merge Tokens
    for ent in multi_word_entities:
        with doc.retokenize() as retokenizer:
            start = ent.start
            end = ent.end
            retokenizer.merge(doc[start:end])

In the first function below we pass in a dictionary of adjectives/verbs as keys pointing at the nouns they describe, flipping that dictionary, and returning a dictionary of nouns as keys pointing at the adjectives describing/verbs them

The second function takes in as input a dictionary of nounc pointing to verbs/adjectives and returns same dictionary with the adjectives/verbs substitutes for their valence scors. 

In [5]:
def format_relations(relations):
    sent_dic = {}

    for key in relations:
        for ref in relations[key]:
            if ref not in sent_dic:
                sent_dic[ref] = [key]
            else:
                sent_dic[ref].append(key)

    for key, value in sent_dic.items():
        sent_dic[key] = sorted(value)

    return sent_dic


def vectorize_dic(sent_dic):
    for key in sent_dic:
        sentiment = 0
        for adj in sent_dic[key]:
            adj_blob = TextBlob(adj)
            adj_sentiment = adj_blob.sentiment.polarity
            sentiment += adj_sentiment
        sent_dic[key] = sentiment
    return sent_dic


The following function provides a simple approach to finding relations between adjectives and nouns. It was ultimatley not use given its limited scope. 

In [6]:
def extract_nouns(sentence):
    # Load SpaCy model
    nlp = spacy.load("en_core_web_sm")
    
    # Process the input sentence
    doc = nlp(sentence)
    
    # Initialize a dictionary to store nouns and their references
    noun_dict = {}
    
    # Iterate through each token in the sentence
    for token in doc:
        # Check if the token is a noun
        if token.pos_ == "NOUN":
            # If the noun is not in the dictionary, add it as a key
            if token.text not in noun_dict:
                noun_dict[token.text] = []
            # Iterate through children of the noun
            for child in token.children:
                # If the child is an adjective modifying the noun, add it to the list of references
                if child.dep_ == "amod":
                    noun_dict[token.text].append(child.text)
                # If the child is a possessive case modifying the noun, add it to the list of references
                elif child.dep_ == "poss":
                    noun_dict[token.text].append(child.text + "'s")
    
    return noun_dict

# Example sentence
sentence = "The cat chased the mouse. The mouse ran away."

# Extract nouns and their references
noun_references = extract_nouns(sentence)

# Print the dictionary
for noun, references in noun_references.items():
    print(f"{noun}: {references}")

  from .autonotebook import tqdm as notebook_tqdm
2024-03-26 18:44:50.129685: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


cat: []
mouse: []


The set of functions below efficiently find the closest instance of a POS to another POS. 

In [6]:
def find_distances(sentence, from_pos, to_pos):   
    sent_dic = {}
    # Create memoization array 
    mem = [(-1, "")] * len(sentence)

    #mark closest predecessors 
    mark_down(sentence.root, to_pos, mem)
    
    #consider succesors and finalize dictionary 
    for token in sentence:
        if token.pos_ in from_pos:
            # cretae dictionary entry for token
            if token.text not in sent_dic:
                sent_dic[token.text] = []
            
            if token != sentence.root:
                mark_up(token, mem, token.head, 1)
            
            sent_dic[token.text].append(mem[token.i][1])
            

    return sent_dic


# check parent node recursivley as long as current distance from a noun isnt surpassed 
def mark_up(ref, mem, token, dist):
    closest = mem[token.i][0]
    if closest != -1 and (closest + dist < mem[ref.i][0] or mem[ref.i][0] == -1):
        mem[ref.i] = (mem[token.i][0] + dist, mem[token.i][1])
    
    if token.dep_ != "ROOT":
        mark_up(ref, mem, token.head, dist + 1)


# mark all entires in memoization array with closest noun succesor 
def mark_down(token, to_pos, mem):
    if token.pos_ in to_pos:
        mem[token.i] = (0, token.text)

    dist = (10000, "")
    for child in token.children:
        mark_down(child, to_pos, mem)
        closest = mem[child.i][0]
        if closest != -1 and closest < dist[0]:
            dist = (closest, mem[child.i][1])
        
    if dist[0] != 10000 and token.pos_ not in to_pos:
        update = (dist[0] + 1, dist[1])
        mem[token.i] = update
            

Testing the POS relation algorithm which simply relies on the closest relation. 

In [7]:
document = "Dear Media: Please, too, stop assuming the Democrats will lose the midterms."
# Load spaCy model for English
nlp = spacy.load("en_core_web_sm")
# Process the input sentence
doc = nlp(document)
retokenize_entities(doc)

visualize_dependency_tree(doc)

for sentence in doc.sents:
    relations_adj = find_distances(sentence, ["ADJ"], ["NOUN", "PROPN"])
    relation_dic_adj = format_relations(relations_adj)
    print(relation_dic_adj)

    relations_verb = find_distances(sentence, ["VERB"], ["NOUN", "PROPN"])
    relation_dic_verb = format_relations(relations_verb)
    print(relation_dic_verb)

  from .autonotebook import tqdm as notebook_tqdm
2024-04-04 16:20:20.725495: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


{'Media': ['Dear']}
{}


IndexError: list assignment index out of range

The function below parses a csv containing sentences and the human annoated relations found between nouns and adjectives/verbs. 

In [9]:
def parse_csv(filename):
    data = []
    with open(filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        sentence = None
        relations = {}
        for row in reader:
            if not row:  # Empty line indicates new sentence
                if sentence is not None:
                    data.append((sentence, relations))
                    sentence = None
                    relations = {}
            else:
                if sentence is None:
                    sentence = ','.join(row)
                else:
                    row_str = ','.join(row)
                    noun, referenced_string = row_str.split(': ')
                    references = referenced_string.split(',')
                    relations[noun] = references
        if sentence is not None:  # Add the last sentence
            data.append((sentence, relations))
    return data


Below we test the POS refrence algorithm above to find precision and recall metrics. 

In [18]:
def test_POS_Reference(filepath, from_pos, to_pos):

    nlp = spacy.load("en_core_web_sm")
    parsed_data = parse_csv(filepath)

    total_present_relations = 0
    total_predicted_relations = 0
    total_correct_predictions = 0

    # for each sentence considered 
    for entry in parsed_data:
        # Process the input sentence
        doc = nlp(entry[0])
        retokenize_entities(doc)
        for sentence in doc.sents:
            #find and format relations through proximity 
            relations = find_distances(sentence, from_pos, to_pos)
            relation_dic = format_relations(relations)

            # add onto the total number of relations considered 
            for value in entry[1].values():
                total_present_relations += len(value)
            
            for value in relation_dic.values():
                total_predicted_relations += len(value)

            # check which relations were correctly predicted 
            correct = True
            for key in relation_dic:
                    if key in entry[1]:
                        for val in relation_dic[key]:
                            if val in entry[1][key]:
                                total_correct_predictions += 1
                            else:
                                # print(entry[0])
                                # print(f"Right Noun, Wrong Relation: {key} -> {val}")
                                # print()
                                correct = False
                    else:
                        # print(entry[0])
                        # print(f"Nonexistent Reference: {key} -> {relation_dic[key]}")
                        # print()
                        correct = False
            # visualize dependency tree if relations are incorrect 
            # if not correct:
            #     print("Relations found:",relation_dic)
            #     visualize_dependency_tree(doc)

    print("Percent of present relations outputted:", (total_correct_predictions/total_present_relations))
    print("Percent of correct predictions:", (total_correct_predictions/total_predicted_relations))


In [13]:
test_POS_Reference('POS_test_ADJ.csv', ['ADJ', 'ADV'], ['NOUN', 'PROPN'])


Percent of present relations outputted: 0.7887323943661971
Percent of correct predictions: 0.8235294117647058


In [19]:
test_POS_Reference('POS_test_VERB.csv', ['VERB'], ['NOUN', 'PROPN'])

Percent of present relations outputted: 0.6329113924050633
Percent of correct predictions: 0.684931506849315
