In this notebook we do the pre processing necessary to implement a small application of Hanks valence theory.

The steps are:
1. Extract sentences that contain the verb "see" or one of its forms (saw, seen, ...).
2. For each sentence, extract the subject and object of the verb "see".
3. Save the subject-object pairs in a CSV file.

In [1]:
import nltk
from nltk.corpus import brown
import spacy

spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
nltk.download('brown')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\amato\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

# 1. Extract sentences that contain the verb "see"

In [2]:
sentences_with_see = []
see_forms = ["see", "saw", "seen", "sees"]

# Search through the sentences in the Brown Corpus
for sentence in brown.sents():
    if any(form in sentence for form in see_forms):
        sentences_with_see.append(" ".join(sentence))

print(sentences_with_see[:5])

["`` This is one of the major items in the Fulton County general assistance program '' , the jury said , but the State Welfare Department `` has seen fit to distribute these funds through the welfare departments of all the counties in the state with the exception of Fulton County , which receives none of this money .", "Barber , who is in his 13th year as a legislator , said there `` are some members of our congressional delegation in Washington who would like to see it ( the resolution ) passed '' .", "-- After a long , hot controversy , Miller County has a new school superintendent , elected , as a policeman put it , in the `` coolest election I ever saw in this county '' .", "`` This was the coolest , calmest election I ever saw '' , Colquitt Policeman Tom Williams said .", 'It does not take a Gallup poll to find out that most Republicans in Congress feel this understates the situation as Republicans see it .']


In [3]:
def find_subject_object(sentence):
    # Process the sentence
    doc = nlp(sentence)
    
    for token in doc:
        # Find the verb "see"
        if token.lemma_ == "see":
            subject = None
            obj = None
            
            # Check if the token's head is our verb of interest
            for child in token.children:
                # Identify the subject
                if child.dep_ in ["nsubj", "nsubjpass"]:
                    subject = child.text
                # Identify the object
                elif child.dep_ in ["dobj", "pobj"]:
                    obj = child.text
            
            return subject, obj

    # Return None if the verb "see" is not found
    return None, None

# Example sentences
sentences = ["My dog sees the cat.", "You see him."]

for sentence in sentences:
    subject, obj = find_subject_object(sentence)
    if subject and obj:
        print(f"Sentence: \"{sentence}\" - Subject: {subject}, Object: {obj}")
    else:
        print(f"Sentence: \"{sentence}\" does not contain the verb 'see' with a clear subject and object.")

Sentence: "My dog sees the cat." - Subject: dog, Object: cat
Sentence: "You see him." - Subject: You, Object: him


# 2. Extract the subject and object of the verb "see" for each sentence

In [4]:
sub_obj_pairs = []

for sentence in sentences_with_see:
    sub, obj = find_subject_object(sentence)
    if sub and obj:
        sub_obj_pairs.append((sub, obj))

In [5]:
print(len(sub_obj_pairs))

for pair in sub_obj_pairs[:50]:
    print(pair)

476
('Republicans', 'it')
('You', 'signs')
('Democrats', 'opportunity')
('which', 'soldiers')
('they', 'approach')
('year', 'cleaning')
('you', 'headline')
('players', 'film')
('who', 'sights')
('I', 'it')
('Analysts', 'move')
('Don', 'chum')
('Kennedy', 'value')
('we', 'it')
('members', 'it')
('he', 'chance')
('Investors', 'element')
('settlers', 'them')
('regime', 'problems')
('who', 'justice')
('you', 'land')
('you', 'what')
('he', 'upturn')
('He', 'plan')
('Podger', 'feather')
('driver', 'turtle')
('who', 'eye')
('we', 'convicts')
('public', 'wagon')
('majority', 'party')
('you', 'situation')
('I', 'it')
('John', 'renewal')
('world', 'Catholicism')
('who', 'relation')
('We', 'nation')
('I', 'mention')
('We', 'enough')
('Chicago', 'Ballet')
('Russians', 'vitality')
('I', 'Fiorello')
('you', 'picture')
('Low', 'stupidities')
('You', 'tendency')
('I', 'Jouvet')
('who', 'Seigner')
('he', 'what')
('we', 'him')
('flower', 'indoors')
('Clerfayt', 'volcano')


# 3. Save the subject-object pairs in a CSV file

In [10]:
# save pairs found in res/see_sub_obj_pairs.csv
import csv

# newline="\n" is necessary to avoid blank lines between rows, otherwise the CSV writes new lines as "\r\n" because 
# this is the default new line character in Windows
with open("res/see_sub_obj_pairs.csv", "w", newline="\n") as f: 
    writer = csv.writer(f)
    writer.writerow(["Subject", "Object"])
    writer.writerows(sub_obj_pairs)