# Objective: Given a verb, a valence for the verb and a corpus, find the clusters of semantic type that fill the valence of the verb and return a probability distribution over the clusters.

First of all let's extract sentences that contain the verb "see". We'll use nltk to extract the sentences.

In [17]:
import nltk
from nltk.corpus import brown
import spacy

spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
nltk.download('brown')

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\amato\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [11]:
# Extract sentences that contain the verb "see"

sentences_with_see = []

# Search through the sentences in the Brown Corpus
for sentence in brown.sents():
    if "see" in sentence:
        sentences_with_see.append(" ".join(sentence))

print(sentences_with_see[:5])

["Barber , who is in his 13th year as a legislator , said there `` are some members of our congressional delegation in Washington who would like to see it ( the resolution ) passed '' .", 'It does not take a Gallup poll to find out that most Republicans in Congress feel this understates the situation as Republicans see it .', "He said , `` You can see signs of the Republicans' feeble attack on the Meyner administration .", "`` It's ' See Joe , see Jim ' '' , he says .", 'Barnett , as the titular head of the Democratic party , apparently must make the move to reestablish relations with the national Democratic party or see a movement come from the loyalist ranks to completely bypass him as a party functionary .']


In [18]:
def find_subject_object(sentence):
    # Process the sentence
    doc = nlp(sentence)
    
    for token in doc:
        # Find the verb "see"
        if token.lemma_ == "see":
            subject = None
            obj = None
            
            # Check if the token's head is our verb of interest
            for child in token.children:
                # Identify the subject
                if child.dep_ in ["nsubj", "nsubjpass"]:
                    subject = child.text
                # Identify the object
                elif child.dep_ in ["dobj", "pobj"]:
                    obj = child.text
            
            return subject, obj

    # Return None if the verb "see" is not found
    return None, None

# Example sentences
sentences = ["My dog sees the cat.", "You see him."]

for sentence in sentences:
    subject, obj = find_subject_object(sentence)
    if subject and obj:
        print(f"Sentence: \"{sentence}\" - Subject: {subject}, Object: {obj}")
    else:
        print(f"Sentence: \"{sentence}\" does not contain the verb 'see' with a clear subject and object.")

Sentence: "My dog sees the cat." - Subject: dog, Object: cat
Sentence: "You see him." - Subject: You, Object: him


In [24]:
# Extract sentences that contain the verb "see"
sub_obj_pairs = []

for sentence in sentences_with_see:
    sub, obj = find_subject_object(sentence)
    if sub and obj:
        sub_obj_pairs.append((sub, obj))

In [25]:
print(len(sub_obj_pairs))

for pair in sub_obj_pairs[:5]:
    print(pair)

200
('Republicans', 'it')
('You', 'signs')
('they', 'approach')
('year', 'cleaning')
('you', 'headline')
