Objective: Given a **verb**, a **valence** for the verb, and a **corpus**, find the clusters of semantic types that fill the valence of the verb and return a probability distribution over those clusters.

We'll follow these steps:
1. Load the sub-obj pairs
2. Load the semantic types inventory
3. For each word (subject or object) in the sub-obj pairs:
    1. Retrieve the synsets of the word
    2. Retrieve the semantic type of each synset
    3. Add the semantic type to the list of semantic types for the word
4. Create objects that hold sub-obj pairs and their number of occurrences

In [59]:
# import libraries
from typing import Tuple, Dict, Set, List

import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset

# make sure the WordNet corpus data is available locally before it is queried
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amato\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 1. Load the sub-obj pairs

In [60]:
# read the subject-object pairs from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='res/see_sub_obj_pairs.csv')
# preview the first five rows
df.head(5)

Unnamed: 0,Subject,Object
0,Republicans,it
1,You,signs
2,Democrats,opportunity
3,which,soldiers
4,they,approach


## 2. Load the semantic types inventory

In [66]:
# holds a mapping (wordnet id, wordnet pos) -> semantic type
wn_semantic_type_dict: Dict[Tuple[int, str], str] = {}

# open semantic types inventory
# (explicit encoding so that decoding does not depend on the platform default)
with open('res/st_inventory.txt', 'r', encoding='utf-8') as f:
    st_inventory = f.read().splitlines()

# Each line is like this: wn:08641944n\tGEOGRAPHY_AND_PLACES_
for line in st_inventory:
    # tolerate blank lines (e.g. a trailing empty line) instead of failing on a missing field
    if not line.strip():
        continue
    fields = line.split('\t')
    wn_id, semantic_type = fields[0], fields[1]
    # separate id in the form wn:08641944n to [08641944, n]:
    # drop the 'wn:' prefix, parse the digits as the offset, keep the last character as the pos
    wn_id_number, wn_id_pos = int(wn_id[3:-1]), wn_id[-1]
    wn_semantic_type_dict[(wn_id_number, wn_id_pos)] = semantic_type
        
# print first 10 entries
# (the slice bounds the preview to exactly 10 items; dicts iterate in insertion order,
# so these are the first 10 lines read from the inventory)
for wn_id, st in list(wn_semantic_type_dict.items())[:10]:
    print(wn_id, st)

(8641944, 'n') GEOGRAPHY_AND_PLACES_
(8950407, 'n') GEOGRAPHY_AND_PLACES_
(4502851, 'n') WARFARE_DEFENSE_AND_VIOLENCE_
(13742358, 'n') MATHEMATICS_
(13742573, 'n') MATHEMATICS_
(14930670, 'n') CHEMISTRY_AND_MINERALOGY_
(475142, 'n') SPORT_GAMES_AND_RECREATION_
(13746512, 'n') MATHEMATICS_
(13750415, 'n') MATHEMATICS_
(13750844, 'n') MATHEMATICS_
(13751265, 'n') MATHEMATICS_


In [62]:
# map an inventory key back to its wordnet synset
def get_synset_from_id(wn_id: Tuple[int, str]) -> Synset:
    """
    Look up the synset identified by a wordnet id given as an (offset, pos) pair
    """
    offset = wn_id[0]
    pos = wn_id[1]
    # the wordnet reader expects the part of speech first, then the offset
    return wn.synset_from_pos_and_offset(pos, offset)

# show the synsets behind the first 10 keys of the inventory
for wn_id in list(wn_semantic_type_dict)[:10]:
    synset = get_synset_from_id(wn_id)
    print(synset)

Synset(''hood.n.01')
Synset('the_hague.n.01')
Synset('twenty-two.n.02')
Synset('zero.n.02')
Synset('one.n.01')
Synset('lauryl_alcohol.n.01')
Synset('one-hitter.n.01')
Synset('ten.n.01')
Synset('hundred.n.01')
Synset('thousand.n.01')


In [ ]:
# resolve every key of the inventory to its synset, in the inventory's order
synset_list: List[Synset] = []
synset_list.extend(map(get_synset_from_id, wn_semantic_type_dict))

In [ ]:
# lemmatizer applied to every word before its synsets are looked up
lemmatizer = nltk.WordNetLemmatizer()
# lemmatized word -> all the synsets found for that word
word_synset_dict: Dict[str, List[Synset]] = {}

def add_word_synsets(word, word_synset_dict):
    """
    Lemmatize a word and record in word_synset_dict every synset of synset_list
    whose lemma names include the lemmatized word.

    The dictionary is updated in place, keyed by the lemmatized word; a word
    whose lemma is already a key is left untouched.
    """
    lemma = lemmatizer.lemmatize(word)
    # already processed: nothing to do
    if lemma in word_synset_dict:
        return

    # register the key first, then fill its list while scanning the inventory synsets
    matches = []
    word_synset_dict[lemma] = matches
    matches.extend(
        candidate for candidate in synset_list
        if lemma in candidate.lemma_names()
    )

# collect the synsets of every subject and object in the pairs
# NOTE: the DataFrame columns are capitalised ('Subject', 'Object'), so they
# must be addressed with exactly that spelling; lowercase keys raise KeyError.
for subject, obj in zip(df['Subject'], df['Object']):
    for word in (subject, obj):
        # cells pandas parsed as missing values are float NaN, not text, and cannot be lemmatized
        if isinstance(word, str):
            add_word_synsets(word, word_synset_dict)