Objective: Given a **verb**, a **valence** for the verb and a **corpus**, find the clusters of semantic type that fill the valence of the verb and return a probability distribution over the clusters.

We'll follow these steps:
1. Load the sub-obj pairs
2. Load the semantic types inventory
3. For each word (subject or object) in the sub-obj pairs find the corresponding semantic types
4. Create objects that hold sub-obj pairs and their number of occurrences

In [229]:
# import libraries
from typing import Tuple, Dict, List

import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset
import spacy

nltk.download('wordnet')

spacy.cli.download("en_core_web_md")
nlp = spacy.load("en_core_web_md")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gianl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## 1. Load the sub-obj pairs

In [230]:
# open dataframe
df = pd.read_csv('res/see_sub_obj_pairs.csv')
df.head()

Unnamed: 0,subject,object
0,Republicans,it
1,You,signs
2,Democrats,opportunity
3,which,soldiers
4,they,approach


## 2. Load the semantic types inventory

In [231]:
# holds a mapping (wordnet id, wordnet pos) -> semantic type
wn_semantic_type_dict: Dict[Tuple[int, str], str] = {}

# open semantic types inventory
with open('res/st_inventory.txt', 'r') as f:
    # shorter version
    # wn_semantic_type_dict = {line.split('\t')[0]: line.split('\t')[1] for line in f.read().splitlines()[:10]}
    
    st_inventory = f.read().splitlines()
    # Each line is like this: wn:08641944n\tGEOGRAPHY_AND_PLACES_
    for line in st_inventory:
        l = line.split('\t')
        wn_id = l[0]
        # separate id in the form wn:08641944n to [08641944, n]
        wn_id_number, wn_id_pos = int(wn_id[3:-1]), wn_id[-1]
        semantic_type = l[1]
        wn_semantic_type_dict[(wn_id_number, wn_id_pos)] = semantic_type
        
# print first 10 entries
for i, (wn_id, st) in enumerate(wn_semantic_type_dict.items()):
    print(wn_id, st)
    if i == 10:
        break

(8641944, 'n') GEOGRAPHY_AND_PLACES_
(8950407, 'n') GEOGRAPHY_AND_PLACES_
(4502851, 'n') WARFARE_DEFENSE_AND_VIOLENCE_
(13742358, 'n') MATHEMATICS_
(13742573, 'n') MATHEMATICS_
(14930670, 'n') CHEMISTRY_AND_MINERALOGY_
(475142, 'n') SPORT_GAMES_AND_RECREATION_
(13746512, 'n') MATHEMATICS_
(13750415, 'n') MATHEMATICS_
(13750844, 'n') MATHEMATICS_
(13751265, 'n') MATHEMATICS_


In [232]:
# print semantic types as a set
print(set(wn_semantic_type_dict.values()))

{'SPACE_AND_TOUCH_', 'LANGUAGE_AND_LINGUISTICS_', 'COMPUTING_', 'LIQUID_AND_GAS_', 'SPORT_GAMES_AND_RECREATION_', 'BUSINESS_ECONOMICS_AND_FINANCE_', 'FOOD_DRINK_AND_TASTE_', 'GEOLOGY_AND_GEOPHYSICS_', 'CULTURE_ANTHROPOLOGY_AND_SOCIETY_', 'ART_ARCHITECTURE_AND_ARCHAEOLOGY_', 'MATHEMATICS_', 'MUSIC_SOUND_AND_DANCING_', 'METEOROLOGY_', 'PHILOSOPHY_PSYCHOLOGY_AND_BEHAVIOR_', 'FISHING_AND_HUNTING_', 'CRAFT_ENGINEERING_AND_TECHNOLOGY_', 'POLITICS_GOVERNMENT_AND_NOBILITY_', 'EMOTIONS_', 'EVALUATION_', 'ENVIRONMENT_', 'WARFARE_DEFENSE_AND_VIOLENCE_', 'BIOLOGY_', 'TIME_', 'RELIGION_MYSTICISM_AND_MYTHOLOGY_', 'GEOGRAPHY_AND_PLACES_', 'NUMISMATICS_AND_CURRENCIES_', 'COMMUNICATION_AND_TELECOMMUNICATION_', 'SEX_', 'CHEMISTRY_AND_MINERALOGY_', 'MEDIA_', 'TEXTILE_FASHION_AND_CLOTHING_', 'GENERAL_', 'LAW_AND_CRIME_', 'OLFACTORY_', 'VISUAL_', 'LITERATURE_AND_THEATRE_', 'FARMING_', 'HEALTH_AND_MEDICINE_', 'NAUTICAL_', 'CHORES_AND_ROUTINE_', 'PHYSICS_AND_ASTRONOMY_', 'EDUCATION_AND_SCIENCE_', 'TRANSPORT_

In [233]:
# retrieve wordnet synset from wordnet id
def get_synset_from_id(wn_id: Tuple[int, str]) -> Synset:
    """
    Given a wordnet id (id, pos), return the corresponding synset
    """
    return wn.synset_from_pos_and_offset(wn_id[1], wn_id[0])

# print first 10 synsets
for wn_id in list(wn_semantic_type_dict.keys())[:10]:
    print(get_synset_from_id(wn_id))

Synset(''hood.n.01')
Synset('the_hague.n.01')
Synset('twenty-two.n.02')
Synset('zero.n.02')
Synset('one.n.01')
Synset('lauryl_alcohol.n.01')
Synset('one-hitter.n.01')
Synset('ten.n.01')
Synset('hundred.n.01')
Synset('thousand.n.01')


In [234]:
# save list of synsets available
synset_list: List[Synset] = []
living_entity_pronouns = ["I", "you", "he", "she", "we", "they", "me", "you", "him", "her", "us", "them"]
object_pronouns = ["it"]

for wn_id in list(wn_semantic_type_dict.keys()):
    synset_list.append(get_synset_from_id(wn_id))

In [235]:
# for each word in the sub-obj pairs, find all their synsets
word_sem_type_dict: Dict[str, List[str]] = {}
lemmatizer = nltk.WordNetLemmatizer()

def get_common_nouns_sem_types(word) -> List[str]:
    """
    Given a word, retrieve all the synsets from st_inventory where the word is included, and return the corresponding semantic types
    """
    sem_types = []
    
    for synset in synset_list: # iterate synsets in the st_inventory
        if word in synset.lemma_names(): # if the word is in the synset
            # get the sem type from st inventory corresponding to the synset (identified by offset, pos)
            sem_type = wn_semantic_type_dict[(synset.offset(), synset.pos())]
            sem_types.append(sem_type)
    
    return sem_types

In [236]:
def get_pronoun_sem_types(pronoun: str) -> List[str]:
    """
    Given a pronoun, return the semantic types of the pronoun
    """
    if pronoun in living_entity_pronouns:
        return ["LIVING_ENTITY_"]
    elif pronoun in object_pronouns:
        return ["OBJECT_"]

In [237]:
def ner_single_word(word):
    # Process the word with spaCy
    doc = nlp(word)
    # Return the entity type if available
    return [(ent.text, ent.label_) for ent in doc.ents]

def get_named_entity_sem_types(named_entities) -> List[str]:
    """
    Given a named entity, return the semantic types of the named entity
    """
    # Retrieve the entity type if available
    for ent in named_entities:
        sem_type = ent[1]
        if sem_type == "ORG":
            return ["ORGANIZATION_"]
        elif sem_type == "GPE":
            return ["GEOGRAPHY_AND_PLACES_"]
        elif sem_type == "NORP":
            return ["NATIONALITIES_OR_RELIGIONS_OR_POLITICAL_GROUPS_"]
        else:
            return [sem_type]

In [238]:
def select_most_freq_sem_type(sem_types: List[str], n: int):
    """
    Given a list of semantic types, select the n most frequent semantic types
    """
    sem_type_freq = {}
    
    for sem_type in sem_types:
        if sem_type in sem_type_freq:
            sem_type_freq[sem_type] += 1
        else:
            sem_type_freq[sem_type] = 1
    
    # sort the semantic types by frequency, uses the frequency as the key
    sorted_sem_type_freq = sorted(sem_type_freq.items(), key=lambda x: x[1], reverse=True)
    # takes the first n semantic types from the sorted list
    return [sem_type for sem_type, freq in sorted_sem_type_freq[:n]]

In [239]:
unkonwn_words = set()

def manage_word(word: str):
    lemmatized_word = lemmatizer.lemmatize(word)

    if lemmatized_word not in word_sem_type_dict: # if the word is not already in the dictionary
        sem_type_found = False # flag to check if we can stop searching for semantic types
        sem_types = []
        
        # if the word is a pronoun
        if word.lower() in living_entity_pronouns + object_pronouns:
            sem_types = get_pronoun_sem_types(word.lower())
            sem_type_found = True
        
        named_entities = ner_single_word(word)
        if not sem_type_found and named_entities:
            sem_types = get_named_entity_sem_types(named_entities)
        elif not sem_type_found:
            sem_types = get_common_nouns_sem_types(lemmatized_word)
        
        if sem_types:
            frequent_sem_types = select_most_freq_sem_type(sem_types, 2)
            word_sem_type_dict[lemmatized_word] = frequent_sem_types
        else:
            unkonwn_words.add(word)

In [240]:
for i, row in df.iterrows():
    sub: str = row['subject']
    obj: str = row['object']
    
    manage_word(sub)
    manage_word(obj)

In [241]:
# print unknown words
print(unkonwn_words)

{'Hippodrome', 'which', 'Paree', 'Clerfayt', 'Poet', 'anyone', 'Helva', 'Seigner', 'anything', 'Juanita', 'Gun', "'s", 'Secretary', 'hovering', 'Prevot', 'circling', 'Some', 'Most', 'himself', 'something', 'Analysts', 'Crumb', 'that', 'Woman', 'realtor', 'Sparling', 'any', 'itself', 'oneself', 'Kaisers', 'who', 'others', 'Fountain', 'what', 'Ulyate', 'Rector', 'inscription', 'whatever', 'Negroes', 'Pieta', 'Littlepage', 'Bird', 'What', 'Teachers', 'whom', 'inculcation', 'Guardino', 'Grabski', 'Investors', 'Plains', 'themselves', 'Cousin'}


In [242]:
# print first 10 entries
for i, (word, synsets) in enumerate(word_sem_type_dict.items()):
    print(word, synsets)

Republicans ['NATIONALITIES_OR_RELIGIONS_OR_POLITICAL_GROUPS_']
it ['OBJECT_']
You ['LIVING_ENTITY_']
sign ['LANGUAGE_AND_LINGUISTICS_', 'RELIGION_MYSTICISM_AND_MYTHOLOGY_']
Democrats ['NATIONALITIES_OR_RELIGIONS_OR_POLITICAL_GROUPS_']
opportunity ['GENERAL_']
soldier ['WARFARE_DEFENSE_AND_VIOLENCE_', 'BIOLOGY_']
they ['LIVING_ENTITY_']
approach ['SEX_', 'PHILOSOPHY_PSYCHOLOGY_AND_BEHAVIOR_']
year ['HISTORY_', 'TIME_']
cleaning ['CULTURE_ANTHROPOLOGY_AND_SOCIETY_']
you ['LIVING_ENTITY_']
headline ['MEDIA_']
player ['BUSINESS_ECONOMICS_AND_FINANCE_', 'MUSIC_SOUND_AND_DANCING_']
film ['MEDIA_', 'VISUAL_']
sight ['VISUAL_', 'PHILOSOPHY_PSYCHOLOGY_AND_BEHAVIOR_']
I ['MATHEMATICS_', 'CHEMISTRY_AND_MINERALOGY_']
move ['SPACE_AND_TOUCH_', 'PHILOSOPHY_PSYCHOLOGY_AND_BEHAVIOR_']
Don ['PERSON']
chum ['CULTURE_ANTHROPOLOGY_AND_SOCIETY_', 'FISHING_AND_HUNTING_']
Kennedy ['PERSON']
value ['PHILOSOPHY_PSYCHOLOGY_AND_BEHAVIOR_', 'MATHEMATICS_']
we ['LIVING_ENTITY_']
member ['POLITICS_GOVERNMENT_AND_N