In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# get list of wordnet entities that need semantic universals
import json

with open("/content/drive/MyDrive/Colab Notebooks/llms4ol/wordnet/data/wordnet_train.json", "r") as json_data:
    wordnet_json = json.loads(json_data.read())
    json_data.close()

print(wordnet_json[0])

{'ID': '__land_reform_NN_1', 'term': 'land reform', 'type': 'noun', 'sentence': ''}


In [3]:
# get all wordnet entities
entities_list = [i['type'] for i in wordnet_json]
# remove all duplicate entities
entities = list(set(entities_list))
len(entities)
print(entities)

['adjective', 'verb', 'adverb', 'noun']


In [4]:
!pip install sparqlwrapper

Collecting sparqlwrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib>=6.1.1 (from sparqlwrapper)
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting isodate<0.7.0,>=0.6.0 (from rdflib>=6.1.1->sparqlwrapper)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib, sparqlwrapper
Successfully installed isodate-0.6.1 rdflib-7.0.0 sparqlwrapper-2.0.0


In [5]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [6]:
# get id of entity

import requests

def get_entity_id(entity):
  API_ENDPOINT = "https://www.wikidata.org/w/api.php"

  # get correct id for our entities
  if entity == "noun":
    return "Q1084" # for entity disambiguation
  elif entity == "verb":
    return "Q24905" # for entity disambiguation
  elif entity == "adjective":
    return "Q34698" # for entity disambiguation
  else:
    return "Q380057" # for entity disambiguation

  params = {
      'action': 'wbsearchentities',
      'format': 'json',
      'language': 'en',
      'search': query
  }

  r = requests.get(API_ENDPOINT, params = params)

  print("*** ENTITY: ", entity)
  print("*** DICT: ",r.json()['search'][0])
  entity_dict = r.json()['search'][0]
  return entity_dict['id']

In [7]:
# get semantic information for entity
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

def get_entity_semantic_information(entity_id):
  sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

  # entity id for 'mountain' retrieved from wikidata api
  #entity_id = 'wd:Q8502'

  # From https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples#Cats
  sparql.setQuery("""
SELECT ?item ?itemLabel ?subclass ?subclassLabel ?instance ?instanceLabel ?part ?partLabel ?category ?categoryLabel ?properties ?propertiesLabel ?represents ?representsLabel ?desc ?descLabel {
  VALUES (?item) { (wd:"""+entity_id+""") }
  VALUES ?missing { "n/a" }
  OPTIONAL { ?item wdt:P31 ?instance . }
  OPTIONAL { ?item wdt:P279 ?subclass . }
  OPTIONAL { ?item wdt:P361 ?part . }
  OPTIONAL { ?item wdt:P373 ?category . }
  OPTIONAL { ?item wdt:P1963 ?properties . }
  OPTIONAL { ?item wdt:P1268 ?represents . }
  OPTIONAL { ?item schema:description ?desc .
           FILTER(LANG(?desc) = "en") .}
  BIND(if(bound(?subclass)  , ?subclassLabel , ?missing)  as ?subclassLabel)
  BIND(if(bound(?instance)  , ?instanceLabel , ?missing)  as ?instanceLabel)
  BIND(if(bound(?part)  , ?partLabel , ?missing)  as ?partLabel)
  BIND(if(bound(?category)  , ?categoryLabel , ?missing)  as ?categoryLabel)
  BIND(if(bound(?properties)  , ?propertiesLabel , ?missing)  as ?propertiesLabel)
  BIND(if(bound(?represents)  , ?representsLabel , ?missing)  as ?representsLabel)
  BIND(if(bound(?desc)  , ?descLabel , ?missing)  as ?descLabel)
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
  """)
  sparql.setReturnFormat(JSON)
  results = sparql.query().convert()

  results_df = pd.json_normalize(results['results']['bindings'])
  #results_df[['itemLabel.value', 'subclassLabel.value', 'instanceLabel.value', 'partLabel.value', 'categoryLabel.value', 'propertiesLabel.value', 'descLabel.value', 'representsLabel.value']].head()
  return results_df

In [8]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# transform description field into noun groups to get semantic universal nouns for a given entity
def get_description_noun_groups(results_df):
  word_list = []
  for txt in results_df["descLabel.value"].values:
    word_list.append([word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(txt)) if pos[0] == 'N'])
  results_df["descLabel.noungroups"] = word_list
  results_df["descLabel.noungroups"] = [','.join(item) for item in results_df["descLabel.noungroups"]]
  results_df = results_df.apply(lambda x: x.astype(str).str.lower())
  #results_df.head(5)
  return results_df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [9]:
# generate semantic primes for entity
def generate_semantic_primes(results_df):
  results_df['primes'] = results_df['subclassLabel.value'] + ',' + results_df['instanceLabel.value'] + ',' + results_df['partLabel.value'] + ',' + results_df['categoryLabel.value'] + ',' + results_df['propertiesLabel.value'] + ',' + results_df['descLabel.value'] + ',' + results_df['descLabel.noungroups'] + ',' + results_df['representsLabel.value']
  results_df['primes'] = results_df['primes'].apply(lambda x: ','.join(list(set(x.split(',')))))
  #results_df.head(5)
  return results_df

In [10]:
# generate list of universal semantic primes for a given entity by removing empty string values and duplicates
def generate_universal_primes(results_df):
  primes_list = results_df['primes'].values
  primes_string = ','.join(primes_list)
  universal_primes = ','.join(list(set(list(filter(None, primes_string.split(','))))))
  universal_primes = universal_primes.replace('n/a,', '').replace(',n/a', '')
  return universal_primes

In [11]:
# create new dataframe with entity and universal semantic primes
def generate_all_entities_and_primes(entity_and_primes,entity, universal_primes):
  entity_and_primes.append([entity, universal_primes])
  return entity_and_primes

In [12]:
# compute sentence embeddings for semantic universal primes
from sentence_transformers import SentenceTransformer

# https://huggingface.co/thenlper/gte-large
embedding_model = SentenceTransformer("thenlper/gte-large")


def get_embedding(text: str) -> list[float]:
    if not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)

    return embedding.tolist()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [13]:
entity_and_primes = []

for entity in entities:
  entity_id = get_entity_id(entity)
  results_df = get_entity_semantic_information(entity_id)
  results_df = get_description_noun_groups(results_df)
  results_df = generate_semantic_primes(results_df)
  universal_primes = generate_universal_primes(results_df)
  entity_and_primes = generate_all_entities_and_primes(entity_and_primes, entity, universal_primes)

wordnet_semantic_tower = pd.DataFrame(entity_and_primes, columns=['entity', 'primes'])

wordnet_semantic_tower["embedding"] = wordnet_semantic_tower["primes"].apply(get_embedding)

wordnet_semantic_tower.head(5)

Unnamed: 0,entity,primes,embedding
0,adjective,part of speech that describes a noun or pronou...,"[-0.0012560616014525294, -0.022227883338928223..."
1,verb,"state,verbs,occurrence,auxiliary verb, contain...","[-0.01781720668077469, -0.022800635546445847, ..."
2,adverb,"adjective,adverb,content word,part of speech,...","[-0.005304735153913498, -0.02259357087314129, ..."
3,noun,"substance,part of speech,set,objects,object,wo...","[-0.0038496803026646376, -0.01698467694222927,..."


In [14]:
# save final dataframe
wordnet_semantic_tower.to_csv('wordnet_semantic_primes.csv', index=False)