In [1]:
# Install the third-party dependencies into the running kernel's environment
# (%pip targets the kernel's interpreter; !pip may hit a different one).
# transformers is pinned to the release recorded in this cell's captured output.
%pip install transformers==4.24.0
%pip install SPARQLWrapper

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 5.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 33.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 11.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [129]:
import os
import json
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from SPARQLWrapper import SPARQLWrapper, JSON
from transformers import BertTokenizer,BertForMaskedLM
from torch.utils.data import TensorDataset,DataLoader,SequentialSampler

In [122]:
def vocab_sim(type_embedding,WE_module,vocab):
    '''Rank vocabulary tokens by cosine similarity to a type embedding.

    Args:
        type_embedding: tensor that is flattened into a single row vector
            before the comparison.
        WE_module: object exposing a ``weight`` tensor; each row is compared
            against ``type_embedding`` (e.g. an embedding layer).
        vocab: iterable of tokens, paired positionally with the rows of
            ``WE_module.weight``.

    Returns:
        List of ``(token, score)`` tuples ordered from the highest score to
        the lowest.
    '''
    embedding_rows = WE_module.weight.detach()
    query_row = type_embedding.reshape(1, -1)
    row_scores = torch.nn.functional.cosine_similarity(embedding_rows, query_row)
    # Pair every token with the score of the row at the same position.
    score_by_token = dict(zip(vocab, row_scores.cpu().numpy().tolist()))
    return sorted(score_by_token.items(), key=lambda pair: -pair[1])

def load_jsonl(file):
    '''Read a JSON Lines file into a list of decoded records.

    Args:
        file: path of the ``.jsonl`` file; every non-blank line holds one
            JSON document.

    Returns:
        List with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    '''
    data=[]
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file,'r',encoding='utf-8') as f:
        # Stream line by line instead of materialising the file with readlines().
        for line in f:
            # Tolerate blank lines (typically a trailing newline at end of file).
            if line.strip():
                data.append(json.loads(line))
    return data


# Shared client for the public Wikidata Query Service.
# Wikimedia's User-Agent policy asks automated clients to identify themselves;
# requests carrying only a generic library agent are more likely to be throttled.
# NOTE(review): add a contact address or URL to this string before heavy use.
sparql = SPARQLWrapper(
    "https://query.wikidata.org/sparql",
    agent="GoogleRE-prompt-notebook/1.0 (research notebook; SPARQLWrapper)",
)

def get_PoB(subj_id):
  '''Look up the label of a Wikidata entity's place of birth (property P19).

  Args:
    subj_id: Wikidata entity id such as ``'Q42'``.

  Returns:
    The ``objLabel`` value of the first result binding, or ``None`` when
    ``subj_id`` is not a well-formed entity id or the response contains no
    usable binding.
  '''
  # The id is spliced into the query text, so accept only a plain "Q<digits>"
  # identifier; anything else would produce a malformed (or injected) query.
  if not (isinstance(subj_id, str) and subj_id[:1] == 'Q'
          and subj_id[1:].isascii() and subj_id[1:].isdigit()):
    return None

  query = """
  SELECT ?obj ?objLabel WHERE
  {
      wd:!!SUBJ!! wdt:P19 ?obj .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  }
  """
  query=query.replace('!!SUBJ!!',subj_id)

  sparql.setQuery(query)
  sparql.setReturnFormat(JSON)
  results = sparql.query().convert()
  try:
    return results['results']['bindings'][0]['objLabel']['value']
  except (KeyError, IndexError, TypeError):
    # Empty bindings list or an unexpected response shape: treat as "no answer".
    # Other exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
    return None

In [123]:
# Cloze templates keyed by relation test-file name.
# [X] marks the subject slot and [Y] the object slot.
GRE_relations = dict([
    ("place_of_birth_test.jsonl", "[X] was born in [Y] ."),
    ("date_of_birth_test.jsonl", "[X] (born [Y])."),
    ("place_of_death_test.jsonl", "[X] died in [Y] ."),
])


# Load the pretrained masked-LM checkpoint and its tokenizer, then move the
# model to the GPU when one is available (CPU otherwise).
model_arch = 'bert-base-cased'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(model_arch)
# Token strings, in the order of the tokenizer's vocab mapping.
vocab = list(tokenizer.get_vocab())

model = BertForMaskedLM.from_pretrained(model_arch)
model.eval()  # evaluation mode for inference
model = model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [131]:
# Collect [subject label, place-of-birth label] pairs for every record that
# carries a Wikidata id and for which the lookup returns a label.
file = 'place_of_birth_test.jsonl'
concept_json = load_jsonl(file)

df = []
for record in tqdm(concept_json):
  wikidata_id, subject = record['sub_w'], record['sub_label']
  if not wikidata_id:
    continue
  birthplace = get_PoB(wikidata_id)
  if birthplace:
    df.append([subject, birthplace])

# NOTE(review): the records come from the place-of-birth file but the prompt
# uses the date-of-birth template; this looks deliberate (the table is saved as
# DoB_prompts.csv and scored with city type vectors) -- confirm.
prompt_template = GRE_relations['date_of_birth_test.jsonl']

final_df = pd.DataFrame(df, columns=['SUBJ', 'OBJ']).drop_duplicates()
final_df['sent'] = final_df.apply(
    lambda row: prompt_template.replace('[X]', row.SUBJ).replace('[Y]', '[MASK]'),
    axis=1,
)
final_df = final_df.rename(columns={'OBJ': 'gold'})
# Keep only the rows whose gold label appears verbatim in `vocab`.
final_df = final_df[final_df['gold'].apply(lambda label: label in vocab)]

100%|██████████| 2937/2937 [01:19<00:00, 37.14it/s]


In [152]:
# Write the filtered prompt table (columns SUBJ, gold, sent) to CSV without the
# index; this file is passed to MLM_Script.py via --file.
final_df.to_csv('DoB_prompts.csv',index=False)

In [153]:
# Run MLM_Script.py on the DoB prompts with the city type vectors.
# NOTE(review): MLM_Script.py is not part of this notebook. Judging by the
# captured output, a run without --manual/--k reports P@k for k=0..5 and then
# prints an "Optimal k" -- confirm against the script.
!python MLM_Script.py --file 'DoB_prompts.csv'\
                      --model_arch 'bert-base-cased'\
                      --concept_vector 'TypeVectors/City_vectors.pkl'


  0% 0/50 [00:00<?, ?it/s]100% 50/50 [00:00<00:00, 994.62it/s]



k=0
100% 8/8 [00:06<00:00,  1.26it/s]
P@1:0.0
P@10:0.0
P@50:0.0
P@100:0.0
Processing Time:0.10822057723999023
Infer Time:6.346968412399292



k=1
100% 8/8 [00:03<00:00,  2.37it/s]
P@1:0.0
P@10:0.0
P@50:0.0
P@100:0.0
Processing Time:0.10822057723999023
Infer Time:3.379904270172119



k=2
100% 8/8 [00:03<00:00,  2.06it/s]
P@1:0.16
P@10:0.28
P@50:0.4
P@100:0.52
Processing Time:0.10822057723999023
Infer Time:3.8791086673736572



k=3
100% 8/8 [00:04<00:00,  1.81it/s]
P@1:0.2
P@10:0.38
P@50:0.54
P@100:0.6
Processing Time:0.10822057723999023
Infer Time:4.423843860626221



k=4
100% 8/8 [00:05<00:00,  1.58it/s]
P@1:0.22
P@10:0.38
P@50:0.54
P@100:0.72
Processing Time:0.10822057723999023
Infer Time:5.067090749740601



k=5
100% 8/8 [00:04<00:00,  1.64it/s]
P@1:0.22
P@10:0.44
P@50:0.56
P@100:0.74
Processing Time:0.10822057723999023
Infer Time:4.890537261962891


Optimal k: 5
100% 939/939 [00:01<00:00, 871.87it/s]
100% 12/12 [01:

In [155]:

# Load the pickled type-vector dictionaries for the City and Year concepts.
# NOTE(review): pickle.load can execute arbitrary code; only load vector files
# that come from a trusted source.
with open('TypeVectors/City_vectors.pkl','rb') as city_file:
  city = pickle.load(city_file)
with open('TypeVectors/Year_vectors.pkl','rb') as year_file:
  year = pickle.load(year_file)

v1 = city['svd_vec']
v2 = year['svd_vec']

# Cosine similarity between the two SVD vectors.
common = v1.dot(v2) / (v1.norm() * v2.norm())

# Subtract the year vector, scaled by that similarity, from the city vector in place.
# NOTE(review): this matches an orthogonal projection removal only when both
# vectors have the same norm -- confirm that is the intended adjustment.
shared_part = common * v2
city['svd_vec'] -= shared_part

# Save the adjusted City dictionary under a new name; the source file is untouched.
with open('optim_City_vectors.pkl','wb') as out_file:
  pickle.dump(city, out_file)

In [156]:
# Re-run MLM_Script.py on the DoB prompts with the adjusted city vectors
# (optim_City_vectors.pkl), passing --manual with a fixed --k of 5.
# NOTE(review): the exact meaning of --manual/--k is defined in MLM_Script.py,
# which is not shown here -- confirm against the script.
!python MLM_Script.py --file 'DoB_prompts.csv'\
                      --model_arch 'bert-base-cased'\
                      --concept_vector 'optim_City_vectors.pkl'\
                      --manual\
                      --k 5

100% 939/939 [00:00<00:00, 1773.66it/s]
100% 12/12 [00:50<00:00,  4.22s/it]
P@1:0.18743343982960597
P@10:0.4440894568690096
P@50:0.617678381256656
P@100:0.7145899893503728
Processing Time:0.7427895069122314
Infer Time:50.659167766571045



In [159]:
# NOTE(review): `final_df_PoB` is not defined in any cell shown in this notebook
# (the execution counts jump from 156 to 159), so this line raises NameError on
# a fresh kernel unless the frame is created elsewhere -- confirm and restore
# the defining cell.
final_df_PoB.to_csv('PoB_prompts.csv',index=False)

In [160]:
# Run MLM_Script.py on the PoB prompts, passing --manual with a fixed --k of 0.
# NOTE(review): the concept vector path here is 'City_vectors.pkl', whereas the
# first run reads 'TypeVectors/City_vectors.pkl' -- confirm both files exist and
# hold the same vectors.
!python MLM_Script.py --file 'PoB_prompts.csv'\
                      --model_arch 'bert-base-cased'\
                      --concept_vector 'City_vectors.pkl'\
                      --manual\
                      --k 0

100% 939/939 [00:00<00:00, 1658.50it/s]
100% 12/12 [00:50<00:00,  4.21s/it]
P@1:0.24387646432374868
P@10:0.5282215122470714
P@50:0.7273695420660277
P@100:0.8072417465388712
Processing Time:0.7663583755493164
Infer Time:50.54681444168091

