# Entity - Relation Extraction

In [1]:
!pip install transformers[torch] sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cufft_cu12

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [3]:
import pandas as pd

In [4]:
data = pd.read_csv('/content/drive/MyDrive/ml-workshop-2025/stock_headlines.csv')

In [5]:
data['Sentence']

Unnamed: 0,Sentence
0,"According to Gran , the company has no plans t..."
1,"For the last quarter of 2010 , Componenta 's n..."
2,"In the third quarter of 2010 , net sales incre..."
3,Operating profit rose to EUR 13.1 mn from EUR ...
4,"Operating profit totalled EUR 21.1 mn , up fro..."
...,...
108746,Philippines president Rodrigo Duterte urges pe...
108747,Spain arrests three Pakistanis accused of prom...
108748,"Venezuela, where anger over food shortages is ..."
108749,A Hindu temple worker has been killed by three...


In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
def extract_triplets_typed(text):
    triplets = []
    relation = ''
    text = text.strip()
    current = 'x'
    subject, relation, object_, object_type, subject_type = '','','','',''

    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").replace("tp_XX", "").replace("__en__", "").split():
        if token == "<triplet>" or token == "<relation>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'head_type': subject_type, 'type': relation.strip(),'tail': object_.strip(), 'tail_type': object_type})
                relation = ''
            subject = ''
        elif token.startswith("<") and token.endswith(">"):
            if current == 't' or current == 'o':
                current = 's'
                if relation != '':
                    triplets.append({'head': subject.strip(), 'head_type': subject_type, 'type': relation.strip(),'tail': object_.strip(), 'tail_type': object_type})
                object_ = ''
                subject_type = token[1:-1]
            else:
                current = 'o'
                object_type = token[1:-1]
                relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '' and object_type != '' and subject_type != '':
        triplets.append({'head': subject.strip(), 'head_type': subject_type, 'type': relation.strip(),'tail': object_.strip(), 'tail_type': object_type})
    return triplets

In [8]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Babelscape/mrebel-large", src_lang="en_XX", tgt_lang="tp_XX")
# Here we set English ("en_XX") as source language. To change the source language swap the first token of the input for your desired language or change to supported language. For catalan ("ca_XX") or greek ("el_EL") (not included in mBART pretraining) you need a workaround:
# tokenizer._src_lang = "ca_XX"
# tokenizer.cur_lang_code_id = tokenizer.convert_tokens_to_ids("ca_XX")
# tokenizer.set_src_lang_special_tokens("ca_XX")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/mrebel-large")
gen_kwargs = {
    "max_length": 256,
    "length_penalty": 0,
    "num_beams": 3,
    "num_return_sequences": 3,
    "forced_bos_token_id": None,
}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/307 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

In [9]:
model.to(device)

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250071, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250071, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=

In [10]:
# Text to extract triplets from
text = 'The Red Hot Chili Peppers were formed in Los Angeles by Kiedis, Flea, guitarist Hillel Slovak and drummer Jack Irons.'

# Tokenizer text
model_inputs = tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors = 'pt')

# Generate
generated_tokens = model.generate(
    model_inputs["input_ids"].to(model.device),
    attention_mask=model_inputs["attention_mask"].to(model.device),
    decoder_start_token_id = tokenizer.convert_tokens_to_ids("tp_XX"),
    **gen_kwargs,
)

# Extract text
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

In [11]:
decoded_preds

['tp_XX<triplet> Hillel Slovak <per> Red Hot Chili Peppers <org> part of <triplet> Jack Irons <per> Red Hot Chili Peppers <org> part of</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 'tp_XX<triplet> Kiedis <per> Red Hot Chili Peppers <org> part of <triplet> Flea <per> Red Hot Chili Peppers <org> part of <triplet> Hillel Slovak <per> Red Hot Chili Peppers <org> part of <triplet> Jack Irons <per> Red Hot Chili Peppers <org> part of</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 'tp_XX<triplet> Red Hot Chili Peppers <org> Los Angeles <loc> location of formation <triplet> Kiedis <per> Red Hot Chili Peppers <org> part of <triplet> Flea <per> Red Hot Chili Peppers <org> part of <triplet> Hillel Slovak <per> Red Hot Chili Peppers <org> part of <triplet> Jack Iro

In [12]:
result = []
for idx, sentence in enumerate(decoded_preds):
    result.extend(extract_triplets_typed(sentence))

In [13]:
result

[{'head': 'Hillel Slovak',
  'head_type': 'per',
  'type': 'part of',
  'tail': 'Red Hot Chili Peppers',
  'tail_type': 'org'},
 {'head': 'Jack Irons',
  'head_type': 'per',
  'type': 'part of',
  'tail': 'Red Hot Chili Peppers',
  'tail_type': 'org'},
 {'head': 'Kiedis',
  'head_type': 'per',
  'type': 'part of',
  'tail': 'Red Hot Chili Peppers',
  'tail_type': 'org'},
 {'head': 'Flea',
  'head_type': 'per',
  'type': 'part of',
  'tail': 'Red Hot Chili Peppers',
  'tail_type': 'org'},
 {'head': 'Hillel Slovak',
  'head_type': 'per',
  'type': 'part of',
  'tail': 'Red Hot Chili Peppers',
  'tail_type': 'org'},
 {'head': 'Jack Irons',
  'head_type': 'per',
  'type': 'part of',
  'tail': 'Red Hot Chili Peppers',
  'tail_type': 'org'},
 {'head': 'Red Hot Chili Peppers',
  'head_type': 'org',
  'type': 'location of formation',
  'tail': 'Los Angeles',
  'tail_type': 'loc'},
 {'head': 'Kiedis',
  'head_type': 'per',
  'type': 'part of',
  'tail': 'Red Hot Chili Peppers',
  'tail_type': '

In [14]:
kg = []

In [15]:
from tqdm import tqdm

In [16]:
def tokenize_and_extract_relations(batch):
  # Tokenizer text
  model_inputs = tokenizer(batch, max_length=256, padding=True, truncation=True, return_tensors = 'pt')

  # Generate
  generated_tokens = model.generate(
    model_inputs["input_ids"].to(model.device),
    attention_mask=model_inputs["attention_mask"].to(model.device),
    decoder_start_token_id = tokenizer.convert_tokens_to_ids("tp_XX"),
    **gen_kwargs,
  )

  # Extract text
  decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)
  result = []
  for idx, sentence in enumerate(decoded_preds):
    result.extend(extract_triplets_typed(sentence))
  return result

In [17]:
sentences = list(data['Sentence'].values)

In [18]:
sentences

['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ',
 "For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m ",
 'In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn ',
 'Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing 7.7 % of net sales ',
 'Operating profit totalled EUR 21.1 mn , up from EUR 18.6 mn in 2007 , representing 9.7 % of net sales ',
 'Finnish Talentum reports its operating profit increased to EUR 20.5 mn in 2005 from EUR 9.3 mn in 2004 , and net sales totaled EUR 103.3 mn , up from EUR 96.4 mn ',
 "Clothing retail chain Sepp+ñl+ñ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004

In [19]:
kg = []

In [20]:
for i in tqdm(range(0, 1000, 32)):
  kg.extend(tokenize_and_extract_relations(sentences[i:i+32]))

100%|██████████| 32/32 [01:58<00:00,  3.71s/it]


In [21]:
knowledge_graph = pd.DataFrame(kg)

In [22]:
knowledge_graph

Unnamed: 0,head,head_type,type,tail,tail_type
0,Gran,org,country,Russia,loc
1,Russia,loc,item operated,Gran,org
2,Russia,loc,subsidiary,Gran,org
3,2010,time,point in time,2010,date
4,2010,concept,point in time,2010,date
...,...,...,...,...,...
3497,Video and Broadband Solutions,org,owned by,Teleste,org
3498,Teleste,org,industry,Video and Broadband Solutions,concept
3499,2008,time,point in time,2008,date
3500,financial statement for 2008,concept,point in time,2008,date


In [23]:
knowledge_graph = knowledge_graph.drop_duplicates(subset=['head', 'type', 'tail'], keep='first')

In [24]:
knowledge_graph

Unnamed: 0,head,head_type,type,tail,tail_type
0,Gran,org,country,Russia,loc
1,Russia,loc,item operated,Gran,org
2,Russia,loc,subsidiary,Gran,org
3,2010,time,point in time,2010,date
6,third quarter of 2010,time,point in time,2010,date
...,...,...,...,...,...
3495,ahlstrom.com,concept,instance of,website,concept
3496,Teleste,org,subsidiary,Video and Broadband Solutions,org
3497,Video and Broadband Solutions,org,owned by,Teleste,org
3498,Teleste,org,industry,Video and Broadband Solutions,concept


## Entity Resolution

In [25]:
from sentence_transformers import SentenceTransformer, util

In [26]:
def resolve_entities_by_text_embeddings(entities, embeddings):
  entity_mapping = {}
  for i in range(len(entities)):
    e1 = entities[i]
    for j in range(i + 1, len(entities)):
      e2 = entities[j]

      if e1 == e2:
        continue

      similarity = util.cos_sim(embeddings[i], embeddings[j])

      if similarity > 0.95:
        entity_mapping[entities[j]] = entities[i]
        entities[j] = entities[i]
        embeddings[j] = embeddings[i]

  return entity_mapping

In [27]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [28]:
embedding_model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [29]:
entities = list(set(knowledge_graph['head'].values + knowledge_graph['tail'].values))

In [30]:
len(entities)

1984

In [31]:
embeddings = embedding_model.encode(entities)

In [32]:
entities_resolution_dict = resolve_entities_by_text_embeddings(entities, embeddings)

In [33]:
entities_resolution_dict

{'Elcoteq SENasdaq OMX Helsinki Ltd.': 'ElcoteqNasdaq OMX Helsinki Ltd.',
 'EUR 0.27EUR 0.6': 'EUR 0.27EUR 0.186',
 'aviation fuelhydrocarbon': 'aviation fuelhydrocarbons',
 'aviation fuelshydrocarbons': 'aviation fuelhydrocarbons',
 'Okmetic Board of Directorsshare ownership': 'Okmetic Board of Directorsshare ownership program',
 'Currency conversionsexchange rate': 'Currency conversionsexchange rates',
 'Currency conversionexchange rate': 'Currency conversionsexchange rates',
 'agreementagreement': 'The agreementagreement',
 'active plaque psoriasisantibody': 'plaque psoriasisantibody',
 'euro735 millioneuro742.2 million': 'euro735 millioneuro742.2',
 'operating profitoperating': 'operating profitoperating profit',
 'passenger carscommercial vehicle': 'passenger carscommercial vehicles',
 'corresponding period of 20092009': 'corresponding period in 20092009',
 'winderMetso': 'windersMetso',
 'pulse rateblood-oxygen saturation': 'pulse rateblood-oxygen saturation level',
 '20082007': 

In [34]:
knowledge_graph = knowledge_graph.replace({"head": entities_resolution_dict, "tail": entities_resolution_dict})

In [35]:
knowledge_graph = knowledge_graph.drop_duplicates(subset=['head', 'type', 'tail'], keep='first')

In [36]:
import networkx as nx

In [37]:
graph = nx.Graph()

In [38]:
for i, row in knowledge_graph.iterrows():
  graph.add_edge(row['head'], row['tail'], label=row['type'])

In [39]:
components = [c for c in nx.connected_components(graph)]

In [40]:
len(components)

308

In [41]:
components[0]

{'set of figures',
 'Lahti Precision',
 'Q2 of 2008',
 'long-term partnership',
 'Iso Omena',
 'Eila Paatela',
 "Aspo 's Systems",
 'Teleste Oyj',
 "Finland 's Stockmann Plc",
 'bridge',
 'Major Order in India Comptel Corporation',
 'third generation mobile technology',
 'Q2 2009',
 'Ameriprice Inc.',
 'Le Lay',
 'N71',
 'ADPnews',
 'CapMan Technology',
 'Nordea',
 'steel components heating',
 'Swedish business',
 'Krister Kylas',
 'Ponsse Oyj HEL',
 'Aspocomp',
 'EUR11m',
 'This organization',
 'signed last summer',
 'Savon koulutuskuntayhtyma',
 'Cargotec',
 'EUR2m',
 'Net sales in 2007',
 'collection errors',
 'E70',
 'France T+®l+®com',
 'quarter',
 'state',
 'Forssa',
 'last year',
 'Raisio',
 'Componenta Oyj HEL',
 'Alandsbanken',
 'Lithuanian transport administration',
 'Elisa Oyj',
 'fibre-coaxial',
 'Orders received',
 'St Petersburg region',
 'product sales',
 'sawmill',
 'Okmetic',
 'Korean',
 'Raute Oyj',
 'newsprint',
 'reporting period',
 'Annual Report',
 'Defense Credit

In [42]:
subgraph = graph.subgraph(components[0])

In [43]:
len(subgraph)

1019

In [44]:
import matplotlib.pyplot as plt

In [45]:
layout = nx.spring_layout(subgraph, seed=42, k=0.9)
edge_labels = nx.get_edge_attributes(subgraph, 'label')
plt.figure(figsize=(15, 15))
nx.draw(subgraph, layout, with_labels=True, font_size=10, node_color='lightblue')
nx.draw_networkx_edge_labels(subgraph, layout, edge_labels=edge_labels, font_size=8)
plt.title('Knowledge Graph')
plt.show()

Output hidden; open in https://colab.research.google.com to view.

In [46]:
knowledge_graph['head_type'].unique()

array(['org', 'loc', 'time', 'concept', 'eve', 'media', 'per', 'misc', ''],
      dtype=object)

In [47]:
knowledge_graph['tail_type'].unique()

array(['loc', 'org', 'date', 'concept', 'eve', 'media', 'misc', 'per',
       'num', 'time'], dtype=object)

In [48]:
knowledge_graph['type'].unique()

array(['country', 'item operated', 'subsidiary', 'point in time',
       'inception', 'manufacturer', 'industry', 'subclass of', 'facet of',
       'opposite of', 'instance of', 'owned by', 'follows', 'currency',
       'replaces', 'part of', 'measured physical quantity', 'start time',
       'publisher', 'author', 'educated at', 'country of citizenship',
       'country of origin', 'headquarters location', 'brand',
       'diplomatic relation', 'studies', 'end time',
       'chief executive officer', 'employer', 'director / manager',
       'unemployment rate', 'elevation above sea level',
       'number of parts of this work', 'duration', 'developer',
       'stock exchange', 'occupation', 'position held', 'model item',
       'operating area', 'business division', 'stock market index',
       'location', 'located in the administrative territorial entity',
       'founded by', 'use', 'residence', 'chairperson',
       'dissolved, abolished or demolished date',
       'date of officia