# Mining Relations

### Use Sentence-BERT to get similar entities

In [1]:
# Install sentence_transformers
!pip install -qU transformers sentence-transformers

[K     |████████████████████████████████| 4.4 MB 27.2 MB/s 
[K     |████████████████████████████████| 85 kB 2.4 MB/s 
[K     |████████████████████████████████| 101 kB 11.8 MB/s 
[K     |████████████████████████████████| 596 kB 58.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 61.6 MB/s 
[K     |████████████████████████████████| 1.2 MB 39.3 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [2]:
from sentence_transformers import SentenceTransformer, util

In [3]:
# Load a pre-trained model
model = SentenceTransformer('nli-distilroberta-base-v2')

Downloading:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/679 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [4]:
# Use the model to find pair similarities
def find_similarity(model, term1, term2):
  # Compute embedding for both lists
  embeddings1 = model.encode([term1], convert_to_tensor=True)
  embeddings2 = model.encode([term2], convert_to_tensor=True)

  # Compute cosine-similarits
  cosine_score = util.pytorch_cos_sim(embeddings1, embeddings2).item()

  print(cosine_score)


In [5]:
find_similarity(model,"software developer", "software engineer")
find_similarity(model,"database designer", "data architect")
find_similarity(model,"welder", "solderer")


0.8500813245773315
0.6649540662765503
0.42287370562553406


In [18]:
   # Use the model to find similar pairs
   def find_top_similar(terms, model):
    paraphrases = util.paraphrase_mining(model, terms, show_progress_bar = True)
    for paraphrase in paraphrases[0:1000]:
      score, i, j = paraphrase
      print("{} \t\t {} \t\t Score: {:.4f}".format(terms[i], terms[j], score))

    return paraphrases

# New Section

In [7]:
find_top_similar(["data science", "data engineer", "welding", "soldering"],model)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

data science 		 data engineer 		 Score: 0.5785
welding 		 soldering 		 Score: 0.3617
data science 		 welding 		 Score: 0.2584
data engineer 		 welding 		 Score: 0.2475
data engineer 		 soldering 		 Score: 0.2307
data science 		 soldering 		 Score: 0.1960


[[0.5785418152809143, 0, 1],
 [0.36172375082969666, 2, 3],
 [0.258419394493103, 0, 2],
 [0.24753285944461823, 1, 2],
 [0.23072804510593414, 1, 3],
 [0.1959957778453827, 0, 3]]

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [27]:
# Read ESCO lexicalization data and split them into training, validation and test (file is available at https://drive.google.com/file/d/1qFrfabIYE_s5FFOyfpUAGhRjvC3JjGLK/view?usp=sharing)

import random
import csv
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryClassificationEvaluator


esco_dataset_path = "/content/drive/MyDrive/esco_occupations_synonyms_data.tsv"

input_examples = []

csv_file = open(esco_dataset_path)
reader = csv.DictReader(csv_file, delimiter='\t', quoting=csv.QUOTE_NONE)
next(reader)
for row in reader:
  inp_example = InputExample(texts=[row['term1'], row['term2']], label=float(row["score"]))
  input_examples.append(inp_example)

random.shuffle(input_examples)

train_examples = input_examples[:int(0.6*len(input_examples))]
validation_examples = input_examples[int(0.6*len(input_examples)):int(0.8*len(input_examples))]
test_examples = input_examples[int(0.8*len(input_examples)):]


# Evaluate pre-trained model on the test data


In [33]:
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples, name='esco-test')
print(test_evaluator(model))


0.9926628385702552


### Adapt Sentence-BERT for specific relation with custom training examples

In [None]:
model_save_path = "/content/output/esco_occupations_adapted_training-nli-distilroberta-base-v2"

In [23]:
from torch.utils.data import DataLoader
import math

train_batch_size = 16
num_epochs = 4

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


# Development set: Measure correlation between cosine score and gold labels
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(validation_examples, name='sts-dev')


# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up

# Load the model to adapt
adapted_model = SentenceTransformer('nli-distilroberta-base-v2')

# Train the model
adapted_model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps)#,
          #output_path=model_save_path)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/82 [00:00<?, ?it/s]

Iteration:   0%|          | 0/82 [00:00<?, ?it/s]

Iteration:   0%|          | 0/82 [00:00<?, ?it/s]

Iteration:   0%|          | 0/82 [00:00<?, ?it/s]

In [24]:
find_similarity(adapted_model, "software developer", "software engineer")
find_similarity(adapted_model, "database designer", "data architect")
find_similarity(adapted_model, "welder", "solderer")

0.8500813245773315
0.6649540662765503
0.42287370562553406


In [34]:
test_evaluator(adapted_model)

0.965231619393227

In [19]:
find_top_similar(["data science", "data engineer", "welding", "soldering"],adapted_model)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

data science 		 data engineer 		 Score: 0.5785
welding 		 soldering 		 Score: 0.3617
data science 		 welding 		 Score: 0.2584
data engineer 		 welding 		 Score: 0.2475
data engineer 		 soldering 		 Score: 0.2307
data science 		 soldering 		 Score: 0.1960


[[0.5785418152809143, 0, 1],
 [0.36172375082969666, 2, 3],
 [0.258419394493103, 0, 2],
 [0.24753285944461823, 1, 2],
 [0.23072804510593414, 1, 3],
 [0.1959957778453827, 0, 3]]