In [2]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import pandas as pd
from tqdm import tqdm

from transformers import BertTokenizerFast, BertForSequenceClassification, get_linear_schedule_with_warmup, AdamW
from sklearn.model_selection import train_test_split
import torch
import os
import numpy as np
import random
from sklearn.metrics import accuracy_score
import json
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import CountVectorizer

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [5]:
model = torch.load('/content/drive/MyDrive/models_repeat/model_bert_best.pth', map_location='cpu')
raw_model = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(raw_model, do_lower_case=True)

In [6]:
from sentence_transformers import SentenceTransformer, models

## Step 1: use an existing language model
word_embedding_model = models.Transformer('distilroberta-base')

#word_embedding_model = models.Transformer('stsb-roberta-large')
## Step 2: use a pool function over the token embeddings
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

## Join steps 1 and 2 using the modules argument
model_cos = SentenceTransformer(modules=[word_embedding_model, pooling_model])

model_cos.load_state_dict(torch.load("/content/drive/MyDrive/models_repeat/model_state_cos_5_64", map_location=torch.device('cpu')))
#model_cos.eval()

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [7]:
model_cos = torch.load("/content/drive/MyDrive/models_repeat/model_cos_5_64", map_location=torch.device('cpu'))
#model_cos.eval()

In [8]:
def convert_to_dataset(data: pd.DataFrame) -> TensorDataset:
    input_ids = []
    attention_masks = []
    token_type_ids = []
    for _, row in tqdm(data.iterrows(), total=data.shape[0]):
        encoded_dict = tokenizer.encode_plus(row["query"], row["name"], max_length=300,
                                             pad_to_max_length=True,
                                             return_attention_mask=True, return_tensors='pt', truncation=True)
        input_ids.append(encoded_dict['input_ids'])
        token_type_ids.append(encoded_dict["token_type_ids"])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    token_type_ids = torch.cat(token_type_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    input_ids.to(dtype=torch.long)
    token_type_ids.to(dtype=torch.long)
    attention_masks.to(dtype=torch.long)

    return TensorDataset(input_ids, attention_masks, token_type_ids)

In [9]:
def infer(dataloader, model):
    embs = []

    for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
        input_ids, attention_masks, token_type_ids = batch

        input_ids = input_ids.to(dtype=torch.long)
        token_type_ids = token_type_ids.to(dtype=torch.long)
        attention_masks = attention_masks.to(dtype=torch.long)
        with torch.no_grad():
            m = (model(input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=attention_masks)).logits
            embs.append(torch.nn.Softmax()(m))
    return embs

In [10]:
df = pd.read_csv("/content/drive/MyDrive/train (2).csv")

In [15]:
all_comp = {}
for i, row in df.iterrows():
  all_comp[row["name_1"]] = {"emb": None}
  all_comp[row["name_2"]] = {"emb": None}

print(len(all_comp))
json.dump(all_comp, open("/content/drive/MyDrive/all_comp.json", "w"), indent=4)

18022


In [16]:
all_comp = json.load(open("/content/drive/MyDrive/all_comp.json"))

for sent, d in tqdm(all_comp.items(), total=len(all_comp)):
  emm = model_cos.encode([sent])
  all_comp[sent] = {"emb": emm[0]}

print(len(all_comp))
#json.dump(all_comp, open("/content/drive/MyDrive/all_comp.json", "w"), indent=4)

100%|██████████| 18022/18022 [02:30<00:00, 120.07it/s]

18022





In [36]:
def mesure_cos(target_company, all_comp_emb, n=10):
  target_emb = model_cos.encode(target_company)[0]
  output = pd.DataFrame(columns=['company_name', 'sim_score'])

  comp_labels = []
  comp_emb = []
  for comp, emd_dict in all_comp_emb.items():
      comp_labels.append(comp)
      comp_emb.append(emd_dict["emb"])
  
  out = cosine_similarity([target_emb], comp_emb)
  for i, score in enumerate(out[0]):
    output.loc[len(output)] = [comp_labels[i], score]
  
  output = output.sort_values(by=['sim_score'], ascending=False)

  print(output.head(10))

  best = output.iloc[:n+1]
  top_n = []
  for i, b in best.iterrows():
    top_n.append(b["company_name"])

  return top_n

In [None]:
# sentence_embeddings = model_cos.encode(sentences)
# cosine_similarity(
#     [sentence_embeddings[0]],
#     sentence_embeddings[1:]
# )

In [17]:
target_company = ["API"]

In [None]:
all_comp = json.load(open("/content/drive/MyDrive/all_comp.json")

In [37]:
output = mesure_cos(target_company, all_comp)


                                           company_name  sim_score
5651                                                API   1.000000
4996                                        Trinseo API   0.811521
4865                                             A.P.I.   0.793274
2343      A.P.I. Applicazioni Plastiche Industriali SPA   0.689926
6341                                    ASIA PHARM PACK   0.670014
16168                                   SIA"INTER SPED"   0.665998
9906                           Active Print & Promotion   0.663481
765                                                 APS   0.659733
6574   A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.   0.656669
6380                                Performance Machine   0.641626


In [38]:
output

['API',
 'Trinseo API',
 'A.P.I.',
 'A.P.I. Applicazioni Plastiche Industriali SPA',
 'ASIA PHARM PACK',
 'SIA"INTER SPED"',
 'Active Print & Promotion',
 'APS',
 'A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.',
 'Performance Machine',
 'Active Screw & Fastener']

In [41]:
target_company = [target_company[0] for i in range(len(output))]

In [42]:
df = pd.DataFrame({'query': target_company,
                   'name': output})

In [44]:
test = convert_to_dataset(df)
test_dataloader = DataLoader(test, sampler=SequentialSampler(test), batch_size=1)
emn = infer(test_dataloader, model)
ems = [i.detach().cpu().numpy() for i in emn]
em = [(ems[i][0][1]) for i, k in enumerate(ems)]
df['is_duplicate_score'] = pd.Series(em)
df = df.sort_values('is_duplicate_score', ascending=False)
print(df)

100%|██████████| 11/11 [00:00<00:00, 1247.36it/s]
  
Evaluating: 100%|██████████| 11/11 [00:13<00:00,  1.22s/batch]

   query                                              name  is_duplicate_score
1    API                                       Trinseo API            0.999756
0    API                                               API            0.999727
3    API     A.P.I. Applicazioni Plastiche Industriali SPA            0.995598
8    API  A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.            0.994212
2    API                                            A.P.I.            0.992988
5    API                                   SIA"INTER SPED"            0.038426
10   API                           Active Screw & Fastener            0.000190
7    API                                               APS            0.000081
9    API                               Performance Machine            0.000077
6    API                          Active Print & Promotion            0.000045
4    API                                   ASIA PHARM PACK            0.000026



