In [None]:
!pip install -U sentence-transformers
!pip install evaluate

In [2]:
# Mount Google Drive; the CSV inputs and the model output directory used
# below all live under /content/drive.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import os 

from datetime import datetime
import sys
import csv


from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, InformationRetrievalEvaluator
from sentence_transformers.readers import InputExample
from sentence_transformers import util
import logging

import torch
from sklearn.model_selection import train_test_split
import evaluate 

import random
from tqdm import tqdm

In [4]:
# Load the flights search-results CSV and keep only the three columns that
# the rest of the notebook reads.
original = pd.read_csv('/content/drive/MyDrive/diploma_kd/data/flights.csv').loc[
    :, ['searchTerms', 'rank', 'snippet']
]

In [5]:
# Start from an empty mapping; it is filled per search term further down.
original_dict = {}
original_dict

{}

In [6]:
# Display the three-column frame (searchTerms, rank, snippet) loaded above.
original

Unnamed: 0,searchTerms,rank,snippet
0,flights to hong kong,1,Book Cheap Flights to Hong Kong: Search and co...
1,flights to hong kong,2,Find & book great deals on Hong Kong (HKG) Fli...
2,flights to hong kong,3,Looking for cheap flights to Hong Kong SAR fro...
3,flights to hong kong,4,"Find flights to Hong Kong on China Eastern, As..."
4,flights to hong kong,5,Find cheap flights to Hong Kong Intl (HKG) in ...
...,...,...,...
3995,tickets to nice,6,"OGC Nice. The line-ups are in, it's almost tim..."
3996,tickets to nice,7,Answer 1 of 15: I will be in Nice for about a ...
3997,tickets to nice,8,Travel by train from Nice to Milan in 4h 44m. ...
3998,tickets to nice,9,NFC Nice Ticket is a mobile application which ...


In [7]:
# Group snippets and scaled ranks by search term:
#   original_dict[query] == [[snippet, ...], [rank / 10.0, ...]]
# The dict is re-created here so that re-running this cell starts from an
# empty mapping instead of appending every row a second time.
original_dict = {}
for query_text, rank, snippet in zip(original['searchTerms'], original['rank'], original['snippet']):
  # setdefault returns the existing [snippets, scaled_ranks] pair, or installs a fresh one
  snippets, scaled_ranks = original_dict.setdefault(str(query_text), [[], []])
  snippets.append(str(snippet))
  scaled_ranks.append(float(rank) / 10.0)


# original_dict

In [8]:
# Load the sentence pairs (columns `first` / `second`) that are later split
# into train / validation / test, and display them.
samples = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/diploma_kd/data/train_sorted.csv')
samples

Unnamed: 0,ids,first,second
0,0,Book Cheap Flights to Hong Kong,Compare airfares on Tripadvisor
1,0,Search flights to Hong Kong,Find the best airline for your trip
2,0,Cheap flights to Hong Kong,Find the best airfare to Hong Kong
3,0,Cheapest airfares to Hong Kong,Find the best deals on flights to Hong Kong
4,0,Compare prices on flights to Hong Kong,Find the lowest airfare to Hong Kong
...,...,...,...
11220,3998,Métropole Nice Côte,NFC Nice Ticket Application
11221,3998,Mobile Application NFC Nice Ticket,Tickets Purchasing NFC Nice Ticket
11222,3998,Bus Travel Nice Métropole Côte,Tram Travel Nice Métropole Côte
11223,3998,Bus Travel Nice Métropole,Bus Subscriptions Nice Métropole


In [9]:
# --- Training hyper-parameters ---
num_epochs = 4
train_batch_size = 15

# Base checkpoint and a timestamped output directory on Drive.
model_name = 'bert-base-uncased'
model_save_path = '/content/drive/MyDrive/diploma_kd/output/training_stsbenchmark_{}-{}'.format(
    model_name.replace("/", "-"),
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
)

# Route INFO-level log records through sentence-transformers' LoggingHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

# Pick the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

device


device(type='cuda')

In [10]:
# Token-level encoder: the pretrained transformer named by model_name.
word_embedding_model = models.Transformer(model_name)

# Mean pooling only (CLS and max pooling switched off), which yields one
# fixed-size vector per sentence.
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Chain encoder and pooling into a single SentenceTransformer.
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
def get_cosinus_scores(model):
  """Score every stored snippet against its own search term.

  Walks the module-level ``original_dict`` in iteration order. For each key,
  the key itself and the snippet list stored at index 0 of its value are
  encoded with ``model.encode(..., convert_to_tensor=True)``, and the cosine
  similarities between the query and each snippet are collected.

  Args:
    model: object exposing ``encode`` as used above (a SentenceTransformer
      in this notebook).

  Returns:
    A flat list with one similarity value per snippet, grouped by query in
    ``original_dict`` order.
  """
  all_scores = []

  for search_term in tqdm(original_dict):
    snippets = original_dict[search_term][0]
    query_vec = model.encode(search_term, convert_to_tensor=True)
    snippet_vecs = model.encode(snippets, convert_to_tensor=True)
    # cos_sim yields a 1 x len(snippets) matrix; row 0 holds this query's scores
    similarities = util.cos_sim(query_vec, snippet_vecs)[0]
    all_scores.extend(similarities.cpu().numpy())

  return all_scores






In [14]:
# Query/snippet similarity scores from the model built above, before any fine-tuning.
cosinus_scores = get_cosinus_scores(model)

100%|██████████| 200/200 [00:17<00:00, 11.71it/s]


In [15]:
# Build the reference list by walking original_dict in the same order that
# get_cosinus_scores() does, so that score i and reference i always belong to
# the same (query, snippet) pair -- even when the rows of one query are not
# contiguous in `original`. The stored values are already float(rank) / 10.0.
# NOTE(review): rank / 10 grows as the search position gets worse (rank 1 -> 0.1),
# while nDCG references are usually relevance scores where larger is better --
# confirm this orientation is intended.
ground_truth = [
    scaled_rank
    for _snippets, scaled_ranks in original_dict.values()
    for scaled_rank in scaled_ranks
]

# NDCG ДО ОБУЧЕНИЯ (NDCG before training)

In [16]:
def get_ndcg(cosinus_scores, ground_truth):
  """Compute nDCG of the predicted scores against the reference scores.

  Loads the 'JP-SystemsX/nDCG' metric through ``evaluate.load`` on every call
  and evaluates it with both lists wrapped as a single sample.

  Args:
    cosinus_scores: predicted scores, one per item.
    ground_truth: reference scores, in the same order as ``cosinus_scores``.

  Returns:
    Whatever ``compute`` returns (a dict with an 'nDCG' entry in this notebook).
  """
  metric = evaluate.load('JP-SystemsX/nDCG')
  result = metric.compute(predictions=[cosinus_scores], references=[ground_truth])
  return result

In [17]:
# nDCG of the pre-training similarity scores against the rank-derived ground truth.
ndcg_before = get_ndcg(cosinus_scores, ground_truth)
ndcg_before

Downloading builder script:   0%|          | 0.00/5.22k [00:00<?, ?B/s]

{'nDCG': 0.9341260037194012}

In [18]:
logging.info("Read STSbenchmark train dataset")

# Hold out 20% as the test set, then take 25% of the remaining 80% for
# validation: 60% train / 20% validation / 20% test overall.
train_val, test = train_test_split(samples, test_size=0.2, random_state=42)
train, val = train_test_split(train_val, test_size=0.25, random_state=42)


In [19]:
# Display the training portion of the split.
train

Unnamed: 0,ids,first,second
2413,903,Cheap flights to Lima (LIM) in 2019,Skyscanner's flight comparison tool
5067,1874,Flights to Jeju City on Korean Air,Flights from San Jose to Jeju City on Korean Air
4956,1838,Flight booking to Guangxi (KWL),Book the cheapest online tickets to Guilin
6490,2370,Airlines flying to Johor Bahru,Cheapest Airliners
9055,3220,Direct flights London to Barcelona,Direct flights Manchester to Barcelona
...,...,...,...
3323,1249,Delta flights to Shanghai,Discover Delta flights to Shanghai
1661,624,cheapflights to Greece,cheapairfare to Greece
8368,3003,Ticket prices and seat availability change,Which Airlines have the best ticket prices and...
3295,1238,Cheapest Fare of Mumbai to Pattaya Flight,37781.0 from Mumbai to Pattaya Flight


In [20]:
def _pairs_to_examples(frame, rng):
  """Turn each (`first`, `second`) row of `frame` into an InputExample.

  Args:
    frame: DataFrame with `first` and `second` text columns.
    rng: a ``random.Random`` instance; every pair gets ``rng.uniform(0.9, 1.0)`` as label.

  Returns:
    List of InputExample objects, one per row, in row order.
  """
  return [
      InputExample(texts=[str(first), str(second)], label=rng.uniform(0.9, 1.0))
      for first, second in zip(frame['first'], frame['second'])
  ]


# A dedicated, seeded generator makes the labels identical on every
# Restart & Run All instead of depending on the global `random` state.
# NOTE(review): the labels are random numbers unrelated to the texts, so a correlation
# computed against them by an evaluator does not measure model quality -- confirm intent.
_label_rng = random.Random(42)

train_samples = _pairs_to_examples(train, _label_rng)
dev_samples = _pairs_to_examples(val, _label_rng)
test_samples = _pairs_to_examples(test, _label_rng)



In [21]:
# Shuffled mini-batches over the training examples.
train_dataloader = DataLoader(dataset=train_samples, batch_size=train_batch_size, shuffle=True)
# Cosine-similarity loss bound to the model that is being trained.
train_loss = losses.CosineSimilarityLoss(model)

In [23]:
logging.info("Read STSbenchmark dev dataset")
# Evaluator over the validation pairs; 'sts-dev' is the name its result files carry.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    examples=dev_samples,
    name='sts-dev',
)

In [24]:
# Warm-up length: 10% of all optimisation steps (batches per epoch x epochs), rounded up.
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))


In [25]:
# Fine-tune the model. The dev evaluator runs every 100 training steps and
# output_path receives the training artefacts.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=100,
    output_path=model_save_path,
)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/449 [00:00<?, ?it/s]

Iteration:   0%|          | 0/449 [00:00<?, ?it/s]

Iteration:   0%|          | 0/449 [00:00<?, ?it/s]

Iteration:   0%|          | 0/449 [00:00<?, ?it/s]

In [26]:
# Read back the evaluation log written for the 'sts-dev' evaluator during training.
dev_eval = pd.read_csv(f'{model_save_path}/eval/similarity_evaluation_sts-dev_results.csv')
dev_eval

Unnamed: 0,epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
0,0,100,0.028737,0.021444,0.022224,0.020592,0.021443,0.020114,0.037345,0.035492
1,0,200,0.026719,0.02279,0.021518,0.022357,0.021004,0.021871,0.027695,0.02383
2,0,300,0.026852,0.023517,0.020908,0.022161,0.020734,0.022099,0.035299,0.028933
3,0,400,0.02647,0.023617,0.019877,0.021723,0.019833,0.021651,0.040826,0.033905
4,0,-1,0.026091,0.02288,0.018861,0.02081,0.018551,0.0203,0.042817,0.036314
5,1,100,0.024485,0.022083,0.01655,0.019187,0.016161,0.018704,0.044376,0.038243
6,1,200,0.022314,0.020898,0.014647,0.017324,0.014333,0.017175,0.041595,0.035654
7,1,300,0.022273,0.020672,0.014814,0.016749,0.013882,0.015965,0.040371,0.034897
8,1,400,0.018649,0.016526,0.011816,0.013173,0.010998,0.012538,0.034724,0.030077
9,1,-1,0.017953,0.015563,0.010798,0.011985,0.009905,0.01148,0.034369,0.029274


In [27]:
# Reload the model stored under model_save_path and score it on the held-out test pairs.
model = SentenceTransformer(model_name_or_path=model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    examples=test_samples,
    name='sts-test',
)
test_evaluator(model, output_path=model_save_path)

0.028834361187265768

# NDCG ПОСЛЕ ОБУЧЕНИЯ (NDCG after training)

In [28]:
# Repeat the scoring with the fine-tuned model reloaded above and recompute nDCG
# against the same ground truth used for ndcg_before.
cos_scores_after = get_cosinus_scores(model)
ndcg_after = get_ndcg(cos_scores_after, ground_truth)
ndcg_after

100%|██████████| 200/200 [00:13<00:00, 15.32it/s]


{'nDCG': 0.9364612789500556}