In [1]:
import pandas as pd
from transformers import BertTokenizerFast, BertForSequenceClassification, get_linear_schedule_with_warmup, AdamW
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm 
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import os
import numpy as np
import random
from sklearn.metrics import accuracy_score, precision_recall_curve, classification_report, f1_score, precision_score, recall_score
import re
import string

In [2]:
df = pd.read_csv("../train.csv") # Insert the link (path) to the training data on Google Drive here.

In [3]:
# Class balance of the target column.
df["is_duplicate"].value_counts()

0    494161
1      3658
Name: is_duplicate, dtype: int64

In [4]:
# Down-sample the majority (is_duplicate == 0) class to 25000 rows.
# A fixed random_state keeps the sample reproducible: seed_everything() is only
# called in a later cell, so the global NumPy generator is still unseeded here.
df0 = df[df['is_duplicate'] == 0].sample(25000, random_state=1234)

In [5]:
# Keep every row of the minority (is_duplicate == 1) class.
is_dup = df['is_duplicate'] == 1
df1 = df[is_dup]

In [6]:
# Stack the down-sampled negatives on top of all positives (row-wise concat).
dfx = pd.concat([df0, df1])

In [7]:
def seed_everything(seed=1234):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour with autotuning disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only
    # affects Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run; switch it off.
    torch.backends.cudnn.benchmark = False

In [8]:
# Seed the RNGs with the default seed (1234). Only code executed from this
# cell onward is affected; the cells above have already run.
seed_everything()

In [None]:
# Name of the pretrained checkpoint shared by the tokenizer and the model.
raw_model = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(raw_model, do_lower_case=True)
# Two output labels: is_duplicate == 0 / is_duplicate == 1.
# NOTE(review): output_hidden_states=True makes each forward pass return all
# hidden states; the cells visible here only read .loss and .logits — confirm
# whether the hidden states are needed.
model = BertForSequenceClassification.from_pretrained(
    raw_model, 
    num_labels=2, 
    output_attentions=False,
    output_hidden_states=True, 
)

In [10]:
def convert_to_dataset_torch(data: pd.DataFrame, labels: pd.Series) -> TensorDataset:
    """Tokenize name pairs and bundle them with their labels into a ``TensorDataset``.

    Each row's ``name_1`` / ``name_2`` values are encoded as one sentence pair
    by the module-level ``tokenizer``, padded or truncated to 300 tokens.

    Args:
        data: Frame with the columns ``name_1`` and ``name_2``.
        labels: Target values, positionally aligned with the rows of ``data``.

    Returns:
        A ``TensorDataset`` whose items are
        ``(input_ids, attention_mask, token_type_ids, label)``, all ``torch.long``.
    """
    input_ids = []
    attention_masks = []
    token_type_ids = []
    # Iterate over the two columns directly; iterrows() builds a Series per row.
    name_pairs = zip(data["name_1"], data["name_2"])
    for name_1, name_2 in tqdm(name_pairs, total=data.shape[0]):
        encoded_dict = tokenizer.encode_plus(name_1, name_2, max_length=300,
                                             padding='max_length',  # replaces deprecated pad_to_max_length=True
                                             return_attention_mask=True, return_tensors='pt', truncation=True)
        input_ids.append(encoded_dict['input_ids'])
        token_type_ids.append(encoded_dict["token_type_ids"])
        attention_masks.append(encoded_dict['attention_mask'])

    # Tensor.to() is not in-place: the converted tensor must be re-bound,
    # otherwise the dtype conversion is silently discarded.
    input_ids = torch.cat(input_ids, dim=0).to(dtype=torch.long)
    token_type_ids = torch.cat(token_type_ids, dim=0).to(dtype=torch.long)
    attention_masks = torch.cat(attention_masks, dim=0).to(dtype=torch.long)
    labels = torch.tensor(labels.values).to(dtype=torch.long)

    return TensorDataset(input_ids, attention_masks, token_type_ids, labels)

In [11]:
# Stratified 90/10 split of the balanced sample: both parts keep the same
# share of duplicates.
X_train, X_validation, y_train, y_validation = train_test_split(
    dfx[["name_1", "name_2"]],
    dfx["is_duplicate"],
    stratify=dfx["is_duplicate"],
    test_size=0.1,
    random_state=45,
)

In [None]:
# Tokenize both splits into TensorDatasets.
# NOTE(review): the name `train` is re-bound by `def train(...)` further down,
# so re-running the DataLoader cell after that definition would pass the
# function instead of this dataset.
train = convert_to_dataset_torch(X_train, y_train)
validation = convert_to_dataset_torch(X_validation, y_validation)

In [13]:
# Mini-batch size used by the training and validation DataLoaders.
batch_size = 8

In [14]:
# Training batches: random order, trailing partial batch dropped.
train_dataloader = DataLoader(
    train,
    batch_size=batch_size,
    sampler=RandomSampler(train),
    drop_last=True,
    num_workers=0,
)

# Validation batches: original order; the trailing partial batch is dropped
# too, so up to batch_size - 1 validation rows are not evaluated.
validation_dataloader = DataLoader(
    validation,
    batch_size=batch_size,
    sampler=SequentialSampler(validation),
    drop_last=True,
    num_workers=0,
)

In [None]:
# AdamW here is the class imported from `transformers` in the import cell.
# NOTE(review): newer transformers releases point users to torch.optim.AdamW
# instead — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)

In [16]:
# Number of passes over the training set.
epochs = 3
# Total number of optimizer steps: one per training batch per epoch.
total_steps = len(train_dataloader) * epochs

In [17]:
# Learning-rate schedule without warm-up, spanning all training steps.
# fit_batch() reads this module-level object and steps it once per batch.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

In [18]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs on a machine without a GPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# Move the model parameters to the selected device before training.
model.to(DEVICE)

In [20]:
def fit_batch(dataloader, model, optimizer, epoch):
    """Run one training epoch over ``dataloader``.

    For each batch: forward pass with labels, backward pass, gradient clipping
    to a max norm of 1.0, optimizer step and scheduler step. Relies on the
    module-level ``DEVICE`` and ``scheduler`` objects.

    Args:
        dataloader: Yields ``(input_ids, attention_masks, token_type_ids, labels)``.
        model: Model whose output exposes a ``.loss`` attribute when given labels.
        optimizer: Optimizer updating ``model``'s parameters.
        epoch: Epoch index, used only for the progress-bar caption.

    Returns:
        Sum of the per-batch losses as a Python float.
    """
    total_train_loss = 0.0

    for batch in tqdm(dataloader, desc=f"Training epoch:{epoch}", unit="batch"):
        input_ids, attention_masks, token_type_ids, labels = batch

        input_ids = input_ids.to(DEVICE)
        token_type_ids = token_type_ids.to(DEVICE)
        attention_masks = attention_masks.to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()

        loss = model(input_ids=input_ids,
                     token_type_ids=token_type_ids,
                     attention_mask=attention_masks,
                     labels=labels).loss

        # .item() yields a plain float; adding the tensor itself would keep a
        # graph-attached device tensor alive across the whole epoch.
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    return total_train_loss

In [21]:
def eval_batch(dataloader, model, metric=accuracy_score):
    """Evaluate ``model`` on every batch of ``dataloader`` without gradients.

    The function does not change the model's train/eval mode; the caller is
    expected to call ``model.eval()`` first. Relies on the module-level
    ``DEVICE``.

    Args:
        dataloader: Yields ``(input_ids, attention_masks, token_type_ids, labels)``.
        model: Model whose output exposes ``.loss`` and ``.logits``.
        metric: Callable ``metric(y_true, y_pred)`` applied to each batch.

    Returns:
        Tuple ``(total_eval_accuracy, total_eval_loss, predictions, predicted_labels)``:
        the sum of the per-batch metric values, the sum of the per-batch losses
        (Python float), the raw logits as nested lists, and the argmax labels.
    """
    total_eval_accuracy = 0
    total_eval_loss = 0.0
    predictions, predicted_labels = [], []
    for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
        input_ids, attention_masks, token_type_ids, labels = batch

        input_ids = input_ids.to(DEVICE)
        token_type_ids = token_type_ids.to(DEVICE)
        attention_masks = attention_masks.to(DEVICE)
        labels = labels.to(DEVICE)
        with torch.no_grad():
            m = model(input_ids,
                      token_type_ids=token_type_ids,
                      attention_mask=attention_masks,
                      labels=labels)
        # Accumulate a float rather than a device tensor.
        total_eval_loss += m.loss.item()

        # Move the logits to the host once and reuse them below.
        logits = m.logits.detach().cpu()
        y_pred = np.argmax(logits.numpy(), axis=1).flatten()
        total_eval_accuracy += metric(labels.cpu(), y_pred)

        predictions.extend(logits.tolist())
        predicted_labels.extend(y_pred.tolist())
    return total_eval_accuracy, total_eval_loss, predictions, predicted_labels


In [22]:
def train(train_dataloader, validation_dataloader, model, optimizer, epochs):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    After every epoch the whole model object is saved to ``model.pth``
    (overwriting the previous file) and the averaged metrics are printed.

    Args:
        train_dataloader: Batches passed to ``fit_batch``.
        validation_dataloader: Batches passed to ``eval_batch``.
        model: Model to optimise.
        optimizer: Optimizer passed through to ``fit_batch``.
        epochs: Number of epochs to run.

    Returns:
        List with one dict per epoch holding the keys ``'epoch'``,
        ``'Training Loss'``, ``'Valid. Loss'`` and ``'Valid. Accur.'``.
    """
    training_stats = []

    for epoch in range(epochs):
        model.train()
        total_train_loss = fit_batch(train_dataloader, model, optimizer, epoch)

        # float() accepts both a 0-dim tensor and a number, so the statistics
        # are plain Python floats rather than graph-attached device tensors.
        avg_train_loss = float(total_train_loss) / len(train_dataloader)
        print(f"  Train Loss: {avg_train_loss}")

        model.eval()
        total_eval_accuracy, total_eval_loss, _, _ = eval_batch(validation_dataloader, model)
        FILE = 'model.pth'
        torch.save(model, FILE)

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print(f"  Accuracy: {avg_val_accuracy}")

        avg_val_loss = float(total_eval_loss) / len(validation_dataloader)
        print(f"  Validation Loss: {avg_val_loss}")

        training_stats.append(
            {
                'epoch': epoch,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
            }
        )

    print("")
    print("Training complete!")
    return training_stats

In [None]:
# Fine-tune the model; returns one dict of averaged metrics per epoch.
training_stats = train(train_dataloader, validation_dataloader, model, optimizer, epochs)

In [None]:
# Insert the link (path) to the saved BERT model on Google Drive here.
# NOTE: torch.load unpickles the file, which can execute arbitrary code —
# only load checkpoints that come from a trusted source.
# map_location puts the weights on the configured device regardless of the
# device they were saved from.
model = torch.load('../modelnew.pth', map_location=DEVICE)
# A pickled module keeps the train/eval mode it was saved in; force evaluation
# mode so dropout is disabled for the evaluation cells below.
model.eval();

Validation on full set

In [24]:
# Stratified 90/10 split of the FULL (unbalanced) frame; overwrites the
# X_train / X_validation / y_train / y_validation produced by the earlier split.
# NOTE(review): this split uses a different frame and random_state than the one
# used for training, so rows the model was trained on can end up in this
# validation part — presumably inflating the scores below; confirm.
X_train, X_validation, y_train, y_validation = train_test_split(df[["name_1", "name_2"]],
                                                    df["is_duplicate"], test_size=0.1, random_state=34, stratify=df["is_duplicate"])

In [None]:
# Tokenize the full-set validation split (replaces the earlier `validation` dataset).
validation = convert_to_dataset_torch(X_validation, y_validation)

In [26]:
# Sequential batches over the full-set validation split. drop_last=True
# discards the trailing partial batch, so fewer predictions than labels are
# produced when the split size is not a multiple of batch_size.
validation_dataloader = DataLoader(
    validation,
    batch_size=batch_size,
    sampler=SequentialSampler(validation),
    drop_last=True,
    num_workers=0,
)

In [None]:
# Evaluate on the full-set validation split; keep the summed metric, the summed
# loss and the predicted labels (the raw logits are discarded).
total_eval_accuracy, total_eval_loss, _, pred_labels = eval_batch(validation_dataloader, model)

In [29]:
# The validation DataLoader is sequential and drops its trailing partial batch,
# so predictions exist only for the first len(pred_labels) rows. Trim the
# labels to that length instead of hard-coding the row count.
y_validation = y_validation.iloc[:len(pred_labels)]

In [None]:
# Number of validation pairs whose predicted label equals the true label
# (a count instead of dumping the matching labels as one long list).
sum(i == j for i, j in zip(pred_labels, y_validation.tolist()))

In [34]:
# Precision for the positive (is_duplicate == 1) class.
precision_score(y_validation.tolist(), pred_labels)

0.7176470588235294

In [33]:
# Recall for the positive (is_duplicate == 1) class.
recall_score(y_validation.tolist(), pred_labels)

1.0

In [35]:
# F1 score for the positive (is_duplicate == 1) class.
f1_score(y_validation.tolist(), pred_labels)

0.8356164383561643

In [36]:
# Pickle the whole model object (not only its state_dict) into the working directory.
# NOTE(review): the load cell above reads '../modelnew.pth' (parent directory),
# which is a different path from the one written here — confirm which is intended.
torch.save(model,'modelnew.pth')

In [37]:
# Insert the link (path) to the test dataset on Google Drive here.
# Re-binds `df` to a 10000-row random sample of the test set with a fresh
# 0..n-1 index and without the `name_1` column.
df = (
    pd.read_csv('../test.csv')
    .sample(10000)
    .reset_index(drop=True)
    .drop('name_1', axis=1)
)

In [None]:
# Query string that every `name_2` value is compared against.
stri = 'Rootes'
# Broadcast the scalar to the whole column. The previous empty pd.Series()
# followed by a chained fillna(inplace=True) depended on in-place mutation of a
# column selection, which pandas warns about and which does not update the
# frame under copy-on-write.
df['Query'] = stri

In [39]:
def convert_to_dataset(data: pd.DataFrame) -> TensorDataset:
    """Tokenize unlabeled ``(name_2, Query)`` pairs into a ``TensorDataset``.

    Each row is encoded as one sentence pair by the module-level ``tokenizer``
    and padded or truncated to 300 tokens.

    Args:
        data: Frame with the columns ``name_2`` and ``Query``.

    Returns:
        A ``TensorDataset`` whose items are
        ``(input_ids, attention_mask, token_type_ids)``, all ``torch.long``.
    """
    input_ids = []
    attention_masks = []
    token_type_ids = []
    # Iterate over the two columns directly; iterrows() builds a Series per row.
    text_pairs = zip(data["name_2"], data["Query"])
    for name_2, query in tqdm(text_pairs, total=data.shape[0]):
        encoded_dict = tokenizer.encode_plus(name_2, query, max_length=300,
                                             padding='max_length',  # replaces deprecated pad_to_max_length=True
                                             return_attention_mask=True, return_tensors='pt', truncation=True)
        input_ids.append(encoded_dict['input_ids'])
        token_type_ids.append(encoded_dict["token_type_ids"])
        attention_masks.append(encoded_dict['attention_mask'])

    # Tensor.to() is not in-place: the converted tensor must be re-bound,
    # otherwise the dtype conversion is silently discarded.
    input_ids = torch.cat(input_ids, dim=0).to(dtype=torch.long)
    token_type_ids = torch.cat(token_type_ids, dim=0).to(dtype=torch.long)
    attention_masks = torch.cat(attention_masks, dim=0).to(dtype=torch.long)

    return TensorDataset(input_ids, attention_masks, token_type_ids)

In [52]:
# Model inputs for inference: the query and the candidate name.
test_columns = ['Query', 'name_2']
X_test = df[test_columns]

In [None]:
# Tokenize the (name_2, Query) pairs of the test sample.
test = convert_to_dataset(X_test)

In [54]:
# One pair per batch, in the original row order, so the i-th model output
# corresponds to the i-th row of X_test.
test_dataloader = DataLoader(test, sampler=SequentialSampler(test), batch_size=1)

In [None]:
# Switch the model to evaluation mode before inference.
model.eval()

In [45]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# inference still runs on a machine without a GPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# Make sure the model lives on the selected device before inference.
model.to(DEVICE)

In [47]:
def infer(dataloader, model):
    """Run ``model`` over unlabeled batches and collect class probabilities.

    The function does not change the model's train/eval mode; the caller is
    expected to call ``model.eval()`` first. Relies on the module-level
    ``DEVICE``.

    Args:
        dataloader: Yields ``(input_ids, attention_masks, token_type_ids)``.
        model: Model whose output exposes ``.logits``.

    Returns:
        List with one tensor per batch holding the softmax of the logits over
        the class dimension; the tensors stay on ``DEVICE``.
    """
    embs = []

    for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
        input_ids, attention_masks, token_type_ids = batch

        input_ids = input_ids.to(DEVICE, dtype=torch.long)
        token_type_ids = token_type_ids.to(DEVICE, dtype=torch.long)
        attention_masks = attention_masks.to(DEVICE, dtype=torch.long)
        with torch.no_grad():
            logits = model(input_ids,
                           token_type_ids=token_type_ids,
                           attention_mask=attention_masks).logits
            # Explicit dim=1 (the class axis of the 2-D logits) instead of
            # Softmax() with an implicit dimension, which is deprecated.
            embs.append(torch.softmax(logits, dim=1))
    return embs

In [None]:
# Per-batch class probabilities for every (name_2, Query) pair of the test sample.
emn = infer(test_dataloader, model)

In [56]:
# Bring the per-batch probability tensors to the host as NumPy arrays.
ems = [i.detach().cpu().numpy() for i in emn]
# Stack all batches and keep column 1 (probability of the duplicate class).
# Concatenating, rather than reading only row 0 of each batch, keeps every
# row when the batch size is larger than 1.
em = np.concatenate(ems, axis=0)[:, 1]
# Assign the array positionally: a length mismatch raises, whereas wrapping it
# in a pd.Series would align on the index and silently fill NaN.
df['is_duplicate_score'] = em
df.sort_values('is_duplicate_score', ascending=False)

Unnamed: 0,pair_id,name_2,Query,is_duplicate_score
1,30489,Rootes Parts Service,Rootes,0.999780
2725,208032,Ford,Rootes,0.019416
2638,152393,C.O.I.M. S.p.A.,Rootes,0.005205
4653,75743,Gap,Rootes,0.004706
4741,111005,Gap,Rootes,0.004706
...,...,...,...,...
218,135228,A One Techniques (Pvt) Ltd.,Rootes,0.000379
7743,158278,Mfg. & Assembly Solutions Of Monterrey S De Rl...,Rootes,0.000369
6299,115922,Mfg. & Assembly Solutions Of Monterrey S De Rl...,Rootes,0.000369
684,26606,Associate Industries Pvt.,Rootes,0.000366
