In [1]:
import pandas as pd
from transformers import BertTokenizerFast, BertForSequenceClassification, get_linear_schedule_with_warmup, AdamW
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm 
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import os
import numpy as np
import random
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("../input/TrainDuplicates/traincleaned.csv")

In [3]:
df.sample(15)

Unnamed: 0.1,Unnamed: 0,name_1,name_2,is_duplicate
4798,5488,ООО Руспласт,"РУСПЛАСТ, ООО (ЛОБАНОВО)",1
1264,688,Dawn Imp. & Exp.,"Ningbo Green Was Imp. & Exp. . Co., Ltd. Was C...",0
4911,5618,Fenner USA,"Fenner Precision, Inc.",1
1986,2758,Fms Logistics Mexico S De Rl De Cv,Epic,0
5298,6067,Bridgestone India Automotive Products Private ...,Bridgestone India,1
4333,4955,K Flex Usa,Planika Flex D.O.O,1
3296,3774,"Brenntag Vietnam Co., Ltd.",Brenntag Ingredients Inc.,1
4772,5458,Sumitomo Industrias Pesadas Do Brasil Ltda,Sumitomo Rubber Industries Usa,1
909,1497,"Commerce Logistics (Shanghai) Co., Ltd.","Bi Link(Shanghai) Co., Ltd.",0
4549,5202,Bridgestone De Costa Rica S.A.,Bridgestone De Mexico S.A. De C.V.,1


In [4]:
def seed_everything(seed = 1234):
     random.seed(seed)
     os.environ['PYTHONHASHSEED'] = str(seed)
     np.random.seed(seed)     
     torch.manual_seed(seed)
     torch.cuda.manual_seed(seed)
     torch.backends.cudnn.deterministic = True

In [5]:
seed_everything()

In [None]:
raw_model = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(raw_model, do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(
    raw_model, 
    num_labels=2, 
    output_attentions=False,
    output_hidden_states=True, 
)

In [7]:
def convert_to_dataset_torch(data: pd.DataFrame, labels: pd.Series) -> TensorDataset:
    input_ids = []
    attention_masks = []
    token_type_ids = []
    for _, row in tqdm(data.iterrows(), total=data.shape[0]):
        encoded_dict = tokenizer.encode_plus(row["name_1"], row["name_2"], max_length=300,
                                             pad_to_max_length=True,
                                             return_attention_mask=True, return_tensors='pt', truncation=True)
        # Add the encoded sentences to the list.
        input_ids.append(encoded_dict['input_ids'])
        token_type_ids.append(encoded_dict["token_type_ids"])
        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.

    input_ids = torch.cat(input_ids, dim=0)
    token_type_ids = torch.cat(token_type_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels.values)
    input_ids.to(dtype=torch.long)
    token_type_ids.to(dtype=torch.long)
    attention_masks.to(dtype=torch.long)
    labels.to(dtype=torch.long)

    return TensorDataset(input_ids, attention_masks, token_type_ids, labels)

In [8]:
X_train, X_validation, y_train, y_validation = train_test_split(df[["name_1", "name_2"]],
                                                    df["is_duplicate"], test_size=0.3, random_state=42, stratify=df["is_duplicate"])

In [9]:
train = convert_to_dataset_torch(X_train, y_train)
validation = convert_to_dataset_torch(X_validation, y_validation)

100%|██████████| 4475/4475 [00:02<00:00, 1637.58it/s]
100%|██████████| 1919/1919 [00:01<00:00, 1414.18it/s]


In [10]:
batch_size = 8

In [11]:
train_dataloader = DataLoader(
            train,  
            sampler = RandomSampler(train),
            batch_size = batch_size,
            num_workers = 0,
            drop_last=True
        )


validation_dataloader = DataLoader(
            validation, 
            sampler = SequentialSampler(validation), 
            batch_size = batch_size, 
            num_workers = 0,
            drop_last=True
        )

In [12]:
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)



In [13]:
epochs = 7
total_steps = len(train_dataloader) * epochs

In [14]:
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [15]:
DEVICE = 'cuda:0'

In [None]:
model.to(DEVICE)

In [17]:
def fit_batch(dataloader, model, optimizer, epoch):
    total_train_loss = 0

    for batch in tqdm(dataloader, desc=f"Training epoch:{epoch}", unit="batch"):
      
        input_ids, attention_masks, token_type_ids, labels = batch

        
        input_ids = input_ids.to(DEVICE)
        token_type_ids = token_type_ids.to(DEVICE)
        attention_masks = attention_masks.to(DEVICE)
        #labels = labels.long()
        labels = labels.to(DEVICE)

        loss = (model(input_ids=input_ids,
                      token_type_ids=token_type_ids,
                      attention_mask=attention_masks,
                      labels=labels)).loss

        total_train_loss += loss
        optimizer.zero_grad()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()

    return total_train_loss

In [41]:
def eval_batch(dataloader, model, metric=accuracy_score):
    total_eval_accuracy = 0
    total_eval_loss = 0
    predictions, predicted_labels = [], []
    notright = []
    for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
   
        input_ids, attention_masks, token_type_ids, labels = batch

        input_ids = input_ids.to(DEVICE)
        token_type_ids = token_type_ids.to(DEVICE)
        attention_masks = attention_masks.to(DEVICE)
        labels = labels.to(DEVICE)
        with torch.no_grad():
           
            m = (model(input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=attention_masks,
                       labels=labels))
        total_eval_loss += m.loss

        y_pred = np.argmax(m.logits.detach().cpu().numpy(), axis=1).flatten()
        total_eval_accuracy += metric(labels.cpu(), y_pred)

        predictions.extend(m.logits.detach().tolist())
        predicted_labels.extend(y_pred.tolist())
        if y_pred !=labels:
            notright.append((input_ids, token_type_ids, attention_masks, labels, y_pred))
    return total_eval_accuracy, total_eval_loss, predictions, predicted_labels, notright


In [19]:
def train(train_dataloader, validation_dataloader, model, optimizer, epochs):

    training_stats = []



    for epoch in range(0, epochs):

        
        total_train_loss = 0

        model.train()

        total_train_loss = fit_batch(train_dataloader, model, optimizer, epoch)

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"  Train Loss: {avg_train_loss}")
   
        model.eval()

        total_eval_accuracy, total_eval_loss, _, _ = eval_batch(validation_dataloader, model)
        FILE = 'modelnew.pth'
        torch.save(model, FILE)
       
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)

        print(f"  Accuracy: {avg_val_accuracy}")

      
        avg_val_loss = total_eval_loss / len(validation_dataloader)


        print(f"  Validation Loss: {avg_val_loss}")

     
        training_stats.append(
            {
                'epoch': epoch,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
            }
        )

    print("")
    print("Training complete!")
    return training_stats

In [None]:
training_stats = train(train_dataloader, validation_dataloader, model, optimizer, epochs)

In [42]:
total_eval_accuracy, total_eval_loss, preds, predslab, notr = eval_batch(validation_dataloader, model)

Evaluating: 100%|██████████| 239/239 [00:22<00:00, 10.40batch/s]


In [45]:
len(notr)

239