In [1]:
import pandas as pd
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm 
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import os
import numpy as np
import random
from sklearn.metrics import precision_recall_curve, classification_report
import re
import string
from torch.nn import CrossEntropyLoss, MSELoss

In [2]:
df = pd.read_csv("../input/traincleaned/traincleaned.csv")

In [3]:
def seed_everything(seed = 1234):
     random.seed(seed)
     os.environ['PYTHONHASHSEED'] = str(seed)
     np.random.seed(seed)     
     torch.manual_seed(seed)
     torch.cuda.manual_seed(seed)
     torch.backends.cudnn.deterministic = True

In [4]:
seed_everything()

In [None]:
raw_model = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(raw_model, do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(
    raw_model, 
    num_labels=2, 
    output_attentions=False,
    output_hidden_states=True, 
)

In [7]:
def convert_to_dataset_torch(data: pd.DataFrame, labels: pd.Series) -> TensorDataset:
    input_ids = []
    attention_masks = []
    token_type_ids = []
    for _, row in tqdm(data.iterrows(), total=data.shape[0]):
        encoded_dict = tokenizer.encode_plus(row["name_1"], row["name_2"], max_length=128,
                                             pad_to_max_length=True,
                                             return_attention_mask=True, return_tensors='pt', truncation=True)

        input_ids.append(encoded_dict['input_ids'])
        token_type_ids.append(encoded_dict["token_type_ids"])
        attention_masks.append(encoded_dict['attention_mask'])



    input_ids = torch.cat(input_ids, dim=0)
    token_type_ids = torch.cat(token_type_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels.values)
    input_ids.to(dtype=torch.long)
    token_type_ids.to(dtype=torch.long)
    attention_masks.to(dtype=torch.long)
    labels.to(dtype=torch.long)

    return TensorDataset(input_ids, attention_masks, token_type_ids, labels)

In [8]:
X_train, X_validation, y_train, y_validation = train_test_split(df[["name_1", "name_2"]],
                                                    df["is_duplicate"], test_size=0.3, random_state=21, stratify=df["is_duplicate"])

In [9]:
train = convert_to_dataset_torch(X_train, y_train)
validation = convert_to_dataset_torch(X_validation, y_validation)

100%|██████████| 348473/348473 [02:08<00:00, 2721.80it/s]
100%|██████████| 149346/149346 [00:52<00:00, 2870.95it/s]


In [10]:
batch_size = 16

In [11]:
train_dataloader = DataLoader(
            train,  
            sampler = RandomSampler(train),
            batch_size = batch_size,
            num_workers = 0,
            drop_last=True
        )


validation_dataloader = DataLoader(
            validation, 
            sampler = SequentialSampler(validation), 
            batch_size = batch_size, 
            num_workers = 0,
            drop_last=True
        )

In [12]:
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)



In [13]:
DEVICE = 'cuda:0'

In [None]:
model.to(DEVICE)

In [15]:
def simple_accuracy(preds, labels):
    return (preds == labels).mean()

def acc_and_f1(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds)
    
    return {"acc": acc, "f1": f1}

In [16]:
epochs = 3

In [None]:
model.train()

weights = torch.tensor([1, 20], dtype=torch.float, device=DEVICE)

for _ in range(epochs):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(DEVICE) for t in batch)
        
        input_ids, input_mask, segment_ids, label_ids = batch
        
        logits = model(input_ids, segment_ids, input_mask, labels=None).logits
        #print(logits)
        loss_fct = CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, 2), label_ids.view(-1))
        
        optimizer.zero_grad()
        loss.backward()
        print(loss)
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        
        optimizer.step()
        #global_step += 1
        
    print("Training Loss: %s" % (str(tr_loss)))

In [23]:
torch.save(model, 'modelfull.pth')

In [30]:
y_validation

59663     0
161264    0
320084    0
30945     0
357593    0
         ..
16262     0
470471    0
439181    0
355880    0
417045    0
Name: is_duplicate, Length: 149346, dtype: int64

In [29]:
eval_loss = 0
nb_eval_steps = 0
preds = []


NameError: name 'eval_features' is not defined

In [28]:
model.eval()
for input_ids, input_mask, segment_ids, label_ids in tqdm(validation_dataloader, desc="Evaluating"):
    input_ids = input_ids.to(DEVICE)
    input_mask = input_mask.to(DEVICE)
    segment_ids = segment_ids.to(DEVICE)
    label_ids = label_ids.to(DEVICE)

    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask, labels=None).logits

    # create eval loss and other metric required by the task
    loss_fct = CrossEntropyLoss()
    tmp_eval_loss = loss_fct(logits.view(-1, 2), label_ids.view(-1))

    eval_loss += tmp_eval_loss.mean().item()
    nb_eval_steps += 1
    if len(preds) == 0:
        preds.append(logits.detach().cpu().numpy())
    else:
        preds[0] = np.append(
            preds[0], logits.detach().cpu().numpy(), axis=0)

eval_loss = eval_loss / nb_eval_steps
preds = preds[0]
preds = np.argmax(preds, axis=1)

result = acc_and_f1(preds, all_label_ids.numpy())
print(result)

Evaluating: 100%|██████████| 9334/9334 [09:21<00:00, 16.63it/s]


NameError: name 'all_label_ids' is not defined