In [None]:
%pip install transformers numpy pandas torch

In [None]:
# Print the installed TensorFlow version.
# NOTE(review): tensorflow is not listed in the %pip install cell and no other
# visible cell uses `tf` — confirm this check is still needed.
import tensorflow as tf

print(tf.__version__)

In [None]:
# Print the installed PyTorch version.
import torch

print(torch.__version__)

In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler

# transformers deprecated its own AdamW in favour of torch.optim.AdamW, and
# newer releases no longer export it. Fall back to the PyTorch optimizer so
# this cell imports on either kind of installation.
# Note: torch.optim.AdamW defaults to weight_decay=0.01, whereas the
# transformers implementation defaulted to 0.0.
try:
    from transformers import AdamW
except ImportError:
    from torch.optim import AdamW


In [None]:
# Print the installed transformers version.
import transformers

print(transformers.__version__)

In [None]:
# Read train.csv (path relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:

# Work on a random subset of at most 100000 rows.
# - random_state fixes the subset so a kernel restart reproduces the same rows.
# - The sample size is capped at the number of rows available, because
#   DataFrame.sample raises ValueError when n exceeds the frame length
#   (sampling is without replacement).
data = data.sample(n=min(100000, len(data)), random_state=42).reset_index(drop=True)
data.head()

In [None]:
# Number of missing values in each column
data.isna().sum()

In [None]:
# Discard every row that has a missing value in any column
data = data.dropna(axis=0, how='any')

In [None]:
# Preprocess data for the BERT model: encode each (question1, question2) pair
# into fixed-length input ids plus an attention mask, and build the label tensor.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

input_ids = []
attention_masks = []

for sent1, sent2 in zip(data['question1'], data['question2']):
    # padding='max_length' is the supported replacement for the deprecated
    # pad_to_max_length=True. truncation=True states explicitly that pairs
    # longer than max_length are cut, instead of relying on the tokenizer's
    # warn-and-fall-back behaviour when only max_length is given.
    encoded_dict = tokenizer(
        sent1,
        sent2,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # With return_tensors='pt' each entry has a leading batch axis of size 1,
    # so the per-row tensors can be concatenated along dim 0 below.
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# NOTE(review): the tokenizer's token_type_ids (segment ids separating the two
# questions) are not kept, so the model is later called without them —
# confirm this is intended.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# Force int64 class indices: if the column was read as float (e.g. it held
# NaNs before dropna) the labels would otherwise become a float tensor, which
# the classification loss does not accept as class indices.
labels = torch.tensor(data['is_duplicate'].values, dtype=torch.long)

In [None]:
# Data split: 90% train / 10% validation.
# Ids, masks and labels are split in ONE call so the three stay row-aligned by
# construction. The previous second call only lined up because it reused the
# same random_state, and it re-split input_ids just to throw the result away.
# train_test_split returns a (train, test) pair per input array, in input order.
(
    train_inputs, val_inputs,
    train_masks, val_masks,
    train_labels, val_labels,
) = train_test_split(
    input_ids, attention_masks, labels, random_state=42, test_size=0.1
)

In [None]:
batch_size = 32

# Bundle each split's (input ids, attention mask, label) tensors into a dataset.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Training rows are drawn in random order; validation rows in stored order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size, sampler=val_sampler)

In [None]:
# Instantiate the BERT sequence classifier from the 'bert-base-uncased'
# checkpoint, configured for two output labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

In [None]:
from sklearn.metrics import accuracy_score

# Fine-tune the classifier for a fixed number of epochs and print validation
# accuracy after each one.

# Use the GPU when one is available. The model and every batch must live on the
# same device, so batches are moved inside the loops below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# torch.optim.AdamW is used directly: the AdamW class that transformers used to
# ship is deprecated in favour of it and is absent from newer releases.
# weight_decay=0.0 keeps the default of the transformers implementation
# (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

epochs = 3
# A named loop variable is used instead of `_`, which IPython reserves for the
# last cell output.
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        # Unpack into `batch_labels`, not `labels`: rebinding `labels` here would
        # overwrite the full label tensor built in the preprocessing cell and
        # break a later re-run of the split cell.
        inputs, masks, batch_labels = (tensor.to(device) for tensor in batch)
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(inputs, attention_mask=masks, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # ---- validation pass ----
    model.eval()
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in val_dataloader:
            inputs, masks, batch_labels = batch
            outputs = model(inputs.to(device), attention_mask=masks.to(device))
            # Predicted class = index of the largest logit for each row.
            batch_predictions = torch.argmax(outputs.logits, dim=1).tolist()

            all_predictions.extend(batch_predictions)
            all_labels.extend(batch_labels.tolist())

    accuracy = accuracy_score(all_labels, all_predictions)
    print(f"Accuracy: {accuracy:.4f}")