In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [3]:
# Load the raw train/test tables from CSV.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Drop unusable rows. A training row needs both a text and a label: a missing
# label would otherwise reach the label list below as NaN and the labels could
# no longer be converted to integer class ids.
train_data = train_data.dropna(subset=['text', 'label'])
# Only 'text' is required for the test table; this cell never reads a label from it.
test_data = test_data.dropna(subset=['text'])

# Assuming the CSV files have 'text' and 'label' columns.
# Cast explicitly so downstream code always receives str texts and int labels,
# whatever dtype pandas inferred for the columns.
texts = train_data['text'].astype(str).tolist()
labels = train_data['label'].astype(int).tolist()


In [4]:

# Preprocessing
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128  # Define maximum sequence length

def preprocess_text(text):
    """Tokenize a single text into fixed-length model inputs.

    Args:
        text: The raw string to encode.

    Returns:
        The tokenizer's encoding of ``text`` as PyTorch tensors
        (``return_tensors='pt'``), truncated and padded to ``max_length``
        tokens. Callers in this notebook read its ``'input_ids'`` and
        ``'attention_mask'`` entries.
    """
    # Calling the tokenizer directly is the documented replacement for the
    # deprecated `encode_plus`; for a single string the result is the same.
    return tokenizer(
        text,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )

# Tokenize and Prepare Data
tokenized_texts = [preprocess_text(text) for text in texts]

# Stack the per-text encodings along dim 0 so every text contributes its own
# row(s) to the id and mask tensors, in the same order as `texts`.
input_ids = torch.cat([encoding['input_ids'] for encoding in tokenized_texts], dim=0)
attention_masks = torch.cat([encoding['attention_mask'] for encoding in tokenized_texts], dim=0)

# `as_tensor` converts the label list exactly like `torch.tensor`, but if this
# cell is re-run and `labels` is already a tensor it is reused as-is instead of
# triggering PyTorch's copy-construct warning.
labels = torch.as_tensor(labels)

# Split Data into Train and Test Sets
# One call splits ids, masks and labels with the same shuffled indices, so the
# three train (and three test) tensors are aligned row-for-row by construction
# rather than by repeating the call with a matching seed.
(train_inputs, test_inputs,
 train_masks, test_masks,
 train_labels, test_labels) = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42
)

# Wrap the split tensors as datasets; each item is (input ids, attention mask, label).
# NOTE(review): this rebinds `train_data` / `test_data`, which held the loaded
# DataFrames in the data-loading cell — confirm nothing later needs the frames.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

batch_size = 16

# Training batches are drawn in random order; evaluation batches in dataset order.
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=RandomSampler(train_data),
)
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_size,
    sampler=SequentialSampler(test_data),
)

# Load BERT Model for Sequence Classification
# The classification head is newly (randomly) initialized on top of the
# pretrained encoder, so seed torch first to make runs repeatable.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Use PyTorch's AdamW: the implementation exported by `transformers` is
# deprecated in favour of this one. `weight_decay=0.0` keeps the default of the
# optimizer being replaced (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Fine-tune the model for a fixed number of passes over the training loader.
num_epochs = 3
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Batch layout follows the TensorDataset: ids, attention mask, labels.
        b_ids, b_mask, b_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids=b_ids, attention_mask=b_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Accumulate the scalar batch loss for the epoch average.
        total_loss += loss.item()

    # Mean of the per-batch losses seen in this epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {avg_train_loss:.4f}')

# Evaluation...


Epoch 1/3, Average Training Loss: 0.0936
Epoch 2/3, Average Training Loss: 0.0197
Epoch 3/3, Average Training Loss: 0.0101


In [6]:
# Score the fine-tuned model on the held-out loader.
model.eval()
predictions = []
true_labels = []

# Gradients are not needed for inference, so the whole pass runs under no_grad.
with torch.no_grad():
    for batch in test_dataloader:
        # Batch layout follows the TensorDataset: ids, attention mask, labels.
        b_ids, b_mask, b_labels = (t.to(device) for t in batch)
        outputs = model(input_ids=b_ids, attention_mask=b_mask, labels=b_labels)

        # Predicted class = index of the largest logit in each row.
        predictions += outputs.logits.argmax(dim=1).tolist()
        true_labels += b_labels.tolist()

# Overall accuracy plus the per-class precision/recall/F1 table.
accuracy = accuracy_score(true_labels, predictions)
report = classification_report(true_labels, predictions)

print(f'Accuracy: {accuracy}')
print(f'Classification Report: \n{report}')


Accuracy: 0.9915723573320491
Classification Report: 
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2079
           1       0.99      0.99      0.99      2074

    accuracy                           0.99      4153
   macro avg       0.99      0.99      0.99      4153
weighted avg       0.99      0.99      0.99      4153

