In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('train_fil.csv')
test = pd.read_csv('test_fil.csv')
val = pd.read_csv('val_fil.csv')

In [3]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',  # Return PyTorch tensors
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }



In [4]:
!pip install transformers



In [5]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [6]:
import pandas as pd

# Assuming 'df' is your DataFrame with the text data and labels
train_texts = train['source'].tolist()  # Column containing text data
train_labels = train['source'].tolist()  # Column containing labels

# Assuming 'df' is your DataFrame with the text data and labels
val_texts = val['source'].tolist()  # Column containing text data
val_labels = val['source'].tolist()  # Column containing labels

# Assuming 'df' is your DataFrame with the text data and labels
test_texts = test['source'].tolist()  # Column containing text data
test_labels = test['source'].tolist()  # Column containing labels



In [7]:
!pip install scikit-learn



In [8]:
# Create the DMatrix with training data

from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the labels to numeric categories
train_labels_encoded = label_encoder.fit_transform(train_labels)

val_labels_encoded = label_encoder.transform(val_labels)

test_labels_encoded = label_encoder.transform(test_labels)


In [9]:
# Example usage:
train_dataset = TextDataset(train_texts, train_labels_encoded, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Example usage:
val_dataset = TextDataset(val_texts, val_labels_encoded, tokenizer)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=True)

# Example usage:
test_dataset = TextDataset(test_texts, train_labels_encoded, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)



In [10]:
import torch

# Setting up the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [11]:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(train_labels)))
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
from transformers import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import torch

optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = CrossEntropyLoss()

num_epochs = 5

for epoch in tqdm(range(num_epochs), desc="Epochs"):
    # Training phase
    model.train()
    running_loss = 0.0
    for i, batch in enumerate(tqdm(train_dataloader, desc="Training Batch", leave=False), 1):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 100 == 0:  # Print average loss every 100 batches
            print(f"Batch {i}: Running Average Training Loss: {running_loss / 100:.4f}")
            running_loss = 0.0

    # Clear unused memory after every training cycle
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    # Validation phase
    model.eval()
    val_running_loss = 0.0
    val_steps = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validation Batch", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            val_running_loss += loss.item()
            val_steps += 1

    average_training_loss = running_loss / len(train_dataloader)
    average_validation_loss = val_running_loss / val_steps
    print(f"Epoch {epoch}: Average Training Loss: {average_training_loss:.4f}, Average Validation Loss: {average_validation_loss:.4f}")

    # Optional: clear more aggressively
    del input_ids, attention_mask, labels, outputs
    torch.cuda.empty_cache()  # Clear any cached memory


Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 100: Running Average Training Loss: 1.8605




Batch 200: Running Average Training Loss: 0.6113




Batch 300: Running Average Training Loss: 0.2687




Batch 400: Running Average Training Loss: 0.1514




Batch 500: Running Average Training Loss: 0.0955




Batch 600: Running Average Training Loss: 0.0643




Batch 700: Running Average Training Loss: 0.0466




Batch 800: Running Average Training Loss: 0.0375




Batch 900: Running Average Training Loss: 0.0299




Batch 1000: Running Average Training Loss: 0.0236




Batch 1100: Running Average Training Loss: 0.0198




Batch 1200: Running Average Training Loss: 0.0168




Batch 1300: Running Average Training Loss: 0.0149




Batch 1400: Running Average Training Loss: 0.0126




Batch 1500: Running Average Training Loss: 0.0111




Batch 1600: Running Average Training Loss: 0.0099




Batch 1700: Running Average Training Loss: 0.0088




Batch 1800: Running Average Training Loss: 0.0077




Batch 1900: Running Average Training Loss: 0.0070




Batch 2000: Running Average Training Loss: 0.0062




Batch 2100: Running Average Training Loss: 0.0057




Batch 2200: Running Average Training Loss: 0.0052




Batch 2300: Running Average Training Loss: 0.0047




Batch 2400: Running Average Training Loss: 0.0044




Batch 2500: Running Average Training Loss: 0.0041




Batch 2600: Running Average Training Loss: 0.0037




Batch 2700: Running Average Training Loss: 0.0035




Batch 2800: Running Average Training Loss: 0.0032




Batch 2900: Running Average Training Loss: 0.0030




Batch 3000: Running Average Training Loss: 0.0027




Batch 3100: Running Average Training Loss: 0.0025




Batch 3200: Running Average Training Loss: 0.0023




Batch 3300: Running Average Training Loss: 0.0022




Batch 3400: Running Average Training Loss: 0.0021




Batch 3500: Running Average Training Loss: 0.0019




Batch 3600: Running Average Training Loss: 0.0018




Batch 3700: Running Average Training Loss: 0.0017




Batch 3800: Running Average Training Loss: 0.0016




Batch 3900: Running Average Training Loss: 0.0015


Epochs:  20%|██        | 1/5 [1:31:56<6:07:46, 5516.67s/it]

Epoch 0: Average Training Loss: 0.0000, Average Validation Loss: 0.0009




Batch 100: Running Average Training Loss: 0.0014




Batch 200: Running Average Training Loss: 0.0012




Batch 300: Running Average Training Loss: 0.0012




Batch 400: Running Average Training Loss: 0.0011




Batch 500: Running Average Training Loss: 0.0010




Batch 600: Running Average Training Loss: 0.0010




Batch 700: Running Average Training Loss: 0.0009




Batch 800: Running Average Training Loss: 0.0009




Batch 900: Running Average Training Loss: 0.0008




Batch 1000: Running Average Training Loss: 0.0008




Batch 1100: Running Average Training Loss: 0.0007




Batch 1200: Running Average Training Loss: 0.0007




Batch 1300: Running Average Training Loss: 0.0006




Batch 1400: Running Average Training Loss: 0.0006




Batch 1500: Running Average Training Loss: 0.0006




Batch 1600: Running Average Training Loss: 0.0006




Batch 1700: Running Average Training Loss: 0.0005




Batch 1800: Running Average Training Loss: 0.0005




Batch 1900: Running Average Training Loss: 0.0005




Batch 2000: Running Average Training Loss: 0.0004




Batch 2100: Running Average Training Loss: 0.0004




Batch 2200: Running Average Training Loss: 0.0004




Batch 2300: Running Average Training Loss: 0.0004




Batch 2400: Running Average Training Loss: 0.0004




Batch 2500: Running Average Training Loss: 0.0003




Batch 2600: Running Average Training Loss: 0.0003




Batch 2700: Running Average Training Loss: 0.0003




Batch 2800: Running Average Training Loss: 0.0003




Batch 2900: Running Average Training Loss: 0.0003




Batch 3000: Running Average Training Loss: 0.0003




Batch 3100: Running Average Training Loss: 0.0002




Batch 3200: Running Average Training Loss: 0.0002




Batch 3300: Running Average Training Loss: 0.0002




Batch 3400: Running Average Training Loss: 0.0002




Batch 3500: Running Average Training Loss: 0.0002




Batch 3600: Running Average Training Loss: 0.0002




Batch 3700: Running Average Training Loss: 0.0002




Batch 3800: Running Average Training Loss: 0.0002




Batch 3900: Running Average Training Loss: 0.0002


Epochs:  40%|████      | 2/5 [3:03:50<4:35:44, 5514.83s/it]

Epoch 1: Average Training Loss: 0.0000, Average Validation Loss: 0.0001




Batch 100: Running Average Training Loss: 0.0002




Batch 200: Running Average Training Loss: 0.0001




Batch 300: Running Average Training Loss: 0.0001




Batch 400: Running Average Training Loss: 0.0001




Batch 500: Running Average Training Loss: 0.0001




Batch 600: Running Average Training Loss: 0.0001




Batch 700: Running Average Training Loss: 0.0001




Batch 800: Running Average Training Loss: 0.0001




Batch 900: Running Average Training Loss: 0.0001




Batch 1000: Running Average Training Loss: 0.0001




Batch 1100: Running Average Training Loss: 0.0001




Batch 1200: Running Average Training Loss: 0.0001




Batch 1300: Running Average Training Loss: 0.0001




Batch 1400: Running Average Training Loss: 0.0001




Batch 1500: Running Average Training Loss: 0.0001




Batch 1600: Running Average Training Loss: 0.0001




Batch 1700: Running Average Training Loss: 0.0001




Batch 1800: Running Average Training Loss: 0.0001




Batch 1900: Running Average Training Loss: 0.0001




Batch 2000: Running Average Training Loss: 0.0001




Batch 2100: Running Average Training Loss: 0.0001




Batch 2200: Running Average Training Loss: 0.0001




Batch 2300: Running Average Training Loss: 0.0000




Batch 2400: Running Average Training Loss: 0.0001




Batch 2500: Running Average Training Loss: 0.0000




Batch 2600: Running Average Training Loss: 0.0000




Batch 2700: Running Average Training Loss: 0.0000




Batch 2800: Running Average Training Loss: 0.0000




Batch 2900: Running Average Training Loss: 0.0000




Batch 3000: Running Average Training Loss: 0.0000




Batch 3100: Running Average Training Loss: 0.0000




Batch 3200: Running Average Training Loss: 0.0000




Batch 3300: Running Average Training Loss: 0.0000




Batch 3400: Running Average Training Loss: 0.0000




Batch 3500: Running Average Training Loss: 0.0000




Batch 3600: Running Average Training Loss: 0.0000




Batch 3700: Running Average Training Loss: 0.0000




Batch 3800: Running Average Training Loss: 0.0000




Batch 3900: Running Average Training Loss: 0.0000


Epochs:  60%|██████    | 3/5 [4:35:44<3:03:49, 5514.71s/it]

Epoch 2: Average Training Loss: 0.0000, Average Validation Loss: 0.0000




Batch 100: Running Average Training Loss: 0.0000




Batch 200: Running Average Training Loss: 0.0000




Batch 300: Running Average Training Loss: 0.0000




Batch 400: Running Average Training Loss: 0.0000




Batch 500: Running Average Training Loss: 0.0000




Batch 600: Running Average Training Loss: 0.0000




Batch 700: Running Average Training Loss: 0.0000




Batch 800: Running Average Training Loss: 0.0000




Batch 900: Running Average Training Loss: 0.0000




Batch 1000: Running Average Training Loss: 0.0000




Batch 1100: Running Average Training Loss: 0.0000




Batch 1200: Running Average Training Loss: 0.0000




Batch 1300: Running Average Training Loss: 0.0000




Batch 1400: Running Average Training Loss: 0.0000




Batch 1500: Running Average Training Loss: 0.0000




Batch 1600: Running Average Training Loss: 0.0000




Batch 1700: Running Average Training Loss: 0.0000




Batch 1800: Running Average Training Loss: 0.0000




Batch 1900: Running Average Training Loss: 0.0000




Batch 2000: Running Average Training Loss: 0.0000




Batch 2100: Running Average Training Loss: 0.0000




Batch 2200: Running Average Training Loss: 0.0000




Batch 2300: Running Average Training Loss: 0.0000




Batch 2400: Running Average Training Loss: 0.0000




Batch 2500: Running Average Training Loss: 0.0000




Batch 2600: Running Average Training Loss: 0.0000




Batch 2700: Running Average Training Loss: 0.0000




Batch 2800: Running Average Training Loss: 0.0000




Batch 2900: Running Average Training Loss: 0.0000




Batch 3000: Running Average Training Loss: 0.0000




Batch 3100: Running Average Training Loss: 0.0000




Batch 3200: Running Average Training Loss: 0.0000




Batch 3300: Running Average Training Loss: 0.0000




Batch 3400: Running Average Training Loss: 0.0000




Batch 3500: Running Average Training Loss: 0.0000




Batch 3600: Running Average Training Loss: 0.0000




Batch 3700: Running Average Training Loss: 0.0000




Batch 3800: Running Average Training Loss: 0.0000




Batch 3900: Running Average Training Loss: 0.0000


Epochs:  80%|████████  | 4/5 [6:07:38<1:31:54, 5514.45s/it]

Epoch 3: Average Training Loss: 0.0000, Average Validation Loss: 0.0000




Batch 100: Running Average Training Loss: 0.0000




Batch 200: Running Average Training Loss: 0.0000




Batch 300: Running Average Training Loss: 0.0000




Batch 400: Running Average Training Loss: 0.0000




Batch 500: Running Average Training Loss: 0.0000




Batch 600: Running Average Training Loss: 0.0000




Batch 700: Running Average Training Loss: 0.0000




Batch 800: Running Average Training Loss: 0.0000




Batch 900: Running Average Training Loss: 0.0000




Batch 1000: Running Average Training Loss: 0.0000




Batch 1100: Running Average Training Loss: 0.0000




Batch 1200: Running Average Training Loss: 0.0000




Batch 1300: Running Average Training Loss: 0.0000




Batch 1400: Running Average Training Loss: 0.0000




Batch 1500: Running Average Training Loss: 0.0000




Batch 1600: Running Average Training Loss: 0.0000




Batch 1700: Running Average Training Loss: 0.0000




Batch 1800: Running Average Training Loss: 0.0000




Batch 1900: Running Average Training Loss: 0.0000




Batch 2000: Running Average Training Loss: 0.0000




Batch 2100: Running Average Training Loss: 0.0000




Batch 2200: Running Average Training Loss: 0.0000




Batch 2300: Running Average Training Loss: 0.0000




Batch 2400: Running Average Training Loss: 0.0000




Batch 2500: Running Average Training Loss: 0.0000




Batch 2600: Running Average Training Loss: 0.0000




Batch 2700: Running Average Training Loss: 0.0000




Batch 2800: Running Average Training Loss: 0.0000




Batch 2900: Running Average Training Loss: 0.0000




Batch 3000: Running Average Training Loss: 0.0000




Batch 3100: Running Average Training Loss: 0.0000




Batch 3200: Running Average Training Loss: 0.0000




Batch 3300: Running Average Training Loss: 0.0000




Batch 3400: Running Average Training Loss: 0.0000




Batch 3500: Running Average Training Loss: 0.0000




Batch 3600: Running Average Training Loss: 0.0000




Batch 3700: Running Average Training Loss: 0.0000




Batch 3800: Running Average Training Loss: 0.0000




Batch 3900: Running Average Training Loss: 0.0000


Epochs: 100%|██████████| 5/5 [7:39:32<00:00, 5514.51s/it]  

Epoch 4: Average Training Loss: 0.0000, Average Validation Loss: 0.0000





In [22]:
model.eval()
val_loader = DataLoader(validation_dataset, batch_size=16, shuffle=False)
total, correct = 0, 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.argmax(dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

print(f'Validation Accuracy: {correct / total:.4f}')


NameError: name 'validation_dataset' is not defined