<a href="https://colab.research.google.com/github/Gavinchen104/BERT_classification/blob/main/BERT1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Maps each tag found in the CSV's 'label' column to the integer class index
# used as the classification target.
label_mapping = {
  'R': 0,
  'java': 1,
  'javascript': 2,
  'php': 3,
  'python': 4
}

def load_data(data_file, samples_per_class=400, random_state=None):
  """Read the dataset and return a class-balanced sample of titles and labels.

  The file is parsed as a semicolon-separated CSV with 'title' and 'label'
  columns. Labels are translated through ``label_mapping``; rows whose label
  is not in the mapping are discarded. Every remaining class is then sampled
  to exactly ``samples_per_class`` rows.

  Args:
    data_file: Path (or buffer) accepted by ``pandas.read_csv``.
    samples_per_class: Number of rows drawn for each class.
    random_state: Optional seed forwarded to ``DataFrame.sample`` to make the
      draw reproducible. ``None`` keeps the unseeded behaviour.

  Returns:
    A ``(titles, labels)`` tuple of equal-length lists, grouped by ascending
    class index. ``labels`` holds plain ``int`` class indices. Both lists are
    empty when no row carries a known label.
  """
  df = pd.read_csv(data_file, sep=';')

  # Unknown labels map to NaN, which would silently turn the whole column into
  # floats. Drop them and restore an integer dtype so the labels can be used
  # directly as class indices.
  df = (
    df.assign(category=df['label'].map(label_mapping))
      .dropna(subset=['category'])
      .astype({'category': int})
  )

  # Sample with replacement only when a class is smaller than the requested
  # size; comparing against the actual sample size avoids the ValueError that
  # a fixed threshold caused for classes just below it. Iterating the groups
  # (instead of groupby.apply) keeps the 'category' column on every pandas
  # version.
  sampled = [
    group.sample(
      n=samples_per_class,
      replace=len(group) < samples_per_class,
      random_state=random_state,
    )
    for _, group in df.groupby('category')
  ]
  if not sampled:
    return [], []

  df_filtered = pd.concat(sampled, ignore_index=True)

  titles = df_filtered['title'].tolist()
  labels = df_filtered['category'].tolist()

  return titles, labels

In [3]:
# Location of the semicolon-separated dataset file read by load_data.
dataset_path = '/content/full_dataset_v3.csv'
titles, labels = load_data(dataset_path)

In [4]:
class TextClassificationDataset(Dataset):
  """Map-style dataset that tokenizes one text per item on access.

  Args:
    text: Indexable sequence of input strings.
    label: Indexable sequence of class indices, parallel to ``text``.
    tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
      max_length=..., padding='max_length', truncation=True)`` that returns a
      mapping with 'input_ids' and 'attention_mask' tensors.
    max_length: Length every sequence is padded or truncated to.

  Raises:
    ValueError: If ``text`` and ``label`` differ in length.
  """
  def __init__(self, text, label, tokenizer, max_length):
    if len(text) != len(label):
      raise ValueError(
        f"text and label must have the same length, got {len(text)} and {len(label)}"
      )
    self.text = text
    self.label = label
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Return the number of examples."""
    return len(self.text)

  def __getitem__(self, idx):
    """Tokenize example ``idx`` and return its tensors together with the label."""
    encoding = self.tokenizer(
      self.text[idx],
      return_tensors='pt',
      max_length=self.max_length,
      padding='max_length',
      truncation=True,
    )
    return {
      # flatten() collapses the tokenizer output to 1-D (presumably removing
      # the single-example batch axis) so the DataLoader can stack items.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      # Force an integer dtype: class-index targets must be long, and labels
      # that arrive as floats would otherwise yield a float tensor.
      'label': torch.tensor(self.label[idx], dtype=torch.long),
    }

In [5]:
class BERTClassifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear classification layer.

  Args:
    bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
    num_classes: Number of output logits produced per example.
  """

  def __init__(self, bert_model_name, num_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(bert_model_name)
    self.dropout = nn.Dropout(0.1)
    # The classifier input width follows the encoder's configured hidden size.
    hidden_dim = self.bert.config.hidden_size
    self.fc = nn.Linear(hidden_dim, num_classes)

  def forward(self, input_ids, attention_mask):
    """Return unnormalized class logits for a batch of token ids and masks."""
    encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Classify from the encoder's pooled output after dropout.
    return self.fc(self.dropout(encoded.pooler_output))

In [6]:
def train(model, data_loader, optimizer, scheduler, device):
  """Run one optimisation pass over ``data_loader``.

  Each batch must be a mapping with 'input_ids', 'attention_mask' and 'label'
  tensors. The optimizer and the scheduler are both stepped once per batch.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)`` that
      returns class logits.
    data_loader: Iterable of batches.
    optimizer: Optimizer over the model's parameters.
    scheduler: Learning-rate scheduler stepped after every optimizer step.
    device: Device the batch tensors are moved to.

  Returns:
    The mean cross-entropy loss over the batches as a float, or ``0.0`` when
    the loader yields no batches.
  """
  model.train()
  # The loss module is stateless, so build it once instead of once per batch.
  criterion = nn.CrossEntropyLoss()
  running_loss = 0.0
  num_batches = 0
  for batch in data_loader:
    optimizer.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    scheduler.step()
    running_loss += loss.item()
    num_batches += 1
  return running_loss / num_batches if num_batches else 0.0

In [7]:
def evaluate(model, data_loader, device):
  """Score ``model`` on every batch of ``data_loader`` without tracking gradients.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)`` that
      returns per-class scores.
    data_loader: Iterable of batches with 'input_ids', 'attention_mask' and
      'label' tensors.
    device: Device the batch tensors are moved to.

  Returns:
    A ``(accuracy, report)`` tuple: the accuracy score and the text
    classification report computed over all collected predictions.
  """
  model.eval()
  y_pred, y_true = [], []
  with torch.no_grad():
    for batch in data_loader:
      ids = batch['input_ids'].to(device)
      mask = batch['attention_mask'].to(device)
      targets = batch['label'].to(device)
      scores = model(input_ids=ids, attention_mask=mask)
      # Predicted class = position of the highest score in each row.
      best = scores.max(dim=1).indices
      y_pred += best.cpu().tolist()
      y_true += targets.cpu().tolist()
  acc = accuracy_score(y_true, y_pred)
  report = classification_report(y_true, y_pred, zero_division=1)
  return acc, report

In [8]:
def predict_sentiment(text, model, tokenizer, device, max_length=128, label_map=None):
    """Predict the label name for a single piece of text.

    Despite its name, the function returns whichever label name the mapping
    associates with the highest-scoring class.

    Args:
        text: Input string to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
            It may return the logits tensor directly or an object exposing a
            ``logits`` attribute.
        tokenizer: Callable producing 'input_ids' and 'attention_mask' tensors.
        device: Device the encoded tensors are moved to.
        max_length: Length the text is padded or truncated to.
        label_map: Optional ``{label_name: class_index}`` mapping. Defaults to
            the module-level ``label_mapping``.

    Returns:
        The label name for the predicted class index, or ``"Unknown"`` when the
        index is not present in the mapping.
    """
    if label_map is None:
        label_map = label_mapping

    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # A plain tensor has no `.logits` attribute, so reading it unconditionally
    # raised AttributeError for models that return the logits directly.
    logits = getattr(outputs, 'logits', outputs)
    preds = torch.argmax(logits, dim=1).item()

    # Reverse mapping from numerical category back to label
    reverse_label_mapping = {v: k for k, v in label_map.items()}

    return reverse_label_mapping.get(preds, "Unknown")

In [9]:
# Set up parameters
bert_model_name = 'bert-base-uncased'
num_classes = 5
max_length = 256
batch_size = 20
num_epochs = 4
learning_rate = 2e-5

In [10]:
# Hold out 20% of the examples for validation; the fixed seed keeps the split
# identical across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
  titles,
  labels,
  test_size=0.2,
  random_state=42,
)

In [11]:
# SECURITY: never hard-code an access token in a notebook. Any token that was
# previously committed here is exposed and must be revoked in the Hugging Face
# account settings.
# Provide the token from outside the notebook instead (for example as an
# HF_TOKEN environment variable or secret); it stays None when absent.
token = os.getenv("HF_TOKEN")


In [12]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)


def _build_dataset(texts, targets):
  """Wrap parallel text/label lists in a dataset using the shared tokenizer and max_length."""
  return TextClassificationDataset(texts, targets, tokenizer, max_length)


train_dataset = _build_dataset(train_texts, train_labels)
val_dataset = _build_dataset(val_texts, val_labels)

# Only the training loader shuffles; the validation loader keeps the default order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

model = BERTClassifier(bert_model_name, num_classes)
model = model.to(device)

In [14]:
# Rebinds AdamW to the torch.optim implementation, replacing the name imported
# from transformers in the first cell.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# The scheduler is stepped once per batch, so its horizon is the total number
# of batches across all epochs.
total_steps = num_epochs * len(train_dataloader)

optimizer = AdamW(params=model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps,
)

In [None]:
# Train for the configured number of epochs, reporting validation metrics
# after each one.
for epoch in range(num_epochs):
  print(f"Epoch {epoch + 1}/{num_epochs}")
  train(
    model,
    data_loader=train_dataloader,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
  )
  validation_metrics = evaluate(model, data_loader=val_dataloader, device=device)
  accuracy, report = validation_metrics
  print(f"Validation Accuracy: {accuracy:.4f}")
  print(report)

Epoch 1/4


In [None]:
# Persist the trained weights, then run a single example through the model.
checkpoint_path = "bert_classifier.pth"
torch.save(model.state_dict(), checkpoint_path)

test_text = "Package python software with pylucene dependency"
sentiment = predict_sentiment(
  test_text,
  model=model,
  tokenizer=tokenizer,
  device=device,
)
print(f"Predicted sentiment: {sentiment}")