In [None]:
#install the packages
!pip install transformers




In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [None]:
# Read the training and testing splits from their CSV files
train_csv_path = 'Train_dataset.csv'
test_csv_path = 'Test_dataset.csv'

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [None]:
# Pull the 'Text' (input) and 'L1' (label) columns of each CSV into plain lists
train_texts = train_data['Text'].tolist()
train_labels = train_data['L1'].tolist()

test_texts = test_data['Text'].tolist()
test_labels = test_data['L1'].tolist()

In [None]:
  # Tokenize and encode the text data.
  # Each split is encoded in a single tokenizer call with truncation and
  # padding enabled; the same options are shared by both splits.
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
  encode_options = {'truncation': True, 'padding': True}
  train_encodings = tokenizer(train_texts, **encode_options)
  test_encodings = tokenizer(test_texts, **encode_options)

In [None]:
# Map the string class names in the 'L1' column to integer class ids.
label_mapping = {"tech person": 0, "non-tech person": 1}

# Read the raw labels from the DataFrames rather than from train_labels /
# test_labels themselves: those names are overwritten here (and again when
# they are converted to tensors), so mapping them in place would raise a
# KeyError as soon as this cell is re-run. An unknown label still raises
# KeyError, exactly as before.
train_labels = [label_mapping[label] for label in train_data['L1']]
test_labels = [label_mapping[label] for label in test_data['L1']]


In [None]:
# Convert the encodings and labels to PyTorch tensors
def _encodings_to_tensors(encodings):
    """Return a dict mapping each field of `encodings` to a torch tensor."""
    return {name: torch.tensor(column) for name, column in encodings.items()}


train_inputs = _encodings_to_tensors(train_encodings)
train_labels = torch.tensor(train_labels)

test_inputs = _encodings_to_tensors(test_encodings)
test_labels = torch.tensor(test_labels)

In [None]:
# Load the pre-trained BERT model for sequence classification.
# The size of the classification head is derived from label_mapping (number of
# distinct class ids) so it cannot drift from the labels used for training.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(set(label_mapping.values())),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Set up the optimizer.
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of torch.optim.AdamW. eps and weight_decay are set
# explicitly to the defaults of the transformers implementation (1e-6 and 0.0)
# so the optimisation settings stay as close as possible to the previous ones.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)



In [None]:
# Training loop: fine-tune for num_epochs passes over the training data and
# report accuracy on the test split after every epoch.
num_epochs = 3
batch_size = 16

# Send every batch to whichever device the model currently lives on, so the
# loop works unchanged whether or not the model was moved to a GPU.
device = next(model.parameters()).device
num_train = len(train_labels)
# Seeded generator so the per-epoch shuffling is reproducible across runs.
shuffle_rng = torch.Generator().manual_seed(0)

for epoch in range(num_epochs):
    model.train()
    # Visit the training examples in a fresh random order each epoch; without
    # this the batches follow the CSV row order (e.g. grouped by class).
    order = torch.randperm(num_train, generator=shuffle_rng)
    for i in range(0, num_train, batch_size):
        batch_idx = order[i:i + batch_size]
        batch_inputs = {key: val[batch_idx].to(device) for key, val in train_inputs.items()}
        batch_labels = train_labels[batch_idx].to(device)

        optimizer.zero_grad()
        outputs = model(**batch_inputs, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    model.eval()
    batch_predictions = []
    with torch.no_grad():
        # Evaluate in mini-batches instead of one forward pass over the whole
        # test set, which keeps peak memory bounded by batch_size.
        for i in range(0, len(test_labels), batch_size):
            batch_inputs = {key: val[i:i + batch_size].to(device) for key, val in test_inputs.items()}
            logits = model(**batch_inputs).logits
            # Move predictions back to the CPU before handing them to sklearn.
            batch_predictions.append(logits.argmax(dim=1).cpu())

    predicted_labels = torch.cat(batch_predictions).numpy()
    accuracy = accuracy_score(test_labels.cpu().numpy(), predicted_labels)
    print(f'Epoch {epoch + 1} - Test Accuracy: {accuracy:.4f}')

Epoch 1 - Test Accuracy: 0.4400
Epoch 2 - Test Accuracy: 0.5200
Epoch 3 - Test Accuracy: 0.7200
