In [49]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import torch 
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, random_split
from torch.nn.functional import interpolate


In [38]:
train = pd.read_csv("../../data/twitter/train (1).csv")
test = pd.read_csv("../../data/twitter/test.csv")
print(f"Training data shape: {train.shape}")
print(f"Testing data shape: {test.shape}")

Training data shape: (7613, 5)
Testing data shape: (3263, 4)


In [19]:
def clean_data(df, column):
    """
    Clean the text data in the specified column.
    """
    df[column] = df[column].str.lower()
    df[column] = df[column].str.replace('[^\w\s]','')
    df[column] = df[column].str.replace('\d+', '')
    df[column] = df[column].str.replace('\n', '')
    df[column] = df[column].str.strip()
    
    return df

def set_device():
    """
    Set the device to use for training and inference.
    """
    print(f"PyTorch version: {torch.__version__}")

    # Check PyTorch has access to MPS (Metal Performance Shader, Apple's GPU architecture)
    print(f"Is MPS (Metal Performance Shader) built? {torch.backends.mps.is_built()}")
    print(f"Is MPS available? {torch.backends.mps.is_available()}")

    # Set the device      
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using device: {device}")

    return device

def set_deterministic():
    """
    Set deterministic behavior for reproducibility.
    """
    if torch.backends.cudnn.is_available():
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    elif torch.backends.mps.is_available():
        # Currently, PyTorch-Metal (MPS backend) does not provide a direct way to set deterministic behavior.
        pass
    print("Set deterministic behavior")
    

In [39]:
df_train = clean_data(train, 'text')
df_test = clean_data(test, 'text')

In [40]:
df_train = df_train.drop('location', axis=1, inplace=False)
df_test = df_test.drop('location', axis=1, inplace=False)

In [41]:
train_data, test_data = train_test_split(df_train, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

print(f"Traing examples: {train_data.shape[0]}")
print(f"Validation examples: {val_data.shape[0]}")
print(f"Testing examples: {test_data.shape[0]}")

Traing examples: 5481
Validation examples: 609
Testing examples: 1523


In [78]:
BATCH_SIZE = 64
MAX_LEN = 128

In [85]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

device = set_device()
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PyTorch version: 2.2.2
Is MPS (Metal Performance Shader) built? True
Is MPS available? True
Using device: mps


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [44]:
def tokenize(text): 
    """
    Tokenize the text data.
    """
    input_ids = []
    attention_masks = []
    
    for txt in text:
        encoded = tokenizer.encode_plus(
            txt,
            add_special_tokens=True,
            truncation=True,
            max_length=512,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    return input_ids, attention_masks


In [74]:
train_inputs, train_masks = tokenize(train_data['text'])
val_inputs, val_masks = tokenize(val_data['text'])
test_inputs, test_masks = tokenize(test_data['text'])


In [75]:
train_labels = torch.tensor(train_data['target'].values)
val_labels = torch.tensor(val_data['target'].values)
test_labels = torch.tensor(test_data['target'].values)

In [47]:
def create_data_loader(input_ids, attention_masks, labels, batch_size, train=True):
    """
    Create a DataLoader.
    """
    data = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)
    if train:
        sampler = torch.utils.data.RandomSampler(data)
    else:
        sampler = torch.utils.data.SequentialSampler(data)
        
    return torch.utils.data.DataLoader(data, sampler=sampler, batch_size=batch_size)

In [48]:
train_loader = create_data_loader(train_inputs, train_masks, train_labels, BATCH_SIZE, train=True)
val_loader = create_data_loader(val_inputs, val_masks, val_labels, BATCH_SIZE, train=False)
test_loader = create_data_loader(test_inputs, test_masks, test_labels, BATCH_SIZE, train=False)

In [84]:
def train(model, train_loader, optimizer, scheduler, device):
    """
    Train the model.
    """
    model.train()
    total_loss = 0
    for i, (input_ids, attention_masks, labels) in enumerate(train_loader):
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)
            
        optimizer.zero_grad()
            
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
            
        total_loss += loss.item()
            
            
        if i % 1 == 0:
            print(f"Iteration: {i}, Loss: {loss.item()}")
                
    avg_loss = total_loss / len(train_loader)
        
    return avg_loss



In [71]:
def test(model, test_loader, device):
    """
    Test the model.
    """
    model.eval()
    total_loss = 0
    with torch.inference_mode():
        for i, (input_ids, attention_masks, labels) in enumerate(test_loader):
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            labels = labels.to(device)
            
            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
            
            total_loss += loss.item()
        
        avg_loss = total_loss / len(test_loader)
    return avg_loss

In [54]:
def evaluate(model, eval_loader, device):
    """
    Evaluate the model.
    """
    model.eval()
    total_loss = 0
    total_preds = []
    
    with torch.no_grad():
        for i, (input_ids, attention_masks, labels) in enumerate(eval_loader):
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            labels = labels.to(device)
            
            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            total_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            total_preds.append(logits)
            
        
        avg_loss = total_loss / len(eval_loader)
        total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds

In [79]:
LEARNING_RATE = 0.001
GRADIENT_ACCUMULATION_STEPS = 1
MIXUP_ALPHA = 0.4
EPOCHS = 1

In [86]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_loader) * EPOCHS // GRADIENT_ACCUMULATION_STEPS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
criteria = torch.nn.CrossEntropyLoss()

for epoch in range(EPOCHS):
    best_val_loss = float('inf')
    
    train_loss = train(model, train_loader, optimizer, scheduler, device)
    val_loss, val_preds = evaluate(model, val_loader, device)
    
    print(f"Epoch: {epoch +1}, Training Loss: {train_loss}, Validation Loss: {val_loss}")
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        
        

Iteration: 0, Loss: 0.7074604630470276
Iteration: 1, Loss: 0.6612507104873657
Iteration: 2, Loss: 0.7150289416313171
Iteration: 3, Loss: 2.110337972640991
Iteration: 4, Loss: 0.6279337406158447
Iteration: 5, Loss: 1.4313682317733765
Iteration: 6, Loss: 0.7436719536781311
Iteration: 7, Loss: 0.9077308177947998
Iteration: 8, Loss: 0.5968892574310303
Iteration: 9, Loss: 0.8193389177322388
Iteration: 10, Loss: 0.6777430772781372
Iteration: 11, Loss: 0.6971752643585205
Iteration: 12, Loss: 0.7932015657424927
Iteration: 13, Loss: 0.7821317911148071
Iteration: 14, Loss: 0.7075538635253906
Iteration: 15, Loss: 0.7071996927261353
Iteration: 16, Loss: 0.7145175933837891
Iteration: 17, Loss: 0.6808634996414185
Iteration: 18, Loss: 0.7402410507202148
Iteration: 19, Loss: 0.6850036382675171
Iteration: 20, Loss: 0.6682379245758057
Iteration: 21, Loss: 0.6833858489990234
Iteration: 22, Loss: 0.7422881126403809
Iteration: 23, Loss: 0.7197451591491699
Iteration: 24, Loss: 0.684545636177063
Iteration: 2

KeyboardInterrupt: 