In [14]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm import tqdm

# Load dataset: read the CSV, keep only the text and label columns, and drop
# rows where either of them is missing.
file_path = 'C:/Karan program/Movie/Flood_Response/train.csv'  # Upload this in Colab
df = pd.read_csv(file_path, encoding='latin-1')[['text', 'choose_one']].dropna()

# Binary target: 1 for rows labelled 'Relevant', 0 for every other label value.
df['choose_one'] = (df['choose_one'] == 'Relevant').astype('int64')

# Split data: hold out 20% of the rows for testing; the fixed seed keeps the
# split reproducible between runs.
features, targets = df['text'], df['choose_one']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# Load BERT tokenizer (same checkpoint name as the model loaded further down).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text
max_length = 128  # upper bound on tokens per example; longer inputs are truncated
def tokenize_text(texts):
    """Tokenize an iterable of strings with the module-level ``tokenizer``.

    Args:
        texts: Any iterable of strings (e.g. a pandas Series); it is
            materialised into a list before being passed to the tokenizer.

    Returns:
        The tokenizer's batch encoding with PyTorch tensors
        (``return_tensors='pt'``). Padding and truncation are enabled, with
        truncation capped at ``max_length`` tokens. The rest of this script
        reads its ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    batch = list(texts)
    encoded = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return encoded

# Encode each split separately with the shared tokenizer helper.
train_encodings, test_encodings = (
    tokenize_text(split) for split in (X_train, X_test)
)

# Convert labels to tensors (long dtype for use as class indices).
y_train_tensor, y_test_tensor = (
    torch.tensor(split_labels.values, dtype=torch.long)
    for split_labels in (y_train, y_test)
)

# Create DataLoader
batch_size = 16

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    y_train_tensor,
)
test_data = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    y_test_tensor,
)

# Only the training batches are shuffled; the test loader keeps dataset order.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size)

# Load BERT model with a 2-class sequence-classification head and move it to
# the GPU when one is available, otherwise stay on the CPU.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# The number of epochs is defined once, before the scheduler, so that the
# learning-rate schedule and the training loop always agree on the total number
# of optimizer steps (previously the scheduler hard-coded "* 3").
epochs = 3
num_training_steps = len(train_loader) * epochs

# Optimizer & Scheduler: linear schedule with no warmup over the whole run.
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Training loop
# NOTE: loss_fn is not used below -- the loss is read from `outputs.loss`,
# which the model returns because `labels` is passed to it. The name is kept
# only so that any code relying on it still finds it.
loss_fn = torch.nn.CrossEntropyLoss()
model.train()
for epoch in range(epochs):
    loop = tqdm(train_loader, leave=True)
    # The description is constant for the whole epoch, so set it once here
    # rather than on every batch.
    loop.set_description(f"Epoch {epoch+1}")
    for batch in loop:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # stepped per batch, matching num_training_steps above
        loop.set_postfix(loss=loss.item())

# Evaluation: switch the model to eval mode and collect the predicted class
# and the reference label for every test example, with gradients disabled.
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        logits = model(input_ids, attention_mask=attention_mask).logits
        # Predicted class = index of the largest logit in each row.
        preds = logits.argmax(dim=1).cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

# Compute Metrics: every scorer is called as scorer(y_true, y_pred).
accuracy, precision, recall, f1 = (
    scorer(true_labels, predictions)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Save Model: write the model's state dict (not the whole model object) to a
# file in the current working directory.
checkpoint_path = 'bert_disaster_model.pth'
trained_weights = model.state_dict()
torch.save(trained_weights, checkpoint_path)
print("Model Training & Evaluation Complete! 🚀")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 544/544 [1:14:04<00:00,  8.17s/it, loss=0.549]
Epoch 2: 100%|██████████| 544/544 [1:35:10<00:00, 10.50s/it, loss=0.588]   
Epoch 3: 100%|██████████| 544/544 [3:04:15<00:00, 20.32s/it, loss=0.0755]      


Accuracy: 0.8364
Precision: 0.8214
Recall: 0.8025
F1 Score: 0.8118
Model Training & Evaluation Complete! 🚀
