In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import numpy as np
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load the raw table and pass it through the preprocessing helper
# (clean_create_vectors is defined outside this cell).
df = clean_create_vectors(pd.read_csv("data.csv"))

# Every column except the text column and the two vector columns is a label column.
non_label_columns = ["journal", "emotion_vectors", "activity_vectors"]
label_frame = df.drop(columns=non_label_columns)

# Only the leading `num_labels_to_keep` label columns are used as targets.
num_labels_to_keep = 18
label_names = label_frame.columns.tolist()[:num_labels_to_keep]
num_labels = len(label_names)

X = df["journal"].tolist()
y = label_frame.astype(int).values[:, :num_labels_to_keep]

# Fixed seed so the 80/20 split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

class LemotifDataset(Dataset):
    """Dataset that pairs pre-tokenized texts with float label tensors.

    All texts are tokenized in a single call inside the constructor
    (truncated and padded to ``max_length``), so indexing only slices
    tensors that already exist.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        """Tokenize ``texts`` up front and store ``labels`` as a float tensor.

        Args:
            texts: Input texts handed to ``tokenizer`` in one call.
            labels: Per-text label rows; converted with ``torch.tensor``.
            tokenizer: Callable returning a mapping that contains
                ``'input_ids'`` and ``'attention_mask'``.
            max_length: Length every sequence is truncated/padded to.
        """
        tokenizer_options = {
            'truncation': True,
            'padding': 'max_length',
            'max_length': max_length,
            'return_tensors': 'pt',
        }
        self.encodings = tokenizer(texts, **tokenizer_options)
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        """Return the number of label rows."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded inputs and labels for position ``idx``."""
        sample = {
            key: self.encodings[key][idx]
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = self.labels[idx]
        return sample


train_dataset = LemotifDataset(X_train, y_train, tokenizer)
test_dataset = LemotifDataset(X_test, y_test, tokenizer)

batch_size = 8

# Only the training loader is shuffled; the evaluation order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    problem_type="multi_label_classification"
)
model.to(device)

# Per-label positive weight for BCEWithLogitsLoss. `pos_weight` scales only the
# positive term (negatives keep weight 1), so the balancing value is
# n_negative / n_positive for each label. Labels that are all-positive or
# all-negative in the training split keep a neutral weight of 1.0.
# NOTE(review): assumes y_train holds binary 0/1 indicators — confirm.
positive_counts = np.count_nonzero(y_train, axis=0)
negative_counts = y_train.shape[0] - positive_counts
has_both_classes = (positive_counts > 0) & (negative_counts > 0)

class_weights = np.ones(y_train.shape[1], dtype=np.float64)
class_weights[has_both_classes] = (
    negative_counts[has_both_classes] / positive_counts[has_both_classes]
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

print("Class weights:", class_weights)

# Optimisation settings.
learning_rate = 2e-5
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
num_epochs = 10
loss_fn = nn.BCEWithLogitsLoss(pos_weight=class_weights)

# mode='max' because the scheduler is stepped with an F1 score (higher is
# better); the learning rate is halved after more than one epoch without
# improvement.
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1)

def train_epoch(model, dataloader, optimizer, loss_fn, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its output must expose ``.logits``.
        dataloader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable applied as ``loss_fn(logits, labels)``.
        device: Device each batch tensor is moved to.

    Returns:
        Sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    running_loss = 0

    progress = tqdm(dataloader, desc="Training")
    for batch in progress:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        # Forward pass; the loss is computed here rather than by the model.
        batch_logits = model(input_ids=ids, attention_mask=mask).logits
        batch_loss = loss_fn(batch_logits, targets)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    num_batches = len(dataloader)
    return running_loss / num_batches


def evaluate(model, dataloader, loss_fn, device):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Predictions are obtained by thresholding the sigmoid of the logits at 0.5.
    Predictions and labels are truncated to the first ``num_labels_to_keep``
    columns (a module-level setting) before the F1 scores are computed.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its output must expose ``.logits``.
        dataloader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        loss_fn: Callable applied as ``loss_fn(logits, labels)``.
        device: Device each batch tensor is moved to.

    Returns:
        dict with the keys ``'loss'`` (mean per-batch loss), ``'f1_micro'``,
        ``'f1_macro'``, ``'f1_weighted'``, ``'preds'`` (stacked boolean
        predictions) and ``'labels'`` (stacked label rows).
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits, labels)
            total_loss += loss.item()

            # A label is predicted when its sigmoid probability exceeds 0.5.
            preds = torch.sigmoid(logits) > 0.5
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    all_preds = np.vstack(all_preds)[:, :num_labels_to_keep]
    all_labels = np.vstack(all_labels)[:, :num_labels_to_keep]

    # zero_division=0 is passed to every average so that labels with no
    # predicted or true positives score 0 without emitting a warning each epoch.
    return {
        'loss': total_loss / len(dataloader),
        'f1_micro': f1_score(all_labels, all_preds, average='micro', zero_division=0),
        'f1_macro': f1_score(all_labels, all_preds, average='macro', zero_division=0),
        'f1_weighted': f1_score(all_labels, all_preds, average='weighted', zero_division=0),
        'preds': all_preds,
        'labels': all_labels
    }


# Start below any reachable F1 so the first epoch always writes a checkpoint;
# otherwise a run whose F1 stays at 0 would leave no best_model.pt on disk.
best_f1 = float('-inf')
best_epoch = 0  # 1-based number of the epoch that produced best_f1
early_stopping_patience = 3  # epochs without improvement before stopping

# NOTE(review): model selection and early stopping use test_dataloader, the
# same split that is reported on afterwards — consider a separate validation split.
for epoch in range(num_epochs):
    current_epoch = epoch + 1
    print(f"\nEpoch {current_epoch}/{num_epochs}")

    train_loss = train_epoch(model, train_dataloader, optimizer, loss_fn, device)
    print(f"Training Loss: {train_loss:.4f}")

    eval_results = evaluate(model, test_dataloader, loss_fn, device)
    print(f"Eval Loss: {eval_results['loss']:.4f}")
    print(f"F1 Micro: {eval_results['f1_micro']:.4f}")
    print(f"F1 Macro: {eval_results['f1_macro']:.4f}")
    print(f"F1 Weighted: {eval_results['f1_weighted']:.4f}")

    # The scheduler tracks micro-F1; report whenever it lowers the learning rate.
    old_lr = optimizer.param_groups[0]['lr']
    scheduler.step(eval_results['f1_micro'])
    new_lr = optimizer.param_groups[0]['lr']
    if new_lr != old_lr:
        print(f"Learning rate changed from {old_lr} to {new_lr}")

    # Keep only the weights of the best-scoring epoch on disk.
    if eval_results['f1_micro'] > best_f1:
        best_f1 = eval_results['f1_micro']
        best_epoch = current_epoch
        torch.save(model.state_dict(), "./best_model.pt")
        print("✓ New best model saved!")

    # Early stopping: both sides are 1-based epoch numbers, so this fires
    # after exactly `early_stopping_patience` epochs without a new best.
    if current_epoch - best_epoch >= early_stopping_patience:
        print("Early stopping triggered")
        break

print(f"\nBest model was from epoch {best_epoch} with F1 micro of {best_f1:.4f}")

# Restore the best checkpoint. map_location places the saved tensors on the
# device currently in use, so the load does not depend on where they were saved.
model.load_state_dict(torch.load("./best_model.pt", map_location=device))
model.eval()

# Reuse evaluate() rather than repeating its inference loop; it returns the
# stacked predictions and labels under 'preds' and 'labels'.
final_results = evaluate(model, test_dataloader, loss_fn, device)
all_preds = final_results['preds']
all_labels = final_results['labels']

print("\n Classification Report:\n")
print(classification_report(
    all_labels,
    all_preds,
    target_names=label_names,
    zero_division=0
))