In [26]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from data_cleaning_import import clean_create_vectors
import numpy as np
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [27]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

df = pd.read_csv("data.csv")
df = clean_create_vectors(df)

X = df["journal"].tolist()
y = df.drop(columns=["journal", "emotion_vectors", "activity_vectors"]).astype(int).values
label_names = df.drop(columns=["journal", "emotion_vectors", "activity_vectors"]).columns.tolist()
num_labels = len(label_names)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

class LemotifDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, truncation=True, padding='max_length', 
                                  max_length=max_length, return_tensors='pt')
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __getitem__(self, idx):
        item = {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = LemotifDataset(X_train, y_train, tokenizer)
test_dataset = LemotifDataset(X_test, y_test, tokenizer)

batch_size = 8
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    problem_type="multi_label_classification"
)
model.to(device)


optimizer = optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 4
loss_fn = nn.BCEWithLogitsLoss()


def train_epoch(model, dataloader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)


def evaluate(model, dataloader, loss_fn, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            loss = loss_fn(logits, labels)
            total_loss += loss.item()
            
            preds = torch.sigmoid(logits) > 0.5
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
    
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)
    
    return {
        'loss': total_loss / len(dataloader),
        'f1_micro': f1_score(all_labels, all_preds, average='micro'),
        'f1_macro': f1_score(all_labels, all_preds, average='macro')
    }


best_f1 = 0
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    

    train_loss = train_epoch(model, train_dataloader, optimizer, loss_fn, device)
    print(f"Training Loss: {train_loss:.4f}")
    

    eval_results = evaluate(model, test_dataloader, loss_fn, device)
    print(f"Eval Loss: {eval_results['loss']:.4f}")
    print(f"F1 Micro: {eval_results['f1_micro']:.4f}")
    print(f"F1 Macro: {eval_results['f1_macro']:.4f}")
    

    if eval_results['f1_micro'] > best_f1:
        best_f1 = eval_results['f1_micro']
        torch.save(model.state_dict(), "./best_model.pt")
        print("✓ New best model saved!")


model.load_state_dict(torch.load("./best_model.pt"))
model.eval()


all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        preds = torch.sigmoid(logits) > 0.5
        all_preds.append(preds.cpu().numpy())
        all_labels.append(labels.numpy())

all_preds = np.vstack(all_preds)
all_labels = np.vstack(all_labels)

print("\n Classification Report:\n")
print(classification_report(all_labels, all_preds, target_names=label_names, zero_division=0))

Using device: cpu


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/4


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 148/148 [06:00<00:00,  2.44s/it]


Training Loss: 0.3925


Evaluating: 100%|██████████| 37/37 [00:17<00:00,  2.16it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Eval Loss: 0.2788
F1 Micro: 0.1510
F1 Macro: 0.0205
✓ New best model saved!

Epoch 2/4


Training: 100%|██████████| 148/148 [05:55<00:00,  2.40s/it]


Training Loss: 0.2670


Evaluating: 100%|██████████| 37/37 [00:17<00:00,  2.08it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Eval Loss: 0.2393
F1 Micro: 0.2077
F1 Macro: 0.0610
✓ New best model saved!

Epoch 3/4


Training: 100%|██████████| 148/148 [05:47<00:00,  2.35s/it]


Training Loss: 0.2243


Evaluating: 100%|██████████| 37/37 [00:17<00:00,  2.12it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Eval Loss: 0.2036
F1 Micro: 0.3959
F1 Macro: 0.1578
✓ New best model saved!

Epoch 4/4


Training: 100%|██████████| 148/148 [06:15<00:00,  2.53s/it]


Training Loss: 0.1899


Evaluating: 100%|██████████| 37/37 [00:17<00:00,  2.16it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Eval Loss: 0.1832
F1 Micro: 0.5323
F1 Macro: 0.2736
✓ New best model saved!

🔍 Classification Report:

              precision    recall  f1-score   support

      afraid       0.00      0.00      0.00         2
       angry       0.00      0.00      0.00         5
     anxious       0.50      0.04      0.08        23
     ashamed       0.00      0.00      0.00         4
     awkward       0.00      0.00      0.00         4
       bored       0.00      0.00      0.00        15
        calm       0.75      0.10      0.18        87
    confused       0.00      0.00      0.00         7
   disgusted       0.00      0.00      0.00         3
     excited       0.00      0.00      0.00        46
  frustrated       0.60      0.21      0.32        28
       happy       0.73      0.73      0.73       153
     jealous       0.00      0.00      0.00         0
   nostalgic       0.00      0.00      0.00         6
       proud       0.82      0.14      0.23        66
         sad       0.00      0.0

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


df = pd.read_csv("data.csv")
df = clean_create_vectors(df)

X = df["journal"].tolist()
y = df.drop(columns=["journal", "emotion_vectors", "activity_vectors"]).astype(int).values
label_names = df.drop(columns=["journal", "emotion_vectors", "activity_vectors"]).columns.tolist()
num_labels = len(label_names)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

class LemotifDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, truncation=True, padding='max_length', 
                                  max_length=max_length, return_tensors='pt')
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __getitem__(self, idx):
        item = {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = LemotifDataset(X_train, y_train, tokenizer)
test_dataset = LemotifDataset(X_test, y_test, tokenizer)

batch_size = 8
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    problem_type="multi_label_classification"
)
model.to(device)

# Calculate class weights to address class imbalance
class_weights = []
for i in range(num_labels):
    
    y_i = y_train[:, i]
    if len(np.unique(y_i)) > 1:  
        weights = compute_class_weight('balanced', classes=np.unique(y_i), y=y_i)
        class_weights.append(weights[1] if len(weights) > 1 else 1.0)
    else:
        class_weights.append(1.0)

class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
print("Class weights:", class_weights)

# Changed training parameters 
learning_rate = 1e-5  # Lower learning rate 
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
num_epochs = 10  # Increased number of epochs
loss_fn = nn.BCEWithLogitsLoss(pos_weight=class_weights)  # Using class weights in loss

# Added learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1)


def train_epoch(model, dataloader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)


def evaluate(model, dataloader, loss_fn, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            loss = loss_fn(logits, labels)
            total_loss += loss.item()
            
            
            preds = torch.sigmoid(logits) > 0.5
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
    
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)
    
    return {
        'loss': total_loss / len(dataloader),
        'f1_micro': f1_score(all_labels, all_preds, average='micro'),
        'f1_macro': f1_score(all_labels, all_preds, average='macro'),
        'f1_weighted': f1_score(all_labels, all_preds, average='weighted'),
        'preds': all_preds,
        'labels': all_labels
    }


best_f1 = 0
best_epoch = 0
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    
   
    train_loss = train_epoch(model, train_dataloader, optimizer, loss_fn, device)
    print(f"Training Loss: {train_loss:.4f}")
    
 
    eval_results = evaluate(model, test_dataloader, loss_fn, device)
    print(f"Eval Loss: {eval_results['loss']:.4f}")
    print(f"F1 Micro: {eval_results['f1_micro']:.4f}")
    print(f"F1 Macro: {eval_results['f1_macro']:.4f}")
    print(f"F1 Weighted: {eval_results['f1_weighted']:.4f}")
    
    
    old_lr = optimizer.param_groups[0]['lr']
    scheduler.step(eval_results['f1_micro'])
    new_lr = optimizer.param_groups[0]['lr']
    if new_lr != old_lr:
        print(f"Learning rate changed from {old_lr} to {new_lr}")
    
  
    if eval_results['f1_micro'] > best_f1:
        best_f1 = eval_results['f1_micro']
        best_epoch = epoch + 1
        torch.save(model.state_dict(), "./best_model.pt")
        print("✓ New best model saved!")
    
    # Early stopping 
    if epoch - best_epoch >= 3:  
        print("Early stopping triggered")
        break

print(f"\nBest model was from epoch {best_epoch} with F1 micro of {best_f1:.4f}")


model.load_state_dict(torch.load("./best_model.pt"))
model.eval()


all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        preds = torch.sigmoid(logits) > 0.5
        all_preds.append(preds.cpu().numpy())
        all_labels.append(labels.numpy())

all_preds = np.vstack(all_preds)
all_labels = np.vstack(all_labels)

print("\n Classification Report:\n")
print(classification_report(all_labels, all_preds, target_names=label_names, zero_division=0))


class_correct = (all_preds == all_labels).sum(axis=0)
class_total = all_labels.shape[0]
class_accuracy = class_correct / class_total

print("\n Per-Class Accuracy:")
for i, label in enumerate(label_names):
    print(f"{label}: {class_accuracy[i]:.4f}")


torch.save({
    'model_state_dict': model.state_dict(),
    'tokenizer': tokenizer,
    'label_names': label_names,
    'num_labels': num_labels
}, "emotion_classification_model_complete.pt")
print("\n Complete model saved to 'emotion_classification_model_complete.pt'")

Using device: cpu


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class weights: tensor([ 36.8125,  25.6087,   5.7745,  45.3077,  53.5455,  17.3235,   2.0961,
         28.0476,  31.0000,   2.8732,   5.2124,   1.0208, 196.3333,  10.7091,
          2.1734,  15.9189,   1.2612,  10.9074,   3.9007,   2.6532,   3.6358,
          7.3625,  12.2708,   8.0685,  10.3333,   8.0685,  32.7222,   5.6635,
          3.1330])

Epoch 1/10


Training: 100%|██████████| 148/148 [05:58<00:00,  2.42s/it]


Training Loss: 0.9256


Evaluating: 100%|██████████| 37/37 [00:17<00:00,  2.13it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Eval Loss: 0.8606
F1 Micro: 0.2342
F1 Macro: 0.0727
F1 Weighted: 0.1628
✓ New best model saved!

Epoch 2/10


Training: 100%|██████████| 148/148 [06:01<00:00,  2.44s/it]


Training Loss: 0.8280


Evaluating: 100%|██████████| 37/37 [00:17<00:00,  2.11it/s]


Eval Loss: 0.7630
F1 Micro: 0.3597
F1 Macro: 0.2584
F1 Weighted: 0.3444
✓ New best model saved!

Epoch 3/10


Training: 100%|██████████| 148/148 [05:55<00:00,  2.40s/it]


Training Loss: 0.7311


Evaluating: 100%|██████████| 37/37 [00:17<00:00,  2.17it/s]


Eval Loss: 0.6924
F1 Micro: 0.4524
F1 Macro: 0.3543
F1 Weighted: 0.4482
✓ New best model saved!

Epoch 4/10


Training: 100%|██████████| 148/148 [05:51<00:00,  2.37s/it]


Training Loss: 0.6510


Evaluating: 100%|██████████| 37/37 [00:17<00:00,  2.11it/s]


Eval Loss: 0.6503
F1 Micro: 0.4894
F1 Macro: 0.3901
F1 Weighted: 0.4958
✓ New best model saved!

Epoch 5/10


Training: 100%|██████████| 148/148 [05:45<00:00,  2.34s/it]


Training Loss: 0.5827


Evaluating: 100%|██████████| 37/37 [00:16<00:00,  2.21it/s]


Eval Loss: 0.6133
F1 Micro: 0.5065
F1 Macro: 0.4089
F1 Weighted: 0.5452
✓ New best model saved!

Epoch 6/10


Training: 100%|██████████| 148/148 [05:37<00:00,  2.28s/it]


Training Loss: 0.5268


Evaluating: 100%|██████████| 37/37 [00:16<00:00,  2.19it/s]


Eval Loss: 0.5710
F1 Micro: 0.5235
F1 Macro: 0.4289
F1 Weighted: 0.5667
✓ New best model saved!

Epoch 7/10


Training: 100%|██████████| 148/148 [05:48<00:00,  2.36s/it]


Training Loss: 0.4828


Evaluating: 100%|██████████| 37/37 [00:17<00:00,  2.16it/s]


Eval Loss: 0.5589
F1 Micro: 0.5423
F1 Macro: 0.4338
F1 Weighted: 0.5782
✓ New best model saved!

Epoch 8/10


Training: 100%|██████████| 148/148 [05:50<00:00,  2.37s/it]


Training Loss: 0.4412


Evaluating: 100%|██████████| 37/37 [00:18<00:00,  2.03it/s]


Eval Loss: 0.5357
F1 Micro: 0.5512
F1 Macro: 0.4624
F1 Weighted: 0.5802
✓ New best model saved!

Epoch 9/10


Training: 100%|██████████| 148/148 [05:56<00:00,  2.41s/it]


Training Loss: 0.4043


Evaluating: 100%|██████████| 37/37 [00:17<00:00,  2.14it/s]


Eval Loss: 0.5334
F1 Micro: 0.5552
F1 Macro: 0.4585
F1 Weighted: 0.6006
✓ New best model saved!

Epoch 10/10


Training: 100%|██████████| 148/148 [06:00<00:00,  2.44s/it]


Training Loss: 0.3744


Evaluating: 100%|██████████| 37/37 [00:17<00:00,  2.10it/s]


Eval Loss: 0.5491
F1 Micro: 0.5856
F1 Macro: 0.4784
F1 Weighted: 0.6143
✓ New best model saved!

Best model was from epoch 10 with F1 micro of 0.5856

🔍 Classification Report:

              precision    recall  f1-score   support

      afraid       0.09      1.00      0.16         2
       angry       0.12      0.60      0.20         5
     anxious       0.42      0.74      0.54        23
     ashamed       0.14      0.75      0.23         4
     awkward       0.07      0.25      0.11         4
       bored       0.32      0.53      0.40        15
        calm       0.54      0.49      0.52        87
    confused       0.19      0.57      0.29         7
   disgusted       0.04      0.33      0.08         3
     excited       0.33      0.17      0.23        46
  frustrated       0.46      0.68      0.55        28
       happy       0.74      0.69      0.71       153
     jealous       0.00      0.00      0.00         0
   nostalgic       0.16      0.50      0.24         6
       proud