In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

path = "GoEmotions/data/relabeled_dataset.csv"

# Fixed seed for the random sub-sample taken further down, so that the saved
# CSV (and anything trained on it) is identical on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

df = pd.read_csv(path, usecols=["text", "id", "author", "subreddit", "parent_id", "created_utc", "label"])
# Drop the first three characters of parent_id so it can be compared with `id`.
# NOTE(review): presumably this strips a Reddit-style "t1_"/"t3_" prefix -- confirm against the raw data.
df['parent_id'] = df['parent_id'].str[3:]

# Self-join: every row pairs a comment (`_child`) with the row whose `id` equals
# its parent_id (`_parent`). The default inner join drops comments whose parent
# is not present in the dataset.
merged = df.merge(df, left_on='parent_id', right_on='id', suffixes=('_child', '_parent'))

# One example per (parent, child) pair: the input is the parent's text and the
# binary target is 1 when the child's label is 3, otherwise 0.
result = pd.DataFrame({
    'parent_text': merged['text_parent'],
    'label': (merged['label_child'] == 3).astype(int)
})

# Additional examples taken from the comments themselves: keep only rows
# labelled 1 or 3 and binarise the same way (3 -> 1, 1 -> 0).
parents_only = df[df['label'].isin([1, 3])]

parents_only_labels = (parents_only['label'] == 3).astype(int)

# `rand` is a uniform [0, 1) draw per row from the seeded generator; it is used
# only to pick the sub-sample below.
parents_only = parents_only.assign(
    label=parents_only_labels,
    rand=rng.random(len(parents_only))
)

# Keep each row independently with probability 0.1 (roughly 10% of the rows).
parents_sampled = parents_only[parents_only['rand'] < 0.1]

parents_df = pd.DataFrame({
    'parent_text': parents_sampled['text'],
    'label': parents_sampled['label']
})

# Final dataset = parent/child pairs followed by the sampled single comments.
result = pd.concat([result, parents_df], ignore_index=True)

print(result.head())
result.to_csv("parent_child_labels.csv", index=False)
print("Saved to parent_child_labels.csv")

                                         parent_text  label
0  Ah, then apologies for my ignorance, and thank...      0
1  Given the current state of our relations with ...      1
2  From my experience, doctors are generally unde...      1
3                       It will be too late by then.      1
4  Sadly this story has died down why is he bring...      0
Saved to parent_child_labels.csv


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import os

# Switch off the Weights & Biases logging integration through its environment flag.
os.environ["WANDB_DISABLED"] = "true"

# The tokenizer and the 2-class classifier both start from the same pretrained checkpoint.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=2)

class ParentDataset(Dataset):
    """Torch dataset that pairs pre-tokenized texts with their labels.

    Every text is tokenized once, at construction time, with the module-level
    ``tokenizer`` (``truncation=True``, ``padding=True``, ``max_length=128``).
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` in a single call and keep ``labels`` alongside.

        Args:
            texts: the texts to encode.
            labels: one label per text, indexable by position.
        """
        self.labels = labels
        self.encodings = tokenizer(texts, max_length=128, truncation=True, padding=True)

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per tokenizer output field plus the label
        under the key ``'labels'``.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Pull the model inputs and the targets out of the frame as plain Python lists.
texts, labels = result['parent_text'].tolist(), result['label'].tolist()

# Hold out 20% for validation; the split is stratified on the label and
# reproducible through the fixed random_state.
split = train_test_split(texts, labels, stratify=labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = split

# Tokenize each partition once, up front.
train_dataset = ParentDataset(train_texts, train_labels)
val_dataset = ParentDataset(val_texts, val_labels)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: a pair ``(logits, labels)``. The predicted class for each
            row is the argmax of ``logits`` along axis 1.

    Returns:
        A dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``, each computed from the labels and the predicted classes.
    """
    scores, targets = eval_pred
    predicted = scores.argmax(axis=1)
    # (metric name, scoring function) pairs, evaluated in this order.
    scorers = (
        ('accuracy', accuracy_score),
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    return {name: scorer(targets, predicted) for name, scorer in scorers}

# Training config with eval
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    warmup_steps=100,
    weight_decay=0.1,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and checkpoint once per epoch so that the best checkpoint
    # (highest validation F1) can be restored when training finishes.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=1e-6,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    # Keep at most two checkpoints under output_dir.
    save_total_limit=2,
    # Disable external logging integrations explicitly. The WANDB_DISABLED
    # environment variable is deprecated (the library itself warns about it
    # and recommends `report_to`), so do not rely on it alone.
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

# Persist the model held by the trainer (the best checkpoint, because
# load_best_model_at_end=True) together with its tokenizer, so that
# './parent_model' can be loaded on its own.
trainer.save_model('./parent_model')
tokenizer.save_pretrained('./parent_model')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6722,0.647854,0.605166,0.75174,0.60223,1.0
2,0.5786,0.57315,0.731857,0.798893,0.72408,0.890947
3,0.4481,0.530524,0.745387,0.797258,0.760748,0.837449
4,0.5477,0.514421,0.751538,0.793456,0.788618,0.798354
5,0.4173,0.509477,0.753998,0.791667,0.801688,0.781893
6,0.6029,0.506698,0.762608,0.803662,0.794769,0.812757
7,0.3952,0.507856,0.758918,0.797938,0.799587,0.796296
8,0.4427,0.507045,0.765068,0.806091,0.795591,0.816872
9,0.4582,0.507734,0.762608,0.803262,0.79596,0.8107
10,0.4094,0.508069,0.765068,0.805697,0.796781,0.814815


('./parent_model/tokenizer_config.json',
 './parent_model/special_tokens_map.json',
 './parent_model/vocab.txt',
 './parent_model/added_tokens.json')