# Notebook for finetuning BERT

### The notebook is split up into the following sections:
1. Clean data
2. Split dataset into 80/20 split and tokenize
3. Finetune BERT
4. Evaluation metrics

In [2]:
# install packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
from scipy.stats import ttest_ind
import torch
from sklearn.model_selection import train_test_split
import re
from collections import Counter

# 1. Clean data

In [3]:
# loading in dataset
df = pd.read_csv("../Scraping/clean_reddit.csv")

In [4]:
# count
df["subreddit"].value_counts()

subreddit
CasualConversation    4044
depression            3864
Name: count, dtype: int64

### lets do a quick pronoun count

In [7]:
# First-person singular pronouns
FIRST_PERSON_PRONOUNS = {"i", "me", "my", "mine", "myself"}

# Count total number of first-person pronouns in a single post
def count_first_person(text):
    if not isinstance(text, str):
        return 0
    tokens = re.findall(r"\b\w+\b", text.lower())
    return sum(1 for t in tokens if t in FIRST_PERSON_PRONOUNS)

# Count each individual pronoun
def count_each_pronoun(text):
    if not isinstance(text, str):
        return Counter()
    tokens = re.findall(r"\b\w+\b", text.lower())
    return Counter(t for t in tokens if t in FIRST_PERSON_PRONOUNS)

# split by subreddit
df_dep = df[df["subreddit"] == "depression"]
df_ctrl = df[df["subreddit"] == "CasualConversation"]

# total and per-post counts 
def analyze_group(df_group, name):
    total_counter = Counter()
    per_post_counts = []

    for text in df_group["full_text"]:
        per_post_counts.append(count_first_person(text))
        total_counter.update(count_each_pronoun(text))

    print(f"\n========== {name.upper()} ==========")
    print("Total pronoun counts:", total_counter)
    print("Total posts:", len(df_group))
    print("Total pronouns:", sum(per_post_counts))
    print("Average per post:", sum(per_post_counts) / len(per_post_counts))
    print("Median per post:", sorted(per_post_counts)[len(per_post_counts)//2])

# Run analysis for both groups
analyze_group(df_dep, "Depression")
analyze_group(df_ctrl, "CasualConversation")


Total pronoun counts: Counter({'i': 91360, 'my': 25372, 'me': 15784, 'myself': 4715, 'mine': 142})
Total posts: 3864
Total pronouns: 137373
Average per post: 35.55201863354037
Median per post: 26

Total pronoun counts: Counter({'i': 28393, 'my': 8376, 'me': 4450, 'myself': 713, 'mine': 154})
Total posts: 4044
Total pronouns: 42086
Average per post: 10.40702274975272
Median per post: 7


### Lets look at the length of the posts from the dataset from r/depression

In [14]:
df_dep["char_len"] = df_dep["full_text"].str.len()
df_dep["char_len"].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dep["char_len"] = df_dep["full_text"].str.len()


count     3864.000000
mean      1543.149845
std       1548.748249
min         43.000000
25%        635.750000
50%       1103.000000
75%       1895.250000
max      18017.000000
Name: char_len, dtype: float64

### Lets look at the length of the posts from the dataset from r/CasualConversation

In [15]:
df_ctrl["char_len"] = df_ctrl["full_text"].str.len()
df_ctrl["char_len"].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ctrl["char_len"] = df_ctrl["full_text"].str.len()


count    4044.000000
mean      748.936944
std       667.918083
min        80.000000
25%       347.000000
50%       542.000000
75%       915.000000
max      8676.000000
Name: char_len, dtype: float64

### Lets look at the max length from both datasets

In [16]:
idx = df_dep["full_text"].apply(count_first_person).nlargest(1).index
df_dep.loc[idx, "full_text"].values

array(['I\'m 18 and my life is a mess of regret. I need to get this off my chest...\n\nI don\'t even know where to begin. I\'m writing this because my mental health is in the gutter and I need to just put it all out there. Maybe someone can make sense of it, because I can\'t.\n\nI was born in April 2006 in Lansing, Michigan. I\'m a guy. Ever since I was young, I have always been into girls. Like, when I was in elementary school I used to, oh god... this is embarrassing... I would pretend I dropped something like a pencil or an eraser look under the table I was sat at to see under the girls\' skirts in my class. I know, it\'s disgusting.\n\nI grew up and had a good childhood. I was a chubby kid back then. Now, when I started highschool, I used to have no discipline and no sense of hygiene. I didn\'t take showersâ€” only sometimes - like once a week or once every two weeks.\n\nI woke up, went to school, and didn\'t use the toilets in school since I hated them and thought they were disgus

In [17]:
idx = df_ctrl["full_text"].apply(count_first_person).nlargest(1).index
df_ctrl.loc[idx, "full_text"].values

array(['Tips on how to find hobbies as a 22 year old female, plus me spilling other mumbo jumbo!\n\nHey yall, bit of a random post to make but it feels necessary and I need advice. I often feel stuck with my free time because I donâ€™t have any time consuming hobbies. Sure, I enjoy to be crafty and creative and do set aside time to do things I enjoy such as making jewelry, thrifting, doing my makeup routine every morning, drawing, baking/cooking, etc. Butâ€¦ this stuff doesnâ€™t isolate a large portion of my time everyday. Iâ€™m a very social motivated person, and my favorite thing to do is spend time with my finance and my friends. Itâ€™s hard because we donâ€™t live together yet, so the concept of being within each otherâ€™s company even whilst doing separate things isnâ€™t a reality yet. Aside from work, we spend almost all our free time together, which I absolutely adore. \n\nWhen I was younger, I really enjoyed video games. My fiancÃ© (21m) still loves to play video games in his f

### I only keep relevant columns such as:

- ID
- full_text
- label

In [18]:
cols_to_drop = [
    "title", "selftext", "subreddit", "author",
    "created_utc", "created_dt", "score", "num_comments",
    "url", "link_flair_text", "source", "matched_keyword"
]

df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])

In [19]:
# lets take a look at the final structure
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7908 entries, 0 to 7907
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         7908 non-null   object
 1   full_text  7908 non-null   object
 2   label      7908 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 185.5+ KB


# 3, Split data into 80/20 and tokenize

In [20]:
train_df, val_df = train_test_split(
    df,
    test_size=0.2,        # 20% validation
    stratify=df["label"], # mixing lavels
    random_state=42      
)

print("Train size:", len(train_df))
print("Validation size:", len(val_df))

Train size: 6326
Validation size: 1582


# Lets tokenize

In [8]:
# Loading in tokenizer
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [15]:
texts = train_df["full_text"].tolist()

enc = tokenizer(
    texts,
    padding=False,       # true lengths, no padding
    truncation=False,    # don't cut, we want full length
    add_special_tokens=True,
)

lengths = [len(ids) for ids in enc["input_ids"]]

print("Min length:", np.min(lengths))
print("Max length:", np.max(lengths))
print("Mean length:", np.mean(lengths))

print("\nPercentiles:")
for q in [50, 75, 90, 95, 99]:
    print(f"{q}th:", np.percentile(lengths, q))

# How many exceed BERT's 512-token limit?
over_512 = sum(l > 512 for l in lengths)
print("\n>512 tokens:", over_512, "out of", len(lengths),
      f"({over_512 / len(lengths) * 100:.2f}%)")


Token indices sequence length is longer than the specified maximum sequence length for this model (2401 > 512). Running this sequence through the model will result in indexing errors


Min length: 9
Max length: 4401
Mean length: 268.90756704980845

Percentiles:
50th: 181.0
75th: 326.0
90th: 549.0
95th: 777.8499999999995
99th: 1459.9599999999991

>512 tokens: 718 out of 6264 (11.46%)


#### alright, so since 11% of the dataset is longer than the allowed tokens, we will chunk them later

In [16]:
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

num_labels = 2  # depression vs casual
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# convert from pd to huggingface df
train_hf = Dataset.from_pandas(train_df.reset_index(drop=True))
val_hf   = Dataset.from_pandas(val_df.reset_index(drop=True))

train_hf, val_hf

(Dataset({
     features: ['id', 'full_text', 'label'],
     num_rows: 6264
 }),
 Dataset({
     features: ['id', 'full_text', 'label'],
     num_rows: 1567
 }))

In [18]:
# tokenize and chunk
MAX_LENGTH = 512  

def tokenize_and_chunk(examples):
    enc = tokenizer(
        examples["full_text"],
        max_length=MAX_LENGTH,
        truncation=True,               # allow splitting into multiple chunks
        padding=False,                 # padding will be done later by data collator
        return_overflowing_tokens=True,
        return_length=True,
    )

    # Map each chunk back to its original example index
    overflow_mapping = enc["overflow_to_sample_mapping"]

    labels = []
    sample_ids = []
    for i in overflow_mapping:
        labels.append(examples["label"][i])
        if "id" in examples:
            sample_ids.append(examples["id"][i])
        else:
            sample_ids.append(i)  # fallback index if no id column

    enc["labels"] = labels
    enc["sample_id"] = sample_ids

    return enc

In [19]:
# apply it to train and validation set
train_chunked = train_hf.map(
    tokenize_and_chunk,
    batched=True,
    remove_columns=train_hf.column_names,  
)

val_chunked = val_hf.map(
    tokenize_and_chunk,
    batched=True,
    remove_columns=val_hf.column_names,
)

train_chunked, val_chunked

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/6264 [00:00<?, ? examples/s]

Map:   0%|          | 0/1567 [00:00<?, ? examples/s]

(Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping', 'labels', 'sample_id'],
     num_rows: 7252
 }),
 Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping', 'labels', 'sample_id'],
     num_rows: 1805
 }))

In [20]:
# handles padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:

print(train_df["label"].value_counts())
print("\nProportions:")
print(train_df["label"].value_counts(normalize=True))


label
0    3206
1    3058
Name: count, dtype: int64

Proportions:
label
0    0.511814
1    0.488186
Name: proportion, dtype: float64


# pipeline and models

In [22]:
import json
from pathlib import Path

import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from transformers import (
    BertForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

MODEL_NAME = "bert-base-uncased"

# 1. Metrics (accuracy, precision, recall, F1)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
        "f1": f1_score(labels, preds),
    }

# 2. Helper: run one experiment + save model + metrics
def run_experiment(name: str, learning_rate: float, num_epochs: int):
    """
    Trains a fresh BERT model with given hyperparameters,
    evaluates on val_chunked, saves:
      - model in ./runs/{name}/best_model
      - eval metrics in ./runs/{name}/eval_results_final.json
    and returns the metrics dict.
    """
    print(f"\n===== Running experiment: {name} =====")
    print(f"lr={learning_rate}, epochs={num_epochs}")

    # fresh model for each run
    model = BertForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=2,
    )

    out_dir = Path(f"./runs/{name}")

    # NOTE: only using arguments that exist in older Transformers versions
    training_args = TrainingArguments(
        output_dir=str(out_dir),
        learning_rate=learning_rate,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_steps=100,
        seed=42,
        # no evaluation_strategy, save_strategy, load_best_model_at_end here
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_chunked,
        eval_dataset=val_chunked,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # ----- Train -----
    trainer.train()

    # ----- Evaluate on validation set (one time, at the end) -----
    metrics = trainer.evaluate()
    print(f"\nResults for {name}:")
    print(metrics)

    # ----- Ensure output dir exists -----
    out_dir.mkdir(parents=True, exist_ok=True)

    # ----- Save metrics to JSON -----
    with open(out_dir / "eval_results_final.json", "w") as f:
        json.dump(metrics, f, indent=2)

    # ----- Save the final model (treat as 'best') -----
    # This will create ./runs/{name}/best_model/ with config + weights + tokenizer
    trainer.save_model(out_dir / "best_model")

    return metrics

# 3. Run your three experiments (A, B, C)
results_default = run_experiment(
    name="bert_default_lr2e-5_ep3",
    learning_rate=2e-5,
    num_epochs=3,
)

results_tuned_epochs = run_experiment(
    name="bert_tuned_lr2e-5_ep4",
    learning_rate=2e-5,
    num_epochs=4,
)

results_tuned_lr = run_experiment(
    name="bert_tuned_lr3e-5_ep3",
    learning_rate=3e-5,
    num_epochs=3,
)

print("\nSummary of results:")
print("Model A (2e-5, 3 ep):", results_default)
print("Model B (2e-5, 4 ep):", results_tuned_epochs)
print("Model C (3e-5, 3 ep):", results_tuned_lr)



===== Running experiment: bert_default_lr2e-5_ep3 =====
lr=2e-05, epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


KeyboardInterrupt: 

# TESTING FOR OVERFITTING

In [20]:
from pathlib import Path
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

MODEL_NAME = "bert-base-uncased"
RUN_NAME = "bert_default_lr2e-5_ep3"  # Model A

model_path = Path(f"./runs/{RUN_NAME}/best_model")

# Reload the trained model
model_a = BertForSequenceClassification.from_pretrained(model_path)

# Reuse the same data_collator and tokenizer you already have
# data_collator, tokenizer, train_chunked, val_chunked assumed defined

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
        "f1": f1_score(labels, preds),
    }

# Dummy TrainingArguments (we're not training, just using Trainer for predict/evaluate)
eval_args = TrainingArguments(
    output_dir=f"./runs/{RUN_NAME}/eval_again",
    per_device_eval_batch_size=8,
    do_train=False,
    do_eval=True,
    do_predict=True,
    seed=42,
)

trainer_a = Trainer(
    model=model_a,
    args=eval_args,
    train_dataset=train_chunked,
    eval_dataset=val_chunked,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer_a = Trainer(


In [21]:
# Validation predictions
val_pred = trainer_a.predict(val_chunked)
y_val_true = val_pred.label_ids
y_val_pred = val_pred.predictions.argmax(-1)

print("Validation metrics (Model A):")
print("  accuracy :", accuracy_score(y_val_true, y_val_pred))
print("  precision:", precision_score(y_val_true, y_val_pred))
print("  recall   :", recall_score(y_val_true, y_val_pred))
print("  f1       :", f1_score(y_val_true, y_val_pred))

cm_val = confusion_matrix(y_val_true, y_val_pred)
print("\nValidation confusion matrix (rows=true, cols=pred):")
print(cm_val)




Validation metrics (Model A):
  accuracy : 0.96398891966759
  precision: 0.9650565262076053
  recall   : 0.9680412371134021
  f1       : 0.9665465774575399

Validation confusion matrix (rows=true, cols=pred):
[[801  34]
 [ 31 939]]


In [23]:
# Train predictions
train_pred = trainer_a.predict(train_chunked)
y_train_true = train_pred.label_ids
y_train_pred = train_pred.predictions.argmax(-1)

print("\nTrain metrics (Model A):")
print("  accuracy :", accuracy_score(y_train_true, y_train_pred))
print("  precision:", precision_score(y_train_true, y_train_pred))
print("  recall   :", recall_score(y_train_true, y_train_pred))
print("  f1       :", f1_score(y_train_true, y_train_pred))

cm_train = confusion_matrix(y_train_true, y_train_pred)
print("\nTrain confusion matrix (rows=true, cols=pred):")
print(cm_train)


NameError: name 'trainer_a' is not defined

In [11]:
from pathlib import Path
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

RUN_NAME = "bert_tuned_lr3e-5_ep3"  # Model C's run name
model_path = Path(f"./runs/{RUN_NAME}/best_model")

model_c = BertForSequenceClassification.from_pretrained(model_path)

eval_args = TrainingArguments(
    output_dir=f"./runs/{RUN_NAME}/eval_again",
    per_device_eval_batch_size=8,
    do_eval=True,
    do_predict=True,
    seed=42,
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
        "f1": f1_score(labels, preds),
    }

trainer_c = Trainer(
    model=model_c,
    args=eval_args,
    train_dataset=train_chunked,
    eval_dataset=val_chunked,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Validation metrics (you basically already have these, but for completeness)
val_pred = trainer_c.predict(val_chunked)
y_val_true = val_pred.label_ids
y_val_pred = val_pred.predictions.argmax(-1)

print("Validation metrics (Model C):")
print("  accuracy :", accuracy_score(y_val_true, y_val_pred))
print("  precision:", precision_score(y_val_true, y_val_pred))
print("  recall   :", recall_score(y_val_true, y_val_pred))
print("  f1       :", f1_score(y_val_true, y_val_pred))

# Train metrics
train_pred = trainer_c.predict(train_chunked)
y_train_true = train_pred.label_ids
y_train_pred = train_pred.predictions.argmax(-1)

print("\nTrain metrics (Model C):")
print("  accuracy :", accuracy_score(y_train_true, y_train_pred))
print("  precision:", precision_score(y_train_true, y_train_pred))
print("  recall   :", recall_score(y_train_true, y_train_pred))
print("  f1       :", f1_score(y_train_true, y_train_pred))


NameError: name 'train_chunked' is not defined