In [3]:
!pip install -r ../requirements.txt

Collecting scikit-learn<1.6,>=1.4 (from -r ../requirements.txt (line 2))
  Using cached scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting torch>=2.2.0 (from -r ../requirements.txt (line 3))
  Using cached torch-2.7.0-cp313-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting transformers>=4.40.0 (from -r ../requirements.txt (line 4))
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets>=2.19.0 (from -r ../requirements.txt (line 5))
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting tqdm>=4.66.0 (from -r ../requirements.txt (line 6))
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting accelerate>=0.29.0 (from -r ../requirements.txt (line 7))
  Using cached accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting peft>=0.10.0 (from -r ../requirements.txt (line 8))
  Using cached peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting mlflow>=2.10.0 (from -r ../requiremen

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer
from datasets import Dataset as HFDataset

# CSV file read with pd.read_csv; the pipeline uses its "text" and "label" columns.
KAGGLE_DATA_PATH = "Base_data/training.csv"
# Checkpoint name passed to DistilBertTokenizer.from_pretrained.
MODEL_NAME = "distilbert-base-uncased"
# Every text is truncated/padded to exactly this many tokens.
MAX_LENGTH = 128
# Fraction of the rows remaining AFTER the test split that is held out for validation
# (so the validation share of the whole sample is VAL_SIZE * (1 - TEST_SIZE)).
VAL_SIZE = 0.15
# Fraction of the (sub)sampled rows held out as the unseen test set.
TEST_SIZE = 0.1
# Number of rows to subsample before splitting; a falsy value (None / 0) keeps all rows.
SAMPLE_SIZE = 3000

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the raw CSV once for interactive exploration in the cells below.
df = pd.read_csv(KAGGLE_DATA_PATH)

In [6]:
# Class balance: number of rows per label, ordered by label id.
# (Grouping by the same column that is being counted is redundant.)
df["label"].value_counts().sort_index()

label
0    4666
1    5362
2    1304
3    2159
4    1937
5     572
Name: count, dtype: int64

In [7]:
# Show one example text for label 5. Select with a boolean mask via .loc and take
# the first match positionally, instead of chained indexing on a hard-coded row
# label (which raises KeyError whenever that particular row is not label 5).
df.loc[df["label"] == 5, "text"].iloc[0]

'ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny'

In [8]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


## `label`: a classification label with possible values sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5).

In [9]:
# SAMPLE_SIZE (defined in the config cell): set to an int (e.g., 4000) to subsample, or to None to use every row.


def load_data(random_state: int = 42):
    """Load the labelled CSV and build tokenized train/val/test datasets.

    Steps:
      1. Read ``KAGGLE_DATA_PATH`` and build the label vocabulary from the
         whole file.
      2. Optionally subsample ``SAMPLE_SIZE`` rows.
      3. Split off ``TEST_SIZE`` of the rows as the test set, then
         ``VAL_SIZE`` of the remainder as the validation set (both stratified
         on the label).
      4. Tokenize the ``text`` column with the ``MODEL_NAME`` tokenizer,
         truncating/padding every example to ``MAX_LENGTH`` tokens.

    Args:
        random_state: Seed used for the subsampling and for both splits.

    Returns:
        A 6-tuple ``(train_ds, val_ds, test_ds, num_labels, label2id, id2label)``.
        The three datasets are formatted to yield torch tensors for the
        ``input_ids``, ``attention_mask`` and ``label`` columns. ``label2id``
        maps each original label value to a contiguous id (sorted order) and
        ``id2label`` is its inverse; both use plain Python scalars.
    """
    raw_df = pd.read_csv(KAGGLE_DATA_PATH)

    # Build the vocabulary from the FULL file, before subsampling: otherwise a
    # sample that happens to miss a rare class would shift every later id and
    # shrink num_labels. ``.tolist()`` converts numpy scalars to native Python
    # values, so the returned maps can be JSON-serialized directly.
    labels = sorted(raw_df["label"].unique().tolist())
    label2id = {label: idx for idx, label in enumerate(labels)}
    id2label = {idx: label for label, idx in label2id.items()}

    if SAMPLE_SIZE:
        # Cap at the number of available rows so an oversized SAMPLE_SIZE
        # falls back to "use everything" instead of raising.
        raw_df = raw_df.sample(
            n=min(SAMPLE_SIZE, len(raw_df)),
            random_state=random_state,
        )

    # .assign returns a new frame, leaving the frame read from disk untouched.
    encoded_df = raw_df.assign(label=raw_df["label"].map(label2id))

    train_val_df, test_df = train_test_split(
        encoded_df,
        test_size=TEST_SIZE,
        stratify=encoded_df["label"],
        random_state=random_state,
    )
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=VAL_SIZE,
        stratify=train_val_df["label"],
        random_state=random_state,
    )

    tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

    def tokenize(batch):
        # Fixed-length encoding: every example becomes exactly MAX_LENGTH tokens.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=MAX_LENGTH,
        )

    def to_tokenized_dataset(frame):
        # One code path for all three splits: frame -> HF dataset -> tokenized
        # -> torch-formatted.
        # NOTE(review): from_pandas keeps the shuffled pandas index as an extra
        # `__index_level_0__` column; pass preserve_index=False if no
        # downstream code relies on that column.
        dataset = HFDataset.from_pandas(frame[["text", "label"]]).map(
            tokenize, batched=True
        )
        dataset.set_format(
            type="torch", columns=["input_ids", "attention_mask", "label"]
        )
        return dataset

    train_ds = to_tokenized_dataset(train_df)
    val_ds = to_tokenized_dataset(val_df)
    test_ds = to_tokenized_dataset(test_df)

    return train_ds, val_ds, test_ds, len(labels), label2id, id2label



# train_ds, val_ds, test_ds, num_labels, label2id, id2label = load_data()
# print(f"Train: {len(train_ds)}, Val: {len(val_ds)}, Test: {len(test_ds)}")


In [10]:
# Run the load -> split -> tokenize pipeline once and keep every returned piece:
# the three tokenized splits, the class count, and both label maps.
train_tokenized, val_tokenized, unseen_test_tokenized, num_labels, label2id, id2label = (
    load_data()
)


Map: 100%|██████████| 2295/2295 [00:00<00:00, 7354.12 examples/s]
Map: 100%|██████████| 405/405 [00:00<00:00, 7819.72 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 7797.07 examples/s]


In [None]:
from datasets import load_from_disk
import json, os

# Directory that receives the tokenized splits; created if it does not exist.
base_dir = "data"
os.makedirs(base_dir, exist_ok=True)

# Persist each split into its own sub-directory of base_dir.
for split_dir, split_ds in (
    (f"{base_dir}/train", train_tokenized),
    (f"{base_dir}/val", val_tokenized),
    (f"{base_dir}/test", unseen_test_tokenized),
):
    split_ds.save_to_disk(split_dir)

# Copies of the label maps with normalized key types:
# int ids as keys of id2label, string labels mapped to int ids in label2id.
clean_id2label = {int(idx): label for idx, label in id2label.items()}
clean_label2id = {str(label): int(idx) for label, idx in label2id.items()}

import json, numpy as np, os

def to_py(obj):
    """Recursively convert numpy values inside ``obj`` to plain Python objects.

    Handles nested dicts, lists and tuples, numpy scalars and numpy arrays so
    that the result can be passed to ``json.dump``.

    Args:
        obj: Any object; containers are walked recursively.

    Returns:
        A new structure in which dicts stay dicts (numpy-scalar keys become
        Python scalars, values are converted), lists/tuples/arrays become
        lists, and numpy scalars become Python scalars. Anything else is
        returned unchanged.
    """
    if isinstance(obj, dict):
        # Only numpy-scalar keys are converted; other keys (str, int, tuple, ...)
        # are kept as-is so they stay hashable.
        return {
            (key.item() if isinstance(key, np.generic) else key): to_py(value)
            for key, value in obj.items()
        }
    if isinstance(obj, np.ndarray):
        # tolist() yields nested Python lists; recurse to also cover
        # object-dtype arrays whose elements may still be numpy scalars.
        return to_py(obj.tolist())
    if isinstance(obj, (list, tuple)):
        return [to_py(item) for item in obj]
    if isinstance(obj, np.generic):
        return obj.item()
    return obj

# Persist the label maps next to the datasets. to_py strips numpy scalar types
# so that json.dump accepts the payload.
payload = to_py(
    {
        "label2id": label2id,
        "id2label": id2label,
        "num_labels": num_labels,
    }
)
os.makedirs(base_dir, exist_ok=True)
with open(f"{base_dir}/label_maps.json", "w", encoding="utf-8") as fp:
    json.dump(payload, fp, indent=2)

# Reload everything from disk so the following cells work from the persisted
# artifacts rather than from in-memory state.
train_ds = load_from_disk(f"{base_dir}/train")
val_ds = load_from_disk(f"{base_dir}/val")
test_ds = load_from_disk(f"{base_dir}/test")

with open(f"{base_dir}/label_maps.json", encoding="utf-8") as fp:
    maps = json.load(fp)

# JSON object keys are always strings. label2id keeps its string keys
# (label -> id), but id2label is looked up by integer class id, so its keys
# are converted back to int; otherwise id2label[0] raises KeyError.
label2id = maps["label2id"]
id2label = {int(class_id): label for class_id, label in maps["id2label"].items()}
num_labels = maps["num_labels"]


Saving the dataset (1/1 shards): 100%|██████████| 2295/2295 [00:00<00:00, 914056.37 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 405/405 [00:00<00:00, 265919.40 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 300/300 [00:00<00:00, 209785.13 examples/s]
