In [12]:
import json
import os
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Constants
DATA_PATH = "../data/intent_classifier_dataset.json"  # JSON dataset read by load_dataset
SAVE_DIR = "../data/"  # directory the train/test split files are written into
TEST_SIZE = 0.2  # test_size handed to train_test_split (fraction held out for testing)
RANDOM_SEED = 42  # random_state handed to train_test_split so the split is repeatable


In [13]:
def load_dataset(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return its parsed content.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever Python object the JSON document decodes to.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# 🔄 Load the dataset from DATA_PATH and report how many samples were read
data = load_dataset(DATA_PATH)
print(f"Loaded {len(data)} samples")


Loaded 1238 samples


In [14]:
def to_dataframe(data):
    """Wrap the loaded samples in a pandas DataFrame.

    Args:
        data: Any input accepted by the ``pd.DataFrame`` constructor.

    Returns:
        A new ``pd.DataFrame`` built from ``data``.
    """
    frame = pd.DataFrame(data=data)
    return frame

# Convert the loaded samples to a DataFrame and preview the first rows.
df = to_dataframe(data)
df.head()


Unnamed: 0,text,label
0,What is the process for requesting maternity l...,needs_rag
1,Can you explain the reimbursement policy for t...,needs_rag
2,How do I access my employee benefits information?,needs_rag
3,Are there any guidelines for remote work durin...,needs_rag
4,What are the criteria for the annual performan...,needs_rag


In [15]:
def encode_labels(df):
    """Return a copy of ``df`` with an integer ``label_id`` column, plus the fitted encoder.

    The input frame is left untouched, so re-running the calling cell (or any
    cell that displayed ``df`` earlier) always sees the same data.

    Args:
        df: DataFrame with a ``label`` column holding the class names.

    Returns:
        A ``(encoded_df, encoder)`` tuple: ``encoded_df`` is a new DataFrame
        with ``label_id`` added (or replaced if it already existed), and
        ``encoder`` is the ``LabelEncoder`` fitted on ``df["label"]``.

    Raises:
        KeyError: If ``df`` has no ``label`` column.
    """
    encoder = LabelEncoder()
    label_ids = encoder.fit_transform(df["label"])
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(label_id=label_ids), encoder

# Add the integer label_id column and keep the fitted encoder for the label-mapping cell.
df, encoder = encode_labels(df)
df.head()


Unnamed: 0,text,label,label_id
0,What is the process for requesting maternity l...,needs_rag,0
1,Can you explain the reimbursement policy for t...,needs_rag,0
2,How do I access my employee benefits information?,needs_rag,0
3,Are there any guidelines for remote work durin...,needs_rag,0
4,What are the criteria for the annual performan...,needs_rag,0


In [16]:
def split_dataset(df, test_size=0.2):
    """Split ``df`` into train and test frames, stratified on ``label_id``.

    Args:
        df: DataFrame containing a ``label_id`` column.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        The ``[train, test]`` pair produced by ``train_test_split``.

    Raises:
        KeyError: If ``df`` has no ``label_id`` column.
    """
    # Stratify on the encoded label; RANDOM_SEED pins the shuffle so the split is repeatable.
    return train_test_split(
        df,
        test_size=test_size,
        stratify=df["label_id"],
        random_state=RANDOM_SEED,
    )

# The definition and call previously appeared twice in this cell; once is enough.
train_df, test_df = split_dataset(df, test_size=TEST_SIZE)
print(f"Train: {len(train_df)}, Test: {len(test_df)}")


Train: 990, Test: 248
Train: 990, Test: 248


In [17]:
def save_to_json(df, path):
    """Write the ``text``, ``label`` and ``label_id`` columns of ``df`` to ``path`` as JSON.

    The file holds a list with one object per row, indented by two spaces,
    UTF-8 encoded, with non-ASCII characters written unescaped.

    Args:
        df: DataFrame containing at least the three exported columns.
        path: Destination file; it is created or overwritten.
    """
    export_columns = ["text", "label", "label_id"]
    rows = df[export_columns].to_dict(orient="records")
    with open(path, mode="w", encoding="utf-8") as out_file:
        json.dump(rows, out_file, ensure_ascii=False, indent=2)

# Persist each split next to the source data: train first, then test.
for split_filename, split_frame in (
    ("train_split.json", train_df),
    ("test_split.json", test_df),
):
    save_to_json(split_frame, os.path.join(SAVE_DIR, split_filename))
print("✅ Train and Test splits saved.")


✅ Train and Test splits saved.


In [18]:
def save_label_mapping(encoder, path="../data/label_mapping.json"):
    """Write the encoder's ``class name -> integer id`` mapping to ``path`` as JSON.

    Args:
        encoder: Fitted encoder exposing ``classes_``; a class's id is its
            position in ``classes_``, which is what ``LabelEncoder.transform``
            returns for it.
        path: Destination file; it is created or overwritten.
    """
    # Position in classes_ is the encoded id, so no transform() round-trip is needed.
    mapping = {str(cls): idx for idx, cls in enumerate(encoder.classes_)}
    with open(path, "w", encoding="utf-8") as f:
        # ensure_ascii=False matches save_to_json, so label names are written
        # the same way in the mapping file and in the split files.
        json.dump(mapping, f, indent=2, ensure_ascii=False)


# Write the class-name -> id mapping to save_label_mapping's default path.
save_label_mapping(encoder)
print("📌 Label mapping saved.")


📌 Label mapping saved.
