Cleaning the dataset

In [70]:
import json
import os
import kagglehub

# Read the Kaggle API credentials stored next to the notebook in kaggle.json
with open('kaggle.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Export the credentials as environment variables
# (presumably what kagglehub reads to authenticate — confirm against its docs)
for env_var, field in (('KAGGLE_USERNAME', 'username'), ('KAGGLE_KEY', 'key')):
    os.environ[env_var] = data[field]

# Download the CLINC150 dataset from Kaggle and report where the files landed
dataset_path = kagglehub.dataset_download("hongtrung/clinc150-dataset")
print("Path to dataset files:", dataset_path)


Path to dataset files: /home/josue/.cache/kagglehub/datasets/hongtrung/clinc150-dataset/versions/1


In [58]:
import shutil
from pathlib import Path

# Where kagglehub cached the download (version 1) and where the project keeps its data
source = Path.home().joinpath(
    ".cache", "kagglehub", "datasets", "hongtrung", "clinc150-dataset", "versions", "1", "data"
)
destination = Path("dataset")

# Make sure the project data folder is there before copying into it
destination.mkdir(parents=True, exist_ok=True)

# data_full.json is the only file this notebook needs from the download
file_to_copy = source / "data_full.json"
if not file_to_copy.exists():
    print("data_full.json not found in source folder.")
else:
    shutil.copy(file_to_copy, destination / file_to_copy.name)
    print(f"Copied {file_to_copy.name} to {destination.resolve()}")


Copied data_full.json to /home/josue/llm-intent-distilled-benchmark/dataset


In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the complete CLINC150 file copied into the project
with open("dataset/data_full.json", "r", encoding="utf-8") as f:
    full_data = json.load(f)

# Pool the original train/val/test splits into one list of records,
# dropping every utterance labelled "oos"
combined_data = [
    {"text": text, "intent": intent}
    for split_name in ("train", "val", "test")
    for text, intent in full_data[split_name]
    if intent != "oos"
]

df = pd.DataFrame(combined_data)

# Re-split 70% / 30%, stratified on the intent label; the fixed seed keeps the split reproducible
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df["intent"],
    random_state=42,
)

# Group the examples of each intent together and renumber the rows
train_df = train_df.sort_values("intent").reset_index(drop=True)
test_df = test_df.sort_values("intent").reset_index(drop=True)

# Target JSON layout: a list of {"text": str, "intent": str} objects
train_list = train_df.to_dict(orient="records")
test_list = test_df.to_dict(orient="records")

train_path = "dataset/train.json"
test_path = "dataset/test.json"

# Write both splits with the same JSON settings
for out_path, records in ((train_path, train_list), (test_path, test_list)):
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

print(f"Train saved to: {train_path}")
print(f"Test saved to: {test_path}")


Train saved to: dataset/train.json
Test saved to: dataset/test.json


In [60]:
# Training split: number of examples per intent, one row per intent,
# ordered alphabetically by intent name
train_df.groupby("intent").size().reset_index(name="count")

Unnamed: 0,intent,count
0,accept_reservations,105
1,account_blocked,105
2,alarm,105
3,application_status,105
4,apr,105
...,...,...
145,where_are_you_from,105
146,whisper_mode,105
147,who_do_you_work_for,105
148,who_made_you,105


In [61]:
# Test split: number of examples per intent, one row per intent,
# ordered alphabetically by intent name
test_df.groupby("intent").size().reset_index(name="count")

Unnamed: 0,intent,count
0,accept_reservations,45
1,account_blocked,45
2,alarm,45
3,application_status,45
4,apr,45
...,...,...
145,where_are_you_from,45
146,whisper_mode,45
147,who_do_you_work_for,45
148,who_made_you,45


#### Convert the JSON training dataset to a YAML file (`nlu.yml`) for Rasa

In [None]:
from pathlib import Path

# Export the training split as a Rasa NLU file: one "- intent:" entry per intent,
# with its utterances listed under a literal "examples: |" block.

# Output locations: a copy kept with the dataset and the one placed in the Rasa project
RASA_NLU = "dataset/nlu.yml"
RASA_PATH = "rasa-inference/data/nlu.yml"

# Collect the utterances of every intent into a single list per intent
grouped = train_df.groupby("intent")["text"].apply(list).reset_index()

# Header of the nlu.yml document
output_lines = ["version: '2.0'", "", "nlu:"]

# Emit one YAML entry per intent
for intent, examples in zip(grouped["intent"], grouped["text"]):
    output_lines.append(f"- intent: {intent}")
    output_lines.append("  examples: |")
    output_lines.extend(f"    - {ex}" for ex in examples)
    output_lines.append("")  # Blank line between intents

# Write the file into the dataset folder
output_path = Path(RASA_NLU)
output_path.write_text("\n".join(output_lines), encoding="utf-8")

# Copy it into the Rasa project. RASA_PATH is a plain string, so it is wrapped
# in Path before .resolve() is called (str has no .resolve() method).
rasa_output_path = Path(RASA_PATH)
shutil.copy(output_path, rasa_output_path)

print(f"nlu.yml saved at: {output_path.resolve()}")
print(f"Rasa nlu.yml saved at: {rasa_output_path.resolve()}")


nlu.yml saved at: /home/josue/llm-intent-distilled-benchmark/dataset/nlu.yml
Rasa nlu.yml saved at: /home/josue/llm-intent-distilled-benchmark/rasa-inference/data/nlu.yml


In [73]:
# Per-intent example counts for each split
train_sizes = train_df.groupby("intent").size()
test_sizes = test_df.groupby("intent").size()

# Headline figures; the per-intent numbers are read from the first intent only,
# so they describe every intent only if the split is perfectly balanced
num_intents = train_df["intent"].nunique()
examples_per_intent_train = train_sizes.iloc[0]
examples_per_intent_test = test_sizes.iloc[0]

total_train, total_test = len(train_df), len(test_df)
total = total_train + total_test

# Share of each split, as a percentage rounded to two decimals
train_pct = round((total_train / total) * 100, 2)
test_pct = round((total_test / total) * 100, 2)

# Human-readable summary of the processed dataset
report = f"""
📊 Summary of the CLINC150 Dataset Distribution (pre-processed)

- The dataset contains {num_intents} intents, with a total of {examples_per_intent_train + examples_per_intent_test} examples per intent.
- The dataset was stratified and split into two subsets:
  - Training: {examples_per_intent_train} examples per intent, totaling {total_train} examples.
  - Test: {examples_per_intent_test} examples per intent, totaling {total_test} examples.
- The split ratio is {train_pct}% for training and {test_pct}% for testing, while maintaining balanced class distribution.

✅ Relevant notes for the experiment:

- No "out-of-scope" (oos) intents were included in this version of the dataset.
- The stratified split ensures that all intents are equally represented in both training and test sets.
- The dataset was structured to allow fair comparisons between different intent classification approaches.
"""

print(report)


📊 Summary of the CLINC150 Dataset Distribution (pre-processed)

- The dataset contains 150 intents, with a total of 150 examples per intent.
- The dataset was stratified and split into two subsets:
  - Training: 105 examples per intent, totaling 15750 examples.
  - Test: 45 examples per intent, totaling 6750 examples.
- The split ratio is 70.0% for training and 30.0% for testing, while maintaining balanced class distribution.

✅ Relevant notes for the experiment:

- No "out-of-scope" (oos) intents were included in this version of the dataset.
- The stratified split ensures that all intents are equally represented in both training and test sets.
- The dataset was structured to allow fair comparisons between different intent classification approaches.

