In [1]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score
from sklearn.preprocessing import LabelEncoder
from time import sleep
import pandas as pd
import requests
import json
import re
import os

In [2]:
# Provider name -> API credential.
# The credential is read from the environment so that no secret is stored in
# the notebook (or in its version history). Export MISTRAL_API_KEY before
# starting the kernel. A key that has ever been committed in plain text should
# be treated as leaked and revoked.
API_KEYS = {
    "mistral": os.environ.get("MISTRAL_API_KEY", "")
}

In [3]:
# Provider name -> model identifier placed in the request payload.
MODEL_NAMES = dict(mistral="mistral-medium")

In [4]:
# --- File locations ---------------------------------------------------------
INITIAL_INPUT = "data/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv"
INPUT_CSV = "outputs/questions_only.csv"
LABELS_JSON = "outputs/labels_for_evaluation.json"

# Both derived files are written under outputs/, so create it up front.
os.makedirs("outputs", exist_ok=True)

# Raw dataset as shipped.
df = pd.read_csv(INITIAL_INPUT)

# 1. Model input: the question text only, rows without a question removed.
questions_only = df.loc[:, ["instruction"]].dropna()
questions_only.to_csv(INPUT_CSV, index=False)

# 2. Reference labels: question text with its intent and category, keeping
#    only rows where all three are present.
label_reference = df.loc[:, ["instruction", "intent", "category"]].dropna()
label_reference.to_json(LABELS_JSON, orient="records", force_ascii=False, indent=2)

# Report where the two files went.
print(
    "✅ Done.",
    f"LLM input saved to → {INPUT_CSV}",
    f"Label reference saved to → {LABELS_JSON}",
    sep="\n",
)


✅ Done.
LLM input saved to → outputs/questions_only.csv
Label reference saved to → outputs/labels_for_evaluation.json


In [5]:
# --- Clustering-run configuration -------------------------------------------
BATCH_SIZE = 1350        # number of questions placed in a single prompt
PROMPT_DIR = "prompts"   # directory containing the prompt<N>.txt templates

# Data locations (same values as assigned in the data-preparation cell above).
INPUT_CSV = "outputs/questions_only.csv"
INITIAL_INPUT = "data/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv"

In [6]:
# Provider name -> cluster-summary JSON paths, one per prompt, in prompt order.
OUTPUT_JSONS = {
    "mistral": [
        f"outputs/Mistral_prompt{prompt_no}_final_cluster_summary.json"
        for prompt_no in (1, 2, 3)
    ]
}

In [7]:
def read_prompt(path):
    """Return the full contents of the UTF-8 text file at ``path``."""
    handle = open(path, encoding="utf-8")
    try:
        return handle.read()
    finally:
        # Release the file whether or not the read succeeded.
        handle.close()


In [8]:
def parse_cluster_response(text):
    """Extract cluster entries from a model reply.

    An entry is recognised by the shape ``<number>. <name>``, then one line
    of description, then ``Count: <n>``. Text that does not fit that shape
    is skipped.

    Returns:
        A list of dicts, in order of appearance, each with the keys
        ``name`` (str), ``description`` (str, with a leading
        "Description:" / "Description of the group:" label removed) and
        ``count`` (int).
    """
    entry_re = re.compile(r"\d+\.\s*(.*?)\n(.*?)\nCount:\s*(\d+)")
    label_re = re.compile(r"^(Description of the group:|Description:)\s*")

    return [
        {
            "name": entry.group(1).strip(),
            "description": label_re.sub("", entry.group(2).strip()),
            "count": int(entry.group(3)),
        }
        for entry in entry_re.finditer(text.strip())
    ]

In [9]:
def call_mistral(prompt, api_key, timeout=300):
    """Send ``prompt`` as a single user message to the Mistral chat-completions API.

    Args:
        prompt: Full prompt text to send.
        api_key: Bearer token used in the ``Authorization`` header.
        timeout: Seconds handed to ``requests.post`` as its ``timeout``.
            Requests does not time out on its own, so without this a stalled
            connection would block the calling loop indefinitely.

    Returns:
        The message content of the first choice in the JSON response.

    Raises:
        ValueError: If ``api_key`` is empty.
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    if not api_key:
        # Fail with a clear message instead of sending an unauthenticated request.
        raise ValueError("Mistral API key is missing; set it before calling the API")

    url = "https://api.mistral.ai/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {
        "model": MODEL_NAMES["mistral"],
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7
    }
    r = requests.post(url, headers=headers, json=payload, timeout=timeout)
    r.raise_for_status()
    return r.json()["choices"][0]["message"]["content"]

In [10]:
# Read the question-only CSV and flatten its "instruction" column into a
# plain list of strings, skipping missing entries.
df = pd.read_csv(INPUT_CSV)
questions = list(df["instruction"].dropna().astype(str))

In [11]:
# Provider name -> function that sends a prompt to that provider.
CALL_FUNCTIONS = dict(mistral=call_mistral)

In [12]:
# Run every prompt over all questions in batches, merge the clusters reported
# per batch into one summary per prompt, and record a cluster for each question.
for model in ["mistral"]:
    # One pass per configured output file, so the number of prompts always
    # matches OUTPUT_JSONS instead of a separately hard-coded count.
    for prompt_index, output_path_json in enumerate(OUTPUT_JSONS[model]):
        prompt_path = os.path.join(PROMPT_DIR, f"prompt{prompt_index+1}.txt")
        prompt_template = read_prompt(prompt_path)
        # Swap only the file extension; a plain str.replace would also rewrite
        # any ".json" occurring earlier in the path.
        output_path_csv = os.path.splitext(output_path_json)[0] + ".csv"

        merged_clusters = {}   # cluster name -> {"name", "description", "count"}
        assignment_rows = []   # one record per question that received a cluster

        for i in range(0, len(questions), BATCH_SIZE):
            batch = questions[i:i+BATCH_SIZE]
            batch_number = i // BATCH_SIZE + 1
            question_block = "\n".join(f"{j+1}. {q}" for j, q in enumerate(batch))
            prompt = prompt_template.replace("{{QUESTIONS}}", question_block)

            print(f"🔄 {model.upper()} | Prompt {prompt_index+1} | Batch {batch_number}")
            try:
                response = CALL_FUNCTIONS[model](prompt, API_KEYS[model])
                parsed = parse_cluster_response(response)

                # NOTE(review): assignment is positional. The first `count`
                # questions of the batch go to the first cluster, the next
                # ones to the second, and so on. This is only meaningful if
                # the model lists clusters in question order; confirm that
                # the prompts enforce it.
                cluster_pointer = 0
                for cluster in parsed:
                    name = cluster["name"]
                    desc = cluster["description"]
                    count = cluster["count"]

                    if name in merged_clusters:
                        merged_clusters[name]["count"] += count
                    else:
                        # Copy so that later count updates do not mutate the
                        # parsed response object.
                        merged_clusters[name] = dict(cluster)

                    # Slicing stops at the end of the batch, so an
                    # over-reported count cannot index past it.
                    members = batch[cluster_pointer:cluster_pointer + count]
                    assignment_rows.extend(
                        {"instruction": question, "name": name, "description": desc}
                        for question in members
                    )
                    cluster_pointer += len(members)

                # Surface replies whose counts do not cover the batch; those
                # questions would otherwise vanish from the CSV unnoticed.
                reported = sum(c["count"] for c in parsed)
                if reported != len(batch):
                    print(
                        f"⚠️ {model.upper()} Prompt {prompt_index+1} Batch {batch_number}: "
                        f"cluster counts sum to {reported} for {len(batch)} questions; "
                        f"{len(batch) - cluster_pointer} left unassigned"
                    )

            except Exception as e:
                # Best effort: a failed batch is reported and skipped so the
                # remaining batches still run.
                print(f"❌ {model.upper()} Prompt {prompt_index+1} Batch {i//BATCH_SIZE + 1} failed: {e}")
            sleep(1)  # short pause between requests

        # Save final cluster summary (JSON)
        with open(output_path_json, "w", encoding="utf-8") as f:
            json.dump(list(merged_clusters.values()), f, indent=2, ensure_ascii=False)

        # Save cluster assignments (CSV)
        pd.DataFrame(assignment_rows).to_csv(output_path_csv, index=False)

        print(f"✅ Saved: {output_path_json}")
        print(f"📄 Saved: {output_path_csv}")
        print(f"ℹ️ Assigned {len(assignment_rows)} of {len(questions)} questions")


🔄 MISTRAL | Prompt 1 | Batch 1
🔄 MISTRAL | Prompt 1 | Batch 2
🔄 MISTRAL | Prompt 1 | Batch 3
🔄 MISTRAL | Prompt 1 | Batch 4
🔄 MISTRAL | Prompt 1 | Batch 5
🔄 MISTRAL | Prompt 1 | Batch 6
🔄 MISTRAL | Prompt 1 | Batch 7
🔄 MISTRAL | Prompt 1 | Batch 8
🔄 MISTRAL | Prompt 1 | Batch 9
🔄 MISTRAL | Prompt 1 | Batch 10
🔄 MISTRAL | Prompt 1 | Batch 11
🔄 MISTRAL | Prompt 1 | Batch 12
🔄 MISTRAL | Prompt 1 | Batch 13
🔄 MISTRAL | Prompt 1 | Batch 14
🔄 MISTRAL | Prompt 1 | Batch 15
🔄 MISTRAL | Prompt 1 | Batch 16
🔄 MISTRAL | Prompt 1 | Batch 17
🔄 MISTRAL | Prompt 1 | Batch 18
🔄 MISTRAL | Prompt 1 | Batch 19
🔄 MISTRAL | Prompt 1 | Batch 20
✅ Saved: outputs/Mistral_prompt1_final_cluster_summary.json
📄 Saved: outputs/Mistral_prompt1_final_cluster_summary.csv
🔄 MISTRAL | Prompt 2 | Batch 1
🔄 MISTRAL | Prompt 2 | Batch 2
🔄 MISTRAL | Prompt 2 | Batch 3
🔄 MISTRAL | Prompt 2 | Batch 4
🔄 MISTRAL | Prompt 2 | Batch 5
🔄 MISTRAL | Prompt 2 | Batch 6
🔄 MISTRAL | Prompt 2 | Batch 7
🔄 MISTRAL | Prompt 2 | Batch 8
🔄 

In [13]:
# Per-question assignment CSVs, one per prompt, in prompt order.
prompt_paths = [
    f"outputs/Mistral_prompt{prompt_no}_final_cluster_summary.csv"
    for prompt_no in range(1, 4)
]

# Labelled dataset that supplies the reference intent for each question.
true_data_path = "data/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv"


In [14]:
# Reference labels: one row per distinct (instruction, intent) pair, ignoring
# rows where either field is missing.
true_df = (
    pd.read_csv(true_data_path)
    .dropna(subset=["instruction", "intent"])
    [["instruction", "intent"]]
    .drop_duplicates()
)

In [15]:
# Score each prompt's cluster assignments against the reference intents.
# One dict per prompt is appended to `results`; a prompt that cannot be
# evaluated gets an "Error" entry instead of metric values.
results = []

for i, path in enumerate(prompt_paths, start=1):
    try:
        pred_df = pd.read_csv(path)
        pred_df = pred_df.dropna(subset=["instruction", "name"])
        pred_df = pred_df[["instruction", "name"]].drop_duplicates()

        # Merge on instruction
        # NOTE(review): an instruction text that occurs more than once on
        # either side contributes one row per matching pair; confirm that is
        # the intended weighting.
        merged = pd.merge(true_df, pred_df, on="instruction", how="inner")
        # Rename by column name rather than by position so that a change in
        # column order or count cannot silently mislabel the data.
        merged = merged.rename(columns={"intent": "true_label", "name": "predicted_label"})
        merged = merged[["instruction", "true_label", "predicted_label"]]

        if merged.empty:
            # Clustering scores on zero samples carry no information; report
            # the problem through the error row instead of as a metric.
            raise ValueError(f"no instructions in {path} match the reference data")

        # Encode as integers (just for grouping)
        le_true = LabelEncoder()
        le_pred = LabelEncoder()

        cluster_true = le_true.fit_transform(merged["true_label"])
        cluster_pred = le_pred.fit_transform(merged["predicted_label"])

        # Calculate metrics
        ari = adjusted_rand_score(cluster_true, cluster_pred)
        nmi = normalized_mutual_info_score(cluster_true, cluster_pred)
        fmi = fowlkes_mallows_score(cluster_true, cluster_pred)

        results.append({
            "Prompt": f"Prompt {i}",
            "Sample Size": len(merged),
            "ARI": ari,
            "NMI": nmi,
            "FMI": fmi
        })

    except Exception as e:
        # Keep evaluating the remaining prompts; the failure is recorded in
        # the results table rather than raised.
        results.append({
            "Prompt": f"Prompt {i}",
            "Sample Size": 0,
            "ARI": None,
            "NMI": None,
            "FMI": None,
            "Error": str(e)
        })

In [16]:
# Tabulate the per-prompt scores and show them as plain text.
results_df = pd.DataFrame(data=results)
print(results_df)

     Prompt  Sample Size       ARI       NMI       FMI
0  Prompt 1         9990  0.608613  0.862463  0.630986
1  Prompt 2        10413  0.575001  0.852295  0.600736
2  Prompt 3         9384  0.493308  0.835502  0.550257
