In [1]:
import polars as pl
from datasets import Dataset

import torch
import gc
import random
import emoji
import os
import re
import nltk
import markdown
import pandas as pd
from IPython.display import display, HTML
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

from tqdm import tqdm

import warnings
warnings.simplefilter('ignore')

nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/zorin17/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from huggingface_hub import login

# Insert your token here
login(token="")

In [3]:
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [4]:
df = pl.read_csv('hf://datasets/bitext/Bitext-customer-support-llm-chatbot-training-dataset/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv')

# Data cleaning
# fill in null
df = df.with_columns([
    pl.col("instruction")
      .cast(str)
      .str.to_lowercase()
      .fill_null(""),
    
    pl.col("response")
      .cast(str)
      .str.to_lowercase()
      .fill_null(""),
    
    pl.col("intent")
      .cast(str)
      .fill_null("unknown")
])

# Remove emoji
def remove_emojis(text: str) -> str:
    return emoji.replace_emoji(text, replace="")  # Remove all emojis safely

# Apply to instruction and response
df = df.with_columns([
    pl.col("instruction").map_elements(remove_emojis).alias("instruction")
])

df = df.with_columns([
    pl.col("response").map_elements(remove_emojis).alias("response")
])

#Exclude noisy flags
# Filter out rows where 'flags' contains Z, Q, or W ===
flag = ["flags"]
df_z = df.filter(
    ~pl.col(flag).cast(str).str.contains("Z")
)

df_zw = df.filter(
    ~pl.col(flag).cast(str).str.contains("Z") &
    ~pl.col(flag).cast(str).str.contains("W")
)

df_clean = df.filter(
    ~pl.col(flag).cast(str).str.contains("Z") &
    ~pl.col(flag).cast(str).str.contains("W") &
    ~pl.col(flag).cast(str).str.contains("Q")
)

print(df_z.height)
print(df_zw.height)
print(df_clean.height)

category_counts = (
    df_clean
    .group_by("category")
    .agg(pl.count().alias("counts"))
    .sort("counts", descending=True)
)

print(category_counts)

# Filter only selected categories
selected_categories = ["ORDER", "REFUND", "SHIPPING", "DELIVERY"]
df_selected = df_clean.filter(
    pl.col("category").is_in(selected_categories)
)


# Split dataset by category
# === Configuration ===
LABEL_COL = "category"  # 🔁 Replace with "intent" or any stratification column
SPLIT_RATIO_TRAIN = 0.7
SPLIT_RATIO_VAL = 0.15
SEED = 123
df_final = df_selected.clone()

# === Stratified split logic ===
random.seed(SEED)
train_parts = []
test_parts = []
val_parts = []

for label in df_final[LABEL_COL].unique().to_list():
    group_df = df_final.filter(pl.col(LABEL_COL) == label)
    group_df = group_df.sample(n=len(group_df), shuffle=True, seed=SEED)

    n = len(group_df)
    train_idx = int(n * SPLIT_RATIO_TRAIN)
    val_idx = int(n * (SPLIT_RATIO_TRAIN + SPLIT_RATIO_VAL))

    train_parts.append(group_df[:train_idx])
    val_parts.append(group_df[train_idx:val_idx])
    test_parts.append(group_df[val_idx:])

# === Combine all groups
train_df = pl.concat(train_parts).sort(["category", "instruction"])
val_df = pl.concat(val_parts).sort(["category", "instruction"])
test_df = pl.concat(test_parts).sort(["category", "instruction"])

rag_data = pl.concat([train_df, val_df], how="vertical")

print("✅ Split sizes:")
print(f"Train: {len(train_df)}")
print(f"Val:   {len(val_df)}")
print(f"Test:  {len(test_df)}")

print(f"RAG:  {len(rag_data)}")

print("\n📊 Category distribution in test set:")
print(test_df.select([pl.col(LABEL_COL)]).to_series().value_counts())

21586
20517
14454
shape: (11, 2)
┌──────────────┬────────┐
│ category     ┆ counts │
│ ---          ┆ ---    │
│ str          ┆ u32    │
╞══════════════╪════════╡
│ ACCOUNT      ┆ 3251   │
│ ORDER        ┆ 2152   │
│ REFUND       ┆ 1527   │
│ SHIPPING     ┆ 1156   │
│ DELIVERY     ┆ 1102   │
│ …            ┆ …      │
│ INVOICE      ┆ 1076   │
│ PAYMENT      ┆ 1028   │
│ FEEDBACK     ┆ 1004   │
│ CANCEL       ┆ 539    │
│ SUBSCRIPTION ┆ 537    │
└──────────────┴────────┘
✅ Split sizes:
Train: 4154
Val:   890
Test:  893
RAG:  5044

📊 Category distribution in test set:
shape: (4, 2)
┌──────────┬───────┐
│ category ┆ count │
│ ---      ┆ ---   │
│ str      ┆ u32   │
╞══════════╪═══════╡
│ DELIVERY ┆ 166   │
│ ORDER    ┆ 323   │
│ REFUND   ┆ 230   │
│ SHIPPING ┆ 174   │
└──────────┴───────┘


In [5]:
# # --- Setup LanceDB with FAQ dataset ---
# embedding_model = get_registry().get("colbert").create(name="colbert-ir/colbertv2.0")

# class FAQModel(LanceModel):
#     """Schema for FAQ vector table"""
#     text: str = embedding_model.SourceField()
#     vector: Vector(embedding_model.ndims()) = embedding_model.VectorField()
#     category: str
#     intent: str


# def create_faq_table(df):
#     """
#     Create a LanceDB table from FAQ dataframe
#     """
#     db = lancedb.connect("/home/zorin17/Desktop/LLM/")
#     table = db.create_table(
#         "LANCEDB_FAQ",
#         schema=FAQModel,
#         mode="overwrite",
#     )
    
#     # Combine question + answer into one text for embedding
#     entries = []
#     for row in df.iter_rows(named=True):   # Polars way
#         entry = {
#             "text": f"Question: {row['instruction']}\nAnswer: {row['response']}",
#             # "text": row['response'],
#             "category": row["category"],
#             "intent": row["intent"],
#         }
#         entries.append(entry)

#     table.add(pd.DataFrame(entries))  # LanceDB expects pandas
#     return table

# # Create the LanceDB table (first time setup only)
# table = create_faq_table(rag_data)

In [6]:
# Read data from Lancedb table
def load_faq_table():
    """
    Load the existing LanceDB FAQ table
    """
    db = lancedb.connect("/home/zorin17/Desktop/LLM/")
    table = db.open_table("LANCEDB_FAQ")
    return table


table = load_faq_table()

In [None]:
# ---------- your models ----------
MODELS = {
    "llama": {
        "base":    "meta-llama/Llama-3.2-1B",
        "adapter": "qlora-outputs/Llama-3.2-1B-faq",
    },
    "qwen": {
        "base":    "Qwen/Qwen3-0.6B-Base",
        "adapter": "qlora-outputs/Qwen3-0.6B-Base-faq",
    },
    "olmo": {
        "base":    "allenai/OLMo-2-0425-1B",
        "adapter": "qlora-outputs/OLMo-2-0425-1B-faq",
    },
}

# choose model key
MODEL_KEY = "qwen"  # "llama" | "qwen" | "olmo"
BASE_MODEL  = MODELS[MODEL_KEY]["base"]
ADAPTER_DIR = MODELS[MODEL_KEY]["adapter"]

USE_CUDA = torch.cuda.is_available()
# load tokenizer + model once
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# --- load base in 4-bit + attach LoRA adapter ---
supports_bf16 = USE_CUDA and torch.cuda.get_device_capability(0)[0] >= 8
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if supports_bf16 else torch.float16,
)

base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb,
    device_map="auto",
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16 if supports_bf16 else torch.float16,
    trust_remote_code=True,
)

hf_model = PeftModel.from_pretrained(base, ADAPTER_DIR)  # <-- attach your QLoRA adapter
# re-enable cache for inference
if getattr(hf_model.config, "use_cache", None) is not True:
    hf_model.config.use_cache = True

USE_CUDA = torch.cuda.is_available()

custom_template = """{% if messages | selectattr('role','equalto','system') | list %}
System: {{ (messages | selectattr('role','equalto','system') | map(attribute='content') | list) | join('\\n') }}
{% endif %}
{% for m in messages %}
{% if m['role'] == 'user' -%}
User: {{ m['content'] }}
{% elif m['role'] == 'assistant' -%}
Assistant: {{ m['content'] }}
{% elif m['role'] == 'tool' -%}
Tool: {{ m['content'] }}
{% elif m['role'] == 'developer' -%}
System: {{ m['content'] }}
{% else -%}
{{ m['role']|capitalize }}: {{ m['content'] }}
{% endif -%}
{% endfor %}
Assistant:"""

# custom_template = """{% for message in messages -%}
# <|im_start|>{{ message['role'] }}
# {{ message['content'] }}<|im_end|>
# {% endfor -%}
# {% if add_generation_prompt %}<|im_start|>assistant
# {% endif %}"""


# --- Search with optional filtering ---
def search(query, table, top_k=5, category=None, intent=None):
    search_obj = table.search(query).limit(top_k)

    # Apply filters using `.where()`
    if category:
        search_obj = search_obj.where(f"category = '{category}'")
    if intent:
        search_obj = search_obj.where(f"intent = '{intent}'")

    result = search_obj.to_list()

    if not result:
        return "[Context 1]:\n(no relevant context found)"
    
    # Format context with citation numbering
    contexts = []
    for i, r in enumerate(result, 1):
        text = r.get("text", "").strip()
        contexts.append(f"[Context {i}]:\n{text}\n")   # <-- with label + line breaks
    return "\n".join(contexts)


def generate(base_prompt, question, context, temperature=0.1, max_new_tokens=512):
    """
    HF-only generator. Builds a single-string prompt and decodes ONLY new tokens
    after the prompt to avoid echoing.
    """

    system_content = f"{base_prompt.format(question, context)}"

    # skip chat template to be fair for other two models
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": f"Question: {question}\n\nContext: {context}"}
    ]

    # tokenize and move to device

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,  # adds the assistant turn
        chat_template = custom_template
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(hf_model.device)
    
    # inputs = tokenizer(system_content, return_tensors="pt").to(hf_model.device)
    input_len = inputs["input_ids"].shape[-1]

    # generate
    outputs = hf_model.generate(
        **inputs,
        do_sample=(temperature > 0),
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=0.95,
        no_repeat_ngram_size=3,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True,
    )

    # decode ONLY the newly generated tokens (exclude prompt)
    new_tokens = outputs.sequences[0, input_len:]
    response_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # markdown -> HTML (for your existing UI)
    return markdown.markdown(response_text)


# --- Full RAG ---
def rag(question, table, base_prompt, temperature=0.0, category=None, intent=None):
    context = search(question, table, top_k=5, category=category, intent=intent)
    answer = generate(base_prompt, question, context, temperature)
    return answer

In [8]:
def enforce_sentence_structure_multi_p(html_text: str) -> str:
    """
    Capitalize sentences inside multiple <p>...</p> blocks.
    Handles:
    - First letter of each paragraph
    - After punctuation (.!?)
    - After "Step X:"
    - After numbered lists like "1)" or "2."
    - After colons (:)
    - Title Case for all words inside quotes ("..." or '...')
    """

    def title_case_inside_quotes(match):
        # Take the quoted content and apply Title Case
        content = match.group(2)
        content_tc = " ".join(w.capitalize() for w in content.split())
        return match.group(1) + content_tc + match.group(3)

    def fix_text(text: str) -> str:
        text = text.strip()

        # Capitalize very first letter of the paragraph
        text = re.sub(r'^[a-z]', lambda m: m.group(0).upper(), text)

        # Capitalize after punctuation (.!?)
        text = re.sub(r'([.!?]\s+)([a-z])',
                      lambda m: m.group(1) + m.group(2).upper(),
                      text)

        # Capitalize after "Step X:"
        text = re.sub(r'(Step\s*\d+:)(\s*)([a-z])',
                      lambda m: m.group(1) + m.group(2) + m.group(3).upper(),
                      text,
                      flags=re.IGNORECASE)

        # Capitalize after numbered list "1)" or "2."
        text = re.sub(r'(\d+[\)\.]\s*)([a-z])',
                      lambda m: m.group(1) + m.group(2).upper(),
                      text)

        # Capitalize after colon ":"
        text = re.sub(r'(:\s*)([a-z])',
                      lambda m: m.group(1) + m.group(2).upper(),
                      text)

        # Title Case inside double quotes
        text = re.sub(r'(")([^"]+)(")', title_case_inside_quotes, text)

        return text

    def fix_paragraph(match):
        inner = match.group(1).strip()
        return f"<p>{fix_text(inner)}</p>"

    return re.sub(r"<p>(.*?)</p>", fix_paragraph, html_text,
                  flags=re.DOTALL | re.IGNORECASE)


In [11]:
# Define prompt template
base_prompt = """You are a helpful retail assistant. Your task is to answer the user question using provided contexts as the answer. 
You must make your response organized and structured.

User question: {}
Contexts:
{}
"""

# base_prompt = """You are a helpful retail assistant. Your task is to answer the user question using exactly one of the provided contexts as the answer. 
# You must make your response organized and structured.

# User question: {}
# Contexts:
# {}
# """

# Ask a quesition
question = "i want to check my order"
answer = rag(question, table, base_prompt, category="ORDER")

display(HTML(enforce_sentence_structure_multi_p(answer)))

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
