# Transformers - RAG shape (High-Level)

**Intent triage (classification) and policy summarization with grounding (placeholders)**

Retrieval-Augmented Generation (RAG) is an AI architecture that enhances large language models (LLMs) by giving them access to external, up-to-date knowledge bases at query time. This makes their responses more accurate, current, and specific to a given context without requiring the model to be retrained.

In [None]:
import os, json, math, random, time, textwrap
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt, warnings
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Seed both random number generators used by this notebook so that reruns
# produce the same results.
random.seed(42)
np.random.seed(42)

# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# Output directory (relative to the current working directory); created on
# first run and reused afterwards.
ART = Path("artifacts")
ART.mkdir(exist_ok=True)
print("Artifacts ->", ART.resolve())

## Lightweight intent triage with bag-of-words (encoder proxy)

In [None]:
# Synthetic support tickets, each paired with its intent label.
base_tickets = [
    ("Where is my order? tracking shows no movement", "Shipping"),
    ("Need to change my shipping address asap", "Order Changes"),
    ("The boots are defective and I want a refund", "Returns"),
    ("What does the warranty cover for the tent poles?", "Warranty"),
    ("My package arrived late and box is damaged", "Shipping"),
    ("Cancel my order please before it ships", "Order Changes"),
]
extra_tickets = [
    ("The seam ripped after one hike", "Returns"),
    ("Exchange for a different size", "Order Changes"),
    ("Return label please", "Returns"),
    ("How long do returns take?", "Returns"),
]

# Split the (text, label) pairs into parallel lists, then append the extras.
texts, labels = map(list, zip(*base_tickets))
more, labels_more = map(list, zip(*extra_tickets))
texts = texts + more
labels = labels + labels_more

# Unigram + bigram TF-IDF features feeding a logistic-regression classifier.
vec = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
X = vec.fit_transform(texts)
y = np.array(labels)
clf = LogisticRegression(max_iter=300)
clf.fit(X, y)

# Predict the intent of a few unseen tickets.
test = [
    "Need to exchange the boots due to size",
    "When will my package arrive?",
    "Does warranty include poles?",
]
Xt = vec.transform(test)
pred = clf.predict(Xt)
for t, p in zip(test, pred):
    print(p, "—", t)

# NOTE: the report and confusion matrix below are computed on the same tickets
# the classifier was fit on, so they describe training fit, not generalization.
train_pred = clf.predict(X)
print(classification_report(y, train_pred))
cm = confusion_matrix(y, train_pred, labels=sorted(set(y)))
print("Confusion\n", cm)

## Grounded summarization placeholder (RAG shape without an LLM)

In [None]:
# Retrieval step of the RAG shape: score each policy paragraph against the
# query with TF-IDF cosine similarity and keep the best-scoring paragraph as
# the grounding passage.
policy_docs = [
    ("returns", "Customers may return items within 30 days of delivery. Items must be unworn and in original packaging. Refunds processed to the original method."),
    ("warranty", "Footwear is covered for manufacturing defects for 1 year. Normal wear and tear, mis-use, and modifications are excluded."),
    ("shipping", "Standard shipping delivers in 3-5 business days within the continental US. Tracking updates may be delayed by carriers."),
]
corpus = [passage for _, passage in policy_docs]
q = "What is the warranty for boots?"

# Fit on the policy corpus only. The query is projected onto the fitted
# vocabulary/IDF, so it never alters the index weights and the same fitted
# vectorizer can serve any number of queries without refitting. Query terms
# absent from the corpus are ignored; they match no passage, so they could
# only rescale every score by the same factor.
# NOTE: this rebinds `vec`, replacing the triage vectorizer defined earlier.
vec = TfidfVectorizer().fit(corpus)
C = vec.transform(corpus)
Q = vec.transform([q])

sims = cosine_similarity(Q, C).ravel()  # one similarity score per passage
best_idx = int(np.argmax(sims))
best_doc = corpus[best_idx]

print("Query:", q)
print("Top passage:", best_doc)
if sims[best_idx] <= 0.0:
    # argmax over all-zero scores returns index 0, which would otherwise be
    # presented as if it were a real match.
    print("Note: no policy passage shares a term with the query; the passage above is not a grounded match.")