# AI-Powered Customer Support Chatbot

This project implements an intelligent conversational assistant for
customer support using NLP techniques, web scraping, and a Flask-based API.

Features:
- Domain-specific question answering
- Follow-up question handling
- Intent detection
- Product scraping and indexing
- Real-time WebSocket communication

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import re
import os
import math
from datetime import datetime
import numpy as np

# NLP imports
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# URL to Name
from urllib.parse import urlparse

In [2]:
def url_to_name(url):
    parsed = urlparse(url)

    # ---- Extract domain safely ----
    netloc = parsed.netloc

    # Remove common subdomains like www, m, shop, app, etc.
    parts = netloc.split(".")
    if parts[0] in ["www", "m", "app", "shop"]:
        parts = parts[1:]  # remove prefix
    
    # domain name (before TLD)
    domain = parts[0]

    # ---- Extract path parts ----
    path_parts = [p for p in parsed.path.split('/') if p]

    # ---- Build final name ----
    if path_parts:
        return domain + "_" + "_".join(path_parts)
    else:
        return domain

In [3]:
# ----------------- USER CONFIG -----------------
BASE_URL = "https://gangslifestyle.com/"

In [4]:
JSON_ENDPOINT = f"{BASE_URL}/products.json"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
}
name = url_to_name(BASE_URL);
OUTPUT_CSV = f"{name}.csv"    

# Preprocessing config (from your Code 2)
CATEGORY_COVERAGE_THRESHOLD = 0.5
VARIANT_COLOR_COMBINE = True
SUMMARY_SENTENCES = 1
# ------------------------------------------------

# Ensure resources for NLTK (will check & download if needed)
nltk_data = ['punkt', 'stopwords', 'wordnet', 'omw-1.4']
for r in nltk_data:
    try:
        nltk.data.find(r)
    except Exception:
        nltk.download(r)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pavan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pavan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pavan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\pavan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# ----------------- SCRAPER (variant-mode) -----------------
def scrape_shopify_products(json_endpoint=JSON_ENDPOINT, headers=HEADERS, sleep_sec=1):
    all_variants_data = []
    page = 1
    print("--- Starting Scrape (Shopify JSON - Detailed Variant Mode) ---")
    while True:
        url = f"{json_endpoint}?page={page}&limit=250"
        print(f"Fetching Page {page}: {url}")
        try:
            response = requests.get(url, headers=headers, timeout=20)
            if response.status_code != 200:
                print(f"  Status code {response.status_code}. Stopping.")
                break
            data = response.json()
            if 'products' not in data or not data['products']:
                print("  No more products found in JSON. Crawl finished.")
                break

            products = data['products']
            print(f"  Found {len(products)} products on page {page} ...")

            for product in products:
                try:
                    product_title = product.get('title', 'N/A')
                    handle = product.get('handle', '')
                    vendor = product.get('vendor', 'N/A')
                    category = product.get('product_type', 'N/A')

                    raw_html = product.get('body_html', '')
                    if raw_html:
                        soup = BeautifulSoup(raw_html, 'html.parser')
                        functional_details = soup.get_text(separator=' ', strip=True)
                    else:
                        functional_details = "N/A"

                    # tags: product['tags'] may be a string or list depending on store — handle both
                    tags_val = product.get('tags', '')
                    if isinstance(tags_val, list):
                        tags = ', '.join(tags_val)
                    else:
                        # sometimes Shopify returns a single string with commas
                        tags = tags_val if tags_val else ''

                    main_image_url = "N/A"
                    if product.get('images'):
                        # images is list of dicts with 'src'
                        try:
                            if isinstance(product['images'], list) and product['images']:
                                main_image_url = product['images'][0].get('src', 'N/A')
                            else:
                                main_image_url = product['images']
                        except Exception:
                            main_image_url = "N/A"

                    for variant in product.get('variants', []):
                        variant_title = variant.get('title', 'N/A')
                        variant_id = variant.get('id')
                        price = variant.get('price', 'N/A')
                        original_price = variant.get('compare_at_price')  # "was" price
                        sku = variant.get('sku', 'N/A')
                        available = variant.get('available', False)

                        link = f"{BASE_URL}/products/{handle}?variant={variant_id}" if handle and variant_id else f"{BASE_URL}"

                        if original_price and original_price != price:
                            discount_info = f"Was {original_price}"
                        else:
                            discount_info = "No Discount"

                        all_variants_data.append({
                            'Product Name': product_title,
                            'Variant Name': variant_title,
                            'SKU': sku,
                            'In Stock?': available,
                            'Price': price,
                            'Original Price': original_price if original_price else "",
                            'Discount Info': discount_info,
                            'Vendor (Brand)': vendor,
                            'Category': category,
                            'Tags': tags,
                            'Functional Details': functional_details,
                            'Link': link,
                            'Main Image URL': main_image_url
                        })

                except Exception as e:
                    print(f"  Error parsing product '{product.get('title', 'unknown')}': {e}")

            page += 1
            time.sleep(sleep_sec)

        except Exception as e:
            print(f"  Error fetching URL or parsing JSON: {e}")
            break

    return pd.DataFrame(all_variants_data)

In [6]:
# ----------------- PREPROCESSING PIPELINE (adapted from Code 2) -----------------
# Helpers (kept from your code)
_currency_re = re.compile(r'[^\d.,\-]+')
def parse_price(v):
    if pd.isna(v):
        return np.nan
    s = str(v).strip()
    if s == '' or s.lower() in ['nan','none','null']:
        return np.nan
    s = _currency_re.sub('', s).replace(',', '')
    try:
        return float(s)
    except:
        nums = re.findall(r'[-+]?\d*\.\d+|\d+', s)
        return float(nums[0]) if nums else np.nan

In [7]:
def normalize_stock(v):
    if pd.isna(v): return 'Unknown'
    s = str(v).strip().lower()
    if s in ['true','yes','1','in stock','available','instock', 'True', 'TRUE', '1']: return 'In Stock'
    if s in ['false','no','0','out of stock','sold out','not available', 'False', 'FALSE', '0']: return 'Out of Stock'
    return 'Unknown'

In [8]:
MARKETING_WORDS = {'buy now','best','new','free shipping','hot','sale','discount','offer','trending'}
def clean_title(t):
    if pd.isna(t): return ''
    s = re.sub(r'\s+', ' ', str(t).strip())
    for w in MARKETING_WORDS:
        # case-insensitive whole-word removal
        s = re.sub(r'(?i)\b' + re.escape(w) + r'\b', '', s)
    return re.sub(r'\s+', ' ', s).strip()

In [9]:
COLOR_WORDS = {'black','white','red','blue','green','yellow','pink','orange','purple','brown','grey','gray','silver','gold','navy'}
def variant_looks_like_color(v):
    if pd.isna(v) or str(v).strip() == '': return False
    parts = re.split(r'[,/;|-]+', str(v).lower())
    return any(p.strip() in COLOR_WORDS for p in parts)

In [10]:
def clean_functional_text(txt):
    if pd.isna(txt): return ''
    s = re.sub(r'<[^>]+>', ' ', str(txt))
    s = re.sub(r'[\r\n\t]+', ' ', s)
    return re.sub(r'\s+', ' ', s).strip()

In [11]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [12]:
def lemmatize_text(text):
    # safe guard: if text empty
    if not isinstance(text, str) or text.strip() == '':
        return ''
    words = word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(w.lower()) for w in words if w.isalnum())

In [13]:
def extractive_summary(text, n_sentences=1):
    if not isinstance(text, str) or not text.strip():
        return ''
    sents = sent_tokenize(text)
    if len(sents) <= n_sentences:
        return ' '.join(sents)
    try:
        vec = TfidfVectorizer(stop_words='english')
        X = vec.fit_transform(sents)
        centroid = X.sum(axis=0)
        scores = X.dot(centroid.T).A.ravel()
        idx = scores.argsort()[::-1][:n_sentences]
        idx = sorted(idx)
        return ' '.join(sents[i].strip() for i in idx)
    except Exception:
        return sents[0]

In [14]:
def clean_url(u):
    if pd.isna(u): return ''
    s = str(u).strip()
    return re.sub(r'[\?&]$', '', s)

In [15]:
def looks_like_image_url(u):
    if pd.isna(u): return False
    return bool(re.search(r'\.(jpg|jpeg|png|webp|gif)$', str(u), flags=re.I))

In [16]:
def preprocess_variants_df(df_raw):
    """
    Accepts raw DataFrame similar to Code 1 output and returns cleaned DataFrame similar to Code 2 output.
    """
    if df_raw is None or df_raw.shape[0] == 0:
        print("No raw data to preprocess.")
        return pd.DataFrame()

    df = df_raw.copy()

    # Standardize column names if needed (allow both exact names and a few variants)
    # Map expected input columns to short keys used inside pipeline.
    col_map = {
        'Product Name': 'title',
        'Variant Name': 'variant',
        'SKU': 'sku',
        'In Stock?': 'instock',
        'Price': 'price',
        'Original Price': 'original_price',
        'Discount Info': 'discount_info',
        'Category': 'category',
        'Tags': 'tags',
        'Functional Details': 'functional',
        'Link': 'product_url',
        'Main Image URL': 'image_url'
    }

    # Create working frame with keys from col_map
    working = pd.DataFrame()
    for short, long in col_map.items():
        if short in df.columns:
            working[long] = df[short].fillna('')
        else:
            # if expected column not present, create empty series
            working[long] = ''

    # 1. Clean title
    working['title'] = working['title'].apply(clean_title)

    # 2. Combine variant into title when variant looks like a color
    if VARIANT_COLOR_COMBINE:
        # ensure we have 'variant' present
        working['variant'] = working.get('variant', '')
        working['title'] = working.apply(
            lambda r: f"{r['title']} (Color: {r['variant']})" if variant_looks_like_color(r['variant']) else r['title'],
            axis=1
        )

    # 3. SKU fallback
    working['sku'] = working['sku'].astype(str).str.strip()
    missing = working['sku'] == ''
    if missing.any():
        working.loc[missing, 'sku'] = [f"MISSINGSKU_{i}" for i in range(1, missing.sum() + 1)]

    # 4. Stock normalize
    working['stock_status'] = working['instock'].apply(normalize_stock)

    # 5. Price parsing
    working['price_parsed'] = working['price'].apply(parse_price)
    working['original_price_parsed'] = working['original_price'].apply(parse_price)

    def compute_prices(r):
        p = r['price_parsed']
        o = r['original_price_parsed']
        if pd.isna(o) or o == 0:
            o = p
        if pd.isna(p) and not pd.isna(o):
            p = o
        if pd.isna(p) or pd.isna(o):
            disc = np.nan
        else:
            disc = 0 if o == p else round((o - p) / o * 100, 1) if o > p else 0
        return pd.Series([p, o, disc])

    working[['price_current', 'price_original', 'discount_percent']] = working.apply(compute_prices, axis=1)

    # 6. Category/tags remove if < threshold
    def keep(col):
        filled = (working[col].astype(str).str.strip() != '').sum()
        return (filled / len(working)) >= CATEGORY_COVERAGE_THRESHOLD

    if not keep('category'):
        if 'category' in working.columns:
            working.drop(columns=['category'], inplace=True)

    if not keep('tags'):
        if 'tags' in working.columns:
            working.drop(columns=['tags'], inplace=True)

    # 7. Clean functional description
    working['long_description'] = working['functional'].apply(clean_functional_text)

    # 8. Summary
    working['summary'] = working['long_description'].apply(lambda t: extractive_summary(t, SUMMARY_SENTENCES))

    # 9. Lemmatized text (safe-guard for heavy computation)
    working['indexed_text_lemma'] = working.apply(
        lambda r: lemmatize_text(str(r.get('title','')) + ' ' + str(r.get('summary','')) + ' ' + str(r.get('long_description',''))),
        axis=1
    )

    # 10. Clean URL
    working['product_url'] = working['product_url'].apply(clean_url)

    # 11. Image URL rule (keep or drop)
    img_frac = working['image_url'].apply(looks_like_image_url).mean()
    if 'image_url' in working.columns and img_frac > 0.99:
        working.drop(columns=['image_url'], inplace=True)

    # 12. Final search content
    working['search_content'] = (
        working.get('title', '') + " " + working.get('summary', '') + " " + working.get('long_description', '')
    )

    # Final export columns (NO vendor, NO notes)
    export_cols = [
        'sku','title','price_current','price_original','discount_percent',
        'stock_status','summary','long_description','search_content',
        'indexed_text_lemma','product_url'
    ]
    export_cols = [c for c in export_cols if c in working.columns]
    cleaned = working[export_cols].copy()

    return cleaned

In [17]:
# ----------------- Main flow -----------------
def main():
    # 1) Scrape
    raw_df = scrape_shopify_products()

    if raw_df is None or raw_df.shape[0] == 0:
        print("No data scraped. Exiting.")
        return

    # Save raw CSV (optional)
    try:
        raw_df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8')
        print("Saved raw variants CSV:", OUTPUT_CSV)
    except Exception as e:
        print("Could not save raw CSV:", e)

    # 2) Preprocess
    cleaned = preprocess_variants_df(raw_df)

    # 3) Save cleaned CSV
    try:
        cleaned.to_csv(OUTPUT_CSV, index=False, encoding='utf-8')
        print("Saved cleaned CSV:", OUTPUT_CSV)
        print("Rows in cleaned:", len(cleaned))
    except Exception as e:
        print("Could not save cleaned CSV:", e)

    # show head for quick check
    print("\n--- Raw head ---")
    print(raw_df.head())
    print("\n--- Cleaned head ---")
    print(cleaned.head())

In [18]:
if __name__ == "__main__":
    main()

--- Starting Scrape (Shopify JSON - Detailed Variant Mode) ---
Fetching Page 1: https://gangslifestyle.com//products.json?page=1&limit=250
  Found 32 products on page 1 ...
Fetching Page 2: https://gangslifestyle.com//products.json?page=2&limit=250
  No more products found in JSON. Crawl finished.
Saved raw variants CSV: gangslifestyle.csv
Saved cleaned CSV: gangslifestyle.csv
Rows in cleaned: 282

--- Raw head ---
                Product Name                Variant Name           SKU  \
0  The VelocityTrunk Luggage           Dark Grey / Small   GANGS2036-S   
1  The VelocityTrunk Luggage          Dark Grey / Medium   GANGS2036-M   
2  The VelocityTrunk Luggage           Dark Grey / Large   GANGS2036-L   
3  The VelocityTrunk Luggage  Dark Grey / Small + Medium  GANGS2036-SM   
4  The VelocityTrunk Luggage   Dark Grey / Small + Large  GANGS2036-SL   

   In Stock?    Price Original Price Discount Info   Vendor (Brand)  \
0       True  4599.00       37999.00  Was 37999.00  GANGS Lifesty

### GangsLifeStyle

In [19]:
import os
import re
import json
import time

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Optional NLTK for sentence splitting
import nltk
from nltk.tokenize import sent_tokenize

In [20]:
# minimal nltk assets required (download if missing)
for r in ['punkt']:
    try:
        nltk.data.find(r)
    except Exception:
        try:
            nltk.download(r, quiet=True)
        except Exception:
            pass

In [21]:
# ---------- CONFIG ----------
CSV_PATH = "gangslifestyle.csv"   # <- change if needed
SIMILARITY_THRESHOLD = 0.30
TOP_K_DEFAULT = 3
OUTPUT_DIR = "/kaggle/working"

# ---------- HELPERS ----------
IRRELEVANT_KEYWORDS = [
    "gym","dumbbell","exercise","workout","recipe","food","cooking",
    "weather","news","politics","relationship","doctor","medicine",
    "math","code","python","java","cpp"
]
_irrelevant_regex = re.compile(r'\b(' + r'|'.join(re.escape(w) for w in IRRELEVANT_KEYWORDS) + r')\b', flags=re.I)

In [22]:
def is_irrelevant(query: str) -> bool:
    if not query or str(query).strip() == "":
        return True
    return bool(_irrelevant_regex.search(query))

In [23]:
def safe_jsonify(obj):
    """Convert numpy / pandas types to JSON-serializable types."""
    if isinstance(obj, dict):
        return {safe_jsonify(k): safe_jsonify(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [safe_jsonify(x) for x in obj]
    if isinstance(obj, (np.integer,)):
        return int(obj)
    if isinstance(obj, (np.floating,)):
        return float(obj)
    if isinstance(obj, (np.ndarray,)):
        return obj.tolist()
    if pd.isna(obj):
        return None
    return obj

In [24]:
def choose_text_field(df: pd.DataFrame) -> str:
    """Pick the best field to index for retrieval (prefer indexed_text_lemma, then search_content, else fallback)."""
    if 'indexed_text_lemma' in df.columns and df['indexed_text_lemma'].astype(str).str.strip().any():
        return 'indexed_text_lemma'
    if 'search_content' in df.columns and df['search_content'].astype(str).str.strip().any():
        return 'search_content'
    for candidate in ['long_description','description','title']:
        if candidate in df.columns:
            return candidate
    return df.columns[0]

In [25]:
def build_tfidf_index(df: pd.DataFrame, field: str):
    vec = TfidfVectorizer(ngram_range=(1,2), min_df=1)
    mat = vec.fit_transform(df[field].astype(str).fillna('').values)
    return vec, mat

In [26]:
def query_search_from_index(query: str,
                             df: pd.DataFrame,
                             vectorizer: TfidfVectorizer,
                             tfidf_matrix,
                             text_field: str,
                             top_k: int):
    """
    Returns a DataFrame with:
      clean_text, product_title, url, score, sku,
      price_current, price_original, discount_percent, stock_status,
      summary, long_description
    """
    q_vec = vectorizer.transform([query])
    sims = cosine_similarity(q_vec, tfidf_matrix).ravel()
    idxs = sims.argsort()[::-1][:top_k]
    rows = []
    for i in idxs:
        row = df.iloc[i]
        rows.append({
            'clean_text': row.get('search_content', '') or row.get(text_field, ''),
            'product_title': row.get('title',''),
            'url': row.get('product_url','') or row.get('url',''),
            'score': float(sims[i]),
            'sku': row.get('sku',''),
            'price_current': row.get('price_current',''),
            'price_original': row.get('price_original',''),
            'discount_percent': row.get('discount_percent',''),
            'stock_status': row.get('stock_status',''),
            'summary': row.get('summary',''),
            'long_description': row.get('long_description','')
        })
    return pd.DataFrame(rows)

In [27]:
def extractive_summary_from_retrieved(retrieved_df, top_k=TOP_K_DEFAULT):
    if retrieved_df is None or retrieved_df.empty:
        return ""
    merged = " ".join(str(x) for x in retrieved_df['clean_text'].astype(str).head(top_k).tolist())
    merged = re.sub(r'[^a-zA-Z0-9\s\.\,\-]', ' ', merged)
    # split into sentences
    sents = re.split(r'(?<=[.!?])\s+', merged)
    return " ".join(sents[:2]).strip()

In [28]:
# ---------- FOLLOW-UP DETECTION ----------
def looks_like_followup(query: str, state: dict) -> bool:
    """
    Decide if this question should use the last product context.
    We want single-word follow-ups like "Price", "Discount", "Stock", "Status"
    to automatically refer to the last product.
    """
    q = query.strip().lower()
    if not state.get("last_top_product_title"):
        return False

    # obvious conversational starters
    if q.startswith("then ") or q.startswith("what about") or q.startswith("and "):
        return True

    # generic follow-up keywords (NOW includes 'discount' and 'offer')
    follow_keywords = [
        "price", "cost", "rate",
        "discount", "offer",
        "color", "colour", "size",
        "details", "more about", "explain",
        "stock", "availability", "status"
    ]
    if any(kw in q for kw in follow_keywords) and len(q.split()) <= 7:
        return True

    # references to previous / above product
    if ("above" in q or "previous" in q or "earlier" in q) and any(
        w in q for w in ["product", "item", "one"]
    ):
        return True

    # vague references, still treat as follow-up if short
    if any(w in q for w in ["this", "that", "it", "above", "previous", "earlier"]) and len(q.split()) <= 10:
        return True

    # extra safety: if query is VERY short (1–2 words) and looks like a price/stock/discount intent, use follow-up
    if len(q.split()) <= 2 and detect_question_intent(query) in ["price", "discount", "stock", "orig_price"]:
        return True

    return False

In [29]:
# ---------- INTENT DETECTION ----------
def detect_question_intent(query: str) -> str:
    """
    Returns one of: 'discount', 'orig_price', 'price', 'stock', 'detail', 'general'
    """
    q = query.lower()
    # discount / offer
    if any(kw in q for kw in ["discount", "offer", "% off", "off "]):
        return "discount"
    # original price / mrp
    if any(kw in q for kw in ["original price", "mrp", "actual price", "before discount"]):
        return "orig_price"
    # current price / price / cost
    if any(kw in q for kw in ["price", "cost", "rate"]):
        return "price"
    # stock / availability / status
    if any(kw in q for kw in ["stock", "in stock", "out of stock", "available", "availability", "status"]):
        return "stock"
    # more details / explanation (NOTE: we only trigger on *more* / explain style, not just the word "details")
    detail_phrases = [
        "more detail", "more details", "more in detail",
        "tell me more", "explain", "explanation",
        "full description", "describe", "more info", "more about this"
    ]
    if any(p in q for p in detail_phrases):
        return "detail"
    return "general"

In [30]:
# ---------- MAIN HANDLER WITH CONTEXT ----------
def handle_query_with_context(df: pd.DataFrame,
                              query: str,
                              vectorizer: TfidfVectorizer,
                              tfidf_matrix,
                              text_field: str,
                              state: dict,
                              top_k: int = TOP_K_DEFAULT):
    """
    df: dataframe loaded from CSV
    query: user question
    vectorizer, tfidf_matrix, text_field: pre-built retrieval objects
    state: dict with conversation memory (mutated in-place)
    """
    fallback_msg = "Sorry, I couldn't answer that. I can assist you with product, website, business, or item-related queries."

    if is_irrelevant(query):
        result = {"query": query, "top_results": [], "final_answer": fallback_msg, "product_links": []}
        state["last_query"] = query
        state["last_result"] = None
        state["last_top_product_title"] = None
        return result, state

    # Decide whether to use previous context
    use_followup = looks_like_followup(query, state)

    if use_followup:
        base_title = state.get("last_top_product_title", "")
        expanded_query = f"{query.strip()} for product: {base_title}"
        search_query = expanded_query
    else:
        search_query = query

    # Retrieval
    retrieved = query_search_from_index(query=search_query,
                                        df=df,
                                        vectorizer=vectorizer,
                                        tfidf_matrix=tfidf_matrix,
                                        text_field=text_field,
                                        top_k=top_k)
    if retrieved.empty:
        result = {"query": query, "top_results": [], "final_answer": fallback_msg, "product_links": []}
        state["last_query"] = query
        state["last_result"] = retrieved
        state["last_top_product_title"] = None
        return result, state

    best_score = float(retrieved['score'].max())
    if best_score < SIMILARITY_THRESHOLD:
        result = {
            "query": query,
            "top_results": safe_jsonify(retrieved.to_dict(orient='records')),
            "final_answer": fallback_msg,
            "product_links": []
        }
        state["last_query"] = query
        state["last_result"] = retrieved
        state["last_top_product_title"] = None
        return result, state

    # ---------- SPECIAL ANSWERS ----------
    intent = detect_question_intent(query)
    top = retrieved.iloc[0]
    title = (top.get("product_title") or "").strip()

    price_current = str(top.get("price_current") or "").strip()
    price_original = str(top.get("price_original") or "").strip()
    discount_percent = str(top.get("discount_percent") or "").strip()
    stock_status = str(top.get("stock_status") or "").strip()
    summary_txt = str(top.get("summary") or "").strip()
    long_desc = str(top.get("long_description") or "").strip()

    final_answer = None

    # 1) Price-type questions
    if intent == "price":
        if price_current and price_current.lower() != "nan":
            final_answer = f"The current price of {title} is {price_current}."
            if discount_percent and discount_percent.lower() != "nan":
                final_answer += f" It currently has a discount of {discount_percent}%."
        elif price_original and price_original.lower() != "nan":
            final_answer = f"The price information of {title} is not fully available, but the original price is {price_original}."
    elif intent == "orig_price":
        if price_original and price_original.lower() != "nan":
            final_answer = f"The original price (before discount) of {title} is {price_original}."
        elif price_current and price_current.lower() != "nan":
            final_answer = f"The original price is not available, but the current price of {title} is {price_current}."
    elif intent == "discount":
        if discount_percent and discount_percent.lower() != "nan":
            final_answer = f"{title} currently has a discount of {discount_percent}%."
        elif price_original and price_current and price_original.lower() != "nan" and price_current.lower() != "nan":
            try:
                po = float(price_original)
                pc = float(price_current)
                if po > 0:
                    disc = round((po - pc) / po * 100, 1)
                    final_answer = f"{title} has an approximate discount of {disc}%."
            except Exception:
                pass
    elif intent == "stock":
        if stock_status and stock_status.lower() != "nan":
            final_answer = f"{title} is currently {stock_status}."
        else:
            final_answer = f"The stock status of {title} is not clearly available."
    # 2) Detail questions → use long_description
    elif intent == "detail":
        if long_desc and long_desc.lower() != "nan":
            final_answer = f"Here are more details about {title}: {long_desc}"
        elif summary_txt:
            final_answer = f"Here is a summary of {title}: {summary_txt}"
    # 3) General questions → use summary first
    if intent == "general" and not final_answer:
        if summary_txt and summary_txt.lower() != "nan":
            final_answer = f"{title}: {summary_txt}"
        else:
            # fallback to extractive summary
            final_answer = extractive_summary_from_retrieved(retrieved, top_k=top_k) or fallback_msg

    # Safety: if none of the above set final_answer, still fallback
    if not final_answer:
        if summary_txt:
            final_answer = f"{title}: {summary_txt}"
        else:
            final_answer = extractive_summary_from_retrieved(retrieved, top_k=top_k) or fallback_msg

    product_links = [r.get('url','') for _, r in retrieved.head(top_k).iterrows() if r.get('url')]

    result = {
        "query": query,
        "top_results": safe_jsonify(retrieved.to_dict(orient='records')),
        "final_answer": final_answer,
        "product_links": product_links
    }

    # Update conversation state
    state["last_query"] = query
    state["last_result"] = retrieved
    state["last_top_product_title"] = top.get("product_title") or ""

    return result, state

In [31]:
def save_result_as_json(result: dict, base_name: str = "query_response"):
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    ts = int(time.time())
    path = os.path.join(OUTPUT_DIR, f"{base_name}_{ts}.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(safe_jsonify(result), f, indent=2, ensure_ascii=False)
    return path

In [32]:
# ---------- LOAD CSV + BUILD INDEX ONCE ----------
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV not found at: {CSV_PATH}")

In [None]:
df = pd.read_csv(CSV_PATH, dtype=str).fillna('')
print(f"Loaded CSV with {len(df)} rows from {CSV_PATH}")

text_field = choose_text_field(df)
print("Indexing text field:", text_field)
vectorizer, tfidf_matrix = build_tfidf_index(df, text_field)
print("TF-IDF matrix shape:", tfidf_matrix.shape)

# ---------- INTERACTIVE LOOP WITH CONTEXT ----------
conversation_state = {
    "last_query": None,
    "last_result": None,
    "last_top_product_title": None
}

print("\nType your questions. Type 'exit' or 'quit' to stop.\n")

while True:
    user_query = input("You: ").strip()
    if user_query.lower() in ["exit", "quit"]:
        print("Exiting.")
        break

    result, conversation_state = handle_query_with_context(
        df=df,
        query=user_query,
        vectorizer=vectorizer,
        tfidf_matrix=tfidf_matrix,
        text_field=text_field,
        state=conversation_state,
        top_k=TOP_K_DEFAULT
    )

    print("\nBot:", result["final_answer"])
    if result["product_links"]:
        print("Links:")
        for link in result["product_links"]:
            print(" -", link)

    save_path = save_result_as_json(result, base_name="query_response")
    print(f"(Saved response JSON to: {save_path})\n")


Loaded CSV with 282 rows from gangslifestyle.csv
Indexing text field: indexed_text_lemma
TF-IDF matrix shape: (282, 364)

Type your questions. Type 'exit' or 'quit' to stop.


Bot: Maximus Overnight Backpack - 30L: Introducing the epitome of style and functionality – the Premium Gang's Backpack.
Links:
 - https://gangslifestyle.com//products/maximus-overnight-backpack-30l?variant=49701979586872
 - https://gangslifestyle.com//products/maximus-overnight-backpack-30l?variant=49701979554104
 - https://gangslifestyle.com//products/maximus-overnight-backpack-30l?variant=49701979521336
(Saved response JSON to: /kaggle/working\query_response_1770403553.json)


Bot: The current price of Maximus Overnight Backpack - 30L is 1399.0. It currently has a discount of 53.4%.
Links:
 - https://gangslifestyle.com//products/maximus-overnight-backpack-30l?variant=49701979586872
 - https://gangslifestyle.com//products/maximus-overnight-backpack-30l?variant=49701979554104
 - https://gangslifestyle.com//produ