In [None]:
# --------------------------------- Setup and Imports --------------------------------- #
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack

# Download necessary NLTK data
# ('stopwords' feeds the stop-word set below; 'wordnet' is fetched for the lemmatizer)
nltk.download('stopwords')
nltk.download('wordnet')

# NLTK helpers used to build the shared text-cleaning resources
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Module-level resources read by clean_text():
#   stop_words - English stop words, held in a set for fast membership tests
#   lemmatizer - single WordNetLemmatizer instance reused for every token
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# --------------------------------- Text Cleaning --------------------------------- #
def clean_text(text):
    """Normalize raw ticket text into a space-separated string of lemmas.

    The text is lowercased, every character that is not ``a-z`` or
    whitespace is deleted (not replaced by a space), English stop words are
    dropped, and the remaining tokens are lemmatized with the module-level
    ``lemmatizer``.

    Args:
        text: Raw text. Anything that is not a ``str`` (``None``, NaN, numbers)
            is treated as empty instead of raising ``AttributeError``.

    Returns:
        str: The cleaned text; an empty string when no token survives.
    """
    if not isinstance(text, str):
        return ''

    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing whitespace, so no separate squeeze is needed.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )

In [None]:
# --------------------------------- Load and Preprocess Dataset --------------------------------- #
# Load the ticket export and give the two columns we model short names.
# NOTE: this is an absolute path; adjust it when running outside the
# environment the file was uploaded to.
df = pd.read_csv("/content/customer_support_tickets.csv")
df = df.rename(columns={'Ticket Description': 'text', 'Ticket Type': 'label'})
df = df.dropna(subset=['text', 'label'])

# Drop near-empty descriptions first, then the longest 5% of what remains
# (the quantile is computed on the already-filtered frame).
df = df[df['text'].str.len() > 5]
text_lengths = df['text'].str.len()
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the original frame
# (avoids pandas' SettingWithCopyWarning).
df = df[text_lengths < text_lengths.quantile(0.95)].copy()

# Rows without a description were already removed by dropna above, so no
# placeholder fill is needed before cleaning.
df['text'] = df['text'].apply(clean_text)

# Simulated metadata
np.random.seed(42)
df['customer_age_days'] = np.random.randint(30, 2000, size=len(df))
df['product_category'] = np.random.choice(['Electronics', 'Clothing', 'Books'], size=len(df))
df['ticket_hour'] = np.random.randint(0, 24, size=len(df))

# Only 'Product inquiry' tickets get a (random) sub-label; every other ticket
# gets the literal string 'None'. One draw per matching row, in row order.
sub_label_options = ['Availability', 'Warranty', 'Specification']
df['sub_label'] = [
    np.random.choice(sub_label_options) if label == 'Product inquiry' else 'None'
    for label in df['label']
]


In [None]:
# --------------------------------- Vectorization --------------------------------- #
# Text features: TF-IDF over the cleaned ticket text.
tfidf = TfidfVectorizer(max_features=5000)
X_text = tfidf.fit_transform(df['text'])

# Metadata features: one-hot encoded category and hour.
# handle_unknown='ignore' encodes a value that was not present at fit time as
# an all-zero group instead of raising, so SupportChatbot.predict_intent does
# not crash on an unexpected product category or hour.
ohe = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
X_meta = ohe.fit_transform(df[['product_category', 'ticket_hour']])
X_combined = hstack([X_text, X_meta])
y_main = df['label']

# NOTE(review): both encoders above are fit on the full dataset before this
# split, so X_test/y_test are not a leakage-free hold-out. They are also not
# evaluated anywhere in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_main, stratify=y_main, random_state=42)
main_model = LogisticRegression(max_iter=1000, class_weight='balanced')
main_model.fit(X_train, y_train)

# Sub-model for Product Inquiry
# Trained on text features only (no metadata), on the 'Product inquiry' rows.
df_sub = df[df['label'] == 'Product inquiry']
X_sub = tfidf.transform(df_sub['text'])
y_sub = df_sub['sub_label']
sub_model = LogisticRegression(max_iter=1000)
sub_model.fit(X_sub, y_sub)

# Nearest Neighbor
# Cosine-distance index over the TF-IDF vectors. SupportChatbot builds its own
# index in __init__; this module-level one is not referenced by the code shown here.
nn = NearestNeighbors(n_neighbors=3, metric='cosine')
nn.fit(X_text)

In [None]:
# --------------------------------- Slot and Intent Configs --------------------------------- #
# Slots the chatbot must collect before a request for a given intent is
# submitted. Intents without an entry need no slots at all.
INTENT_SLOTS = {
    "Refund request": ["product_name", "order_id", "reason"],
    "Technical issue": ["product_name", "issue_description"],
    # Every product-inquiry sub-intent only needs to know the product.
    **{
        f"Product inquiry → {subtype}": ["product_name"]
        for subtype in ("Warranty", "Availability", "Specification")
    },
}

# Keyword fallback used when the classifier is not confident enough.
# Entries are scanned in insertion order and the first intent with a keyword
# contained in the (lowercased) message wins, so the order here matters.
AUTO_INTENT_KEYWORDS = dict([
    ("Refund request", ["return", "refund"]),
    ("Technical issue", ["not working", "broken", "issue", "problem", "error"]),
    ("Billing inquiry", ["bill", "payment", "charged", "invoice"]),
    ("Cancellation request", ["cancel", "cancellation"]),
    ("Product inquiry", ["specification", "available", "availability", "warranty"]),
])


In [None]:
# --------------------------------- Chatbot Class --------------------------------- #
class SupportChatbot:
    """Multi-turn, slot-filling support chatbot on top of an intent classifier.

    The first message of a conversation is classified into an intent (with a
    keyword fallback when the classifier is not confident). The bot then asks
    for each slot listed for that intent in ``INTENT_SLOTS`` until all are
    filled, confirms the request and resets the session.

    Attributes:
        chat_history: List of ``{"role", "message"}`` dicts, oldest first.
        session: State of the conversation in progress. Empty when idle;
            otherwise holds ``intent``, ``slots`` (filled so far), ``state``
            and ``awaiting`` (the slot the bot asked for last, or ``None``).
        nn: Cosine nearest-neighbour index over the ticket texts. It is built
            in ``__init__`` but not queried by any method of this class.
    """

    # Slots answered in free text: when the bot has just asked for one of
    # these, the user's whole reply is an acceptable value.
    _FREE_TEXT_SLOTS = ("product_name", "reason", "issue_description")

    # Metadata used by handle_user when the caller passes none
    # (product_category, ticket_hour).
    _DEFAULT_META = ('Electronics', 12)

    def __init__(self, main_model, sub_model, tfidf, ohe, df):
        """Store the fitted models/encoders and build the neighbour index.

        Args:
            main_model: Classifier over combined text + metadata features,
                exposing ``predict_proba`` and ``classes_``.
            sub_model: Classifier over text features only, used to refine
                'Product inquiry' into a sub-intent.
            tfidf: Fitted text vectorizer.
            ohe: Fitted encoder for ``product_category`` / ``ticket_hour``.
            df: DataFrame with a ``text`` column of ticket descriptions.
        """
        self.main_model = main_model
        self.sub_model = sub_model
        self.tfidf = tfidf
        self.ohe = ohe
        self.df = df
        self.chat_history = []
        self.session = {}
        self.nn = NearestNeighbors(n_neighbors=3, metric='cosine')
        # NOTE(review): clean_text is applied to df['text'] here; if the caller
        # passes an already-cleaned frame the text is cleaned a second time.
        self.nn.fit(tfidf.transform(df['text'].apply(clean_text)))

    @staticmethod
    def _required_slots(intent):
        """Return the slots for ``intent``, falling back to its main intent."""
        return INTENT_SLOTS.get(intent) or INTENT_SLOTS.get(intent.split(" → ")[0], [])

    def add_message(self, role, message):
        """Append one message to the chat history."""
        self.chat_history.append({"role": role, "message": message})

    def predict_intent(self, text, meta):
        """Classify a message into an intent.

        Args:
            text: Raw user message.
            meta: ``[product_category, ticket_hour]`` for the message.

        Returns:
            tuple: ``(intent, confidence)`` where ``confidence`` is the highest
            class probability of the main model. For 'Product inquiry' the
            intent is ``"Product inquiry → <sub_label>"`` as predicted by the
            sub-model; the confidence is still the main model's.
        """
        cleaned = clean_text(text)
        vec_text = self.tfidf.transform([cleaned])
        meta_df = pd.DataFrame([meta], columns=['product_category', 'ticket_hour'])
        vec_meta = self.ohe.transform(meta_df)
        vec_combined = hstack([vec_text, vec_meta])

        probs = self.main_model.predict_proba(vec_combined)
        confidence = probs.max()
        label = self.main_model.classes_[probs.argmax()]

        if label == 'Product inquiry':
            # The sub-model was trained on text features only.
            sub_label = self.sub_model.predict(vec_text)[0]
            return f"{label} → {sub_label}", confidence

        return label, confidence

    def fallback_intent(self, text):
        """Return the first intent whose keyword occurs in ``text``, else None.

        Matching is a case-insensitive substring test, tried in the order of
        ``AUTO_INTENT_KEYWORDS``.
        """
        text_lower = text.lower()
        for intent, keywords in AUTO_INTENT_KEYWORDS.items():
            if any(kw in text_lower for kw in keywords):
                return intent
        return None

    def extract_slots(self, intent, text):
        """Heuristically pull slot values for ``intent`` out of one message.

        Args:
            intent: Intent label (optionally ``"main → sub"``).
            text: Raw user message.

        Returns:
            dict: Slot name -> extracted value, only for slots that the intent
            requires and that were found in ``text``.
        """
        slots_required = self._required_slots(intent)
        extracted = {}
        text_lower = text.lower()

        if "product_name" in slots_required:
            # Very loose guess: the first alphanumeric run of the message,
            # accepted when it is at most three words long. It matches almost
            # any short message, so handle_user only trusts it as a direct
            # answer to a product-name prompt.
            match = re.search(r'\b([a-zA-Z0-9 ]{2,30})\b', text_lower)
            if match and len(match.group(1).split()) <= 3:
                extracted["product_name"] = match.group(1).strip()

        if "order_id" in slots_required:
            # An order id is any standalone number with six or more digits.
            match = re.search(r'\b\d{6,}\b', text_lower)
            if match:
                extracted["order_id"] = match.group()

        if "reason" in slots_required or "issue_description" in slots_required:
            keywords = ["not working", "broken", "damaged", "defective", "stopped working"]
            # No break: when several keywords occur, the last one in the list wins.
            for kw in keywords:
                if kw in text_lower:
                    if "reason" in slots_required:
                        extracted["reason"] = kw
                    if "issue_description" in slots_required:
                        extracted["issue_description"] = kw
        return extracted

    def handle_user(self, user_input, meta=None, threshold=0.4):
        """Process one user message and return the bot's reply.

        Args:
            user_input: Raw user message.
            meta: ``[product_category, ticket_hour]`` used for intent
                prediction; defaults to ``['Electronics', 12]``.
            threshold: Minimum classifier confidence; below it the keyword
                fallback is tried and, failing that, the chat is escalated.

        Returns:
            str: The bot's reply, or ``"Escalated"`` when no intent could be
            determined.
        """
        if meta is None:
            meta = list(self._DEFAULT_META)
        self.add_message("user", user_input)

        if 'intent' not in self.session:
            intent, confidence = self.predict_intent(user_input, meta)
            if confidence < threshold:
                fallback = self.fallback_intent(user_input)
                if not fallback:
                    self.add_message("bot", "Sorry, I couldn't understand. Escalating to human support.")
                    return "Escalated"
                intent = fallback

            self.session = {"intent": intent, "slots": {}, "state": "collecting", "awaiting": None}

        intent = self.session['intent']
        awaiting = self.session.get('awaiting')
        extracted = self.extract_slots(intent, user_input)

        # The product-name guess fires on any message of up to three words
        # ("It is broken"), so accept it only as the answer to a product-name
        # prompt. Otherwise it would clobber a product the user already gave.
        if awaiting != "product_name":
            extracted.pop("product_name", None)

        # If the bot asked for a free-text slot and the heuristics found
        # nothing in the reply, take the reply itself as the answer. Without
        # this a reason not containing a known keyword would be asked for forever.
        reply_text = user_input.strip()
        if awaiting in self._FREE_TEXT_SLOTS and not extracted and reply_text:
            extracted[awaiting] = reply_text

        self.session['slots'].update(extracted)

        required_slots = self._required_slots(intent)
        missing = [s for s in required_slots if s not in self.session['slots']]
        if missing:
            # Remember what was asked so the next reply can be interpreted.
            self.session['awaiting'] = missing[0]
            self.add_message("bot", f"Please provide: {missing[0]}")
        else:
            response = f"Your request for '{intent}' has been submitted with info: {self.session['slots']}"
            self.add_message("bot", response)
            self.session = {}

        return self.chat_history[-1]['message']

    def get_chat_history(self):
        """Return the full chat history (the live list, not a copy)."""
        return self.chat_history

In [None]:
# Build a chatbot from the fitted models and the preprocessed tickets
bot = SupportChatbot(main_model, sub_model, tfidf, ohe, df)

# Alternative multi-turn conversation (disabled):
# bot.handle_user("Hi, I want to return my Xiaomi Fan")
# bot.handle_user("My order ID is 123456")
# bot.handle_user("It stopped working yesterday")

# Scripted refund conversation, one user turn per entry
demo_turns = (
    "Hi I want refund",
    "TV",
    "Order number is 123456",
    "It is broken",
)
for user_turn in demo_turns:
    bot.handle_user(user_turn)

# Print the whole exchange as "Role: message"
for entry in bot.get_chat_history():
    speaker = entry['role'].capitalize()
    print(f"{speaker}: {entry['message']}")


User: Hi I want refund
Bot: Please provide: product_name
User: TV
Bot: Please provide: order_id
User: Order number is 123456
Bot: Please provide: reason
User: It is broken
Bot: Your request for 'Refund request' has been submitted with info: {'product_name': 'it is broken', 'order_id': '123456', 'reason': 'broken'}


In [None]:
import pickle

# Persist the fitted models, encoders and the preprocessed frame, one
# "<name>.pkl" file each in the current working directory.
artifacts = {
    "main_model": main_model,
    "sub_model": sub_model,
    "tfidf": tfidf,
    "ohe": ohe,
    "df": df,
}
for artifact_name, artifact in artifacts.items():
    # The context manager flushes and closes each file even if dump() raises;
    # a bare open() would leave the handle to the garbage collector.
    with open(f"{artifact_name}.pkl", "wb") as fh:
        pickle.dump(artifact, fh)
