In [1]:
import os
import json
import random
import pickle
from pathlib import Path
import numpy as np
from flask import Flask, request, jsonify

import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

In [2]:
nltk_packages = ["punkt", "averaged_perceptron_tagger", "maxent_ne_chunker", "words", "stopwords"]
for pkg in nltk_packages:
    try:
        nltk.data.find(pkg)
    except Exception:
        nltk.download(pkg.split('/')[-1] if '/' in pkg else pkg)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# --- File paths ---
BASE = Path(".")
INTENTS_FILE = BASE / "intents.json"
TOKENIZER_FILE = BASE / "tokenizer.pkl"
MODEL_FILE = BASE / "intent_model.h5"

In [4]:
# --- Sample intents (will be written if intents.json missing) ---
SAMPLE_INTENTS = {
    "intents": [
        {
            "tag": "greeting",
            "patterns": ["Hi", "Hey", "Hello", "Good morning", "Good evening"],
            "responses": ["Hello! How can I help you today?", "Hi there — what can I do for you?"]
        },
        {
            "tag": "goodbye",
            "patterns": ["Bye", "See you", "Goodbye", "I am leaving"],
            "responses": ["Goodbye! Have a great day.", "See you later — take care!"]
        },
        {
            "tag": "thanks",
            "patterns": ["Thanks", "Thank you", "That's helpful", "Thanks a lot"],
            "responses": ["You're welcome!", "Happy to help!"]
        },
        {
            "tag": "hours",
            "patterns": ["What are your hours?", "When are you open?", "working hours", "open time"],
            "responses": ["We are open Monday to Friday, 9am to 6pm.", "Our hours are 9:00–18:00, Mon–Fri."]
        },
        {
            "tag": "services",
            "patterns": ["What services do you offer?", "Tell me your services", "services"],
            "responses": ["We offer A, B and C. Which one are you interested in?", "Our main services are: consulting, development, and support."]
        },
        {
            "tag": "fallback",
            "patterns": [""],
            "responses": ["Sorry, I didn't understand. Can you rephrase?", "I don't have an answer for that yet — could you try a different question?"]
        }
    ]
}

In [5]:
# --- Utilities ---

def ensure_intents_file():
    if not INTENTS_FILE.exists():
        with open(INTENTS_FILE, "w", encoding="utf-8") as f:
            json.dump(SAMPLE_INTENTS, f, indent=2)
        print(f"Created sample intents.json at {INTENTS_FILE}")


def load_intents():
    with open(INTENTS_FILE, "r", encoding="utf-8") as f:
        return json.load(f)


STOPWORDS = set(stopwords.words('english'))


def preprocess_text(text):
    # lowercase, tokenize, remove stopwords (simple)
    tokens = [t.lower() for t in word_tokenize(text) if t.isalpha()]
    tokens = [t for t in tokens if t not in STOPWORDS]
    return " ".join(tokens)

In [6]:
# --- Prepare training data for intent classification ---

def prepare_training_data(intents, num_words=2000, max_len=20):
    texts = []
    labels = []
    tag_list = []
    for intent in intents['intents']:
        tag = intent['tag']
        if tag not in tag_list:
            tag_list.append(tag)
        for patt in intent.get('patterns', []):
            texts.append(preprocess_text(patt))
            labels.append(tag_list.index(tag))

    tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    X = pad_sequences(sequences, maxlen=max_len, padding='post')
    y = tf.keras.utils.to_categorical(labels, num_classes=len(tag_list))

    return X, y, tokenizer, tag_list

In [7]:
# --- Model building and training ---

def build_model(vocab_size, embed_dim, input_length, num_classes):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=input_length))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


def train_and_save(intents):
    print("Preparing training data...")
    X, y, tokenizer, tag_list = prepare_training_data(intents)
    vocab_size = 2000
    embed_dim = 50
    max_len = X.shape[1]
    model = build_model(vocab_size, embed_dim, max_len, len(tag_list))
    print("Training model (this may take a moment)...")
    model.fit(X, y, epochs=200, batch_size=8, verbose=0)
    model.save(MODEL_FILE)
    with open(TOKENIZER_FILE, 'wb') as f:
        pickle.dump({'tokenizer': tokenizer, 'max_len': max_len, 'tag_list': tag_list}, f)
    print(f"Saved model to {MODEL_FILE} and tokenizer to {TOKENIZER_FILE}")

In [8]:
# --- Prediction / NER / Response selection ---

def load_resources():
    if not MODEL_FILE.exists() or not TOKENIZER_FILE.exists():
        return None, None, None
    model = load_model(MODEL_FILE)
    with open(TOKENIZER_FILE, 'rb') as f:
        obj = pickle.load(f)
    tokenizer = obj['tokenizer']
    max_len = obj['max_len']
    tag_list = obj['tag_list']
    return model, tokenizer, max_len, tag_list


def predict_intent(model, tokenizer, max_len, text):
    seq = tokenizer.texts_to_sequences([preprocess_text(text)])
    seq = pad_sequences(seq, maxlen=max_len, padding='post')
    probs = model.predict(seq, verbose=0)[0]
    idx = int(np.argmax(probs))
    confidence = float(probs[idx])
    return idx, confidence
    
def extract_entities(text):
    # Basic NLTK NER pipeline
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    tree = ne_chunk(tags, binary=False)
    entities = []
    for subtree in tree:
        if hasattr(subtree, 'label'):
            label = subtree.label()
            name = " ".join([leaf[0] for leaf in subtree.leaves()])
            entities.append({'entity': name, 'label': label})
    return entities


def get_response(intents_data, tag, entities=None):
    for intent in intents_data['intents']:
        if intent['tag'] == tag:
            resp = random.choice(intent.get('responses', []))
            # Simple entity-aware templating: replace {entity} if present
            if entities:
                # Example: if response contains {entity_name}
                for e in entities:
                    key = "{" + e['label'].lower() + "}"
                    if key in resp:
                        resp = resp.replace(key, e['entity'])
            return resp
    # fallback
    return random.choice(next(i for i in intents_data['intents'] if i['tag']=='fallback')['responses'])



In [9]:
# --- Flask app ---
app = Flask(__name__)

# Ensure intents exist
ensure_intents_file()
intents_data = load_intents()

# Train model if missing
if not MODEL_FILE.exists() or not TOKENIZER_FILE.exists():
    train_and_save(intents_data)

# Load resources now
MODEL, TOKENIZER, MAX_LEN, TAG_LIST = load_resources()



In [None]:
@app.route('/chat', methods=['POST'])
def chat():
    data = request.json
    if not data or 'message' not in data:
        return jsonify({'error': 'Please send JSON with a "message" field.'}), 400
    message = data['message']
    # Predict intent
    idx, confidence = predict_intent(MODEL, TOKENIZER, MAX_LEN, message)
    predicted_tag = TAG_LIST[idx]
    # Extract entities
    entities = extract_entities(message)
    # Get response
    response = get_response(intents_data, predicted_tag, entities)
    return jsonify({
        'message': message,
        'intent': predicted_tag,
        'confidence': round(confidence, 3),
        'entities': entities,
        'response': response
    })

    
@app.route('/')
def index():
    return "Chatbot server running. POST JSON {\"message\": \"hello\"} to /chat"


if __name__ == '__main__':
    # Start Flask app
    app.run(debug=True, use_reloader=False)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [03/Dec/2025 18:34:26] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [03/Dec/2025 18:34:27] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
