# Cell 1: Import required libraries

In [None]:
# Cell 1: Import required libraries
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import random
import joblib
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Make sure the NLTK English stop-word corpus is available locally; fetch it
# once if it cannot be found in any NLTK data directory.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # nltk.download() signals failure through its return value instead of
    # raising, so check it rather than assuming the corpus is now present.
    if not nltk.download('stopwords'):
        print("Warning: could not download the NLTK 'stopwords' corpus; "
              "stop-word lookups will fail until it is installed.")

print("All libraries imported successfully!")

# Cell 2: Data Preparation Class Definition

In [None]:
# Cell 2: Data Preparation Class Definition
"""
Data Preparation for ML Chatbot
Handles dataset loading, preprocessing, and splitting
"""

class IntentDataPreprocessor:
    """Load an intents JSON file and turn it into ML-ready training data.

    The file is read as ``{"intents": [{"tag": ..., "patterns": [...]}, ...]}``.
    Every pattern becomes one training sample labelled with its intent tag.
    """

    def __init__(self, data_path):
        """Remember the intents file location and set up the NLP helpers.

        Args:
            data_path: Path to the JSON file holding the intents.
        """
        self.data_path = data_path
        self.data = None   # raw parsed JSON, filled by load_data()
        self.df = None     # DataFrame with 'text' / 'label' columns
        self.label_encoder = LabelEncoder()
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def load_data(self):
        """Parse the intents JSON file, cache it on ``self.data`` and return it."""
        with open(self.data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        return self.data

    def preprocess_text(self, text):
        """Normalise a sentence into a space-joined string of stemmed tokens.

        Steps: lowercase, drop everything that is not an ASCII letter or
        whitespace, collapse whitespace, remove English stop words, stem.

        Args:
            text: Raw input string.

        Returns:
            The cleaned string (may be empty if nothing survives filtering).
        """
        # Lowercase first so the stop-word lookup below is case-insensitive.
        text = text.lower()

        # Strip digits, punctuation and any other non-letter characters.
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Collapse runs of whitespace left behind by the removal above.
        text = re.sub(r'\s+', ' ', text).strip()

        # Drop stop words, then reduce each remaining token to its stem.
        tokens = [
            self.stemmer.stem(token)
            for token in text.split()
            if token not in self.stop_words
        ]

        return ' '.join(tokens)

    def create_training_data(self):
        """Build the training set from the loaded intents.

        Loads the intents file first if that has not happened yet. Populates
        ``self.df`` (columns ``text`` and ``label``) and fits
        ``self.label_encoder`` on the intent tags.

        Returns:
            A ``(patterns, encoded_labels)`` tuple: the preprocessed pattern
            strings and the integer-encoded tag of each pattern.
        """
        if not self.data:
            self.load_data()

        patterns = []
        labels = []

        for intent in self.data['intents']:
            for pattern in intent['patterns']:
                patterns.append(self.preprocess_text(pattern))
                labels.append(intent['tag'])

        self.df = pd.DataFrame({
            'text': patterns,
            'label': labels
        })

        encoded_labels = self.label_encoder.fit_transform(labels)

        return patterns, encoded_labels

    def split_data(self, test_size=0.2, random_state=42):
        """Split the prepared data into stratified training and test sets.

        The targets are returned as integer-encoded labels (the encoding of
        ``self.label_encoder``), because the rest of the code decodes model
        output and test targets with ``label_encoder.inverse_transform``.
        Returning the raw string tags here made those calls fail.

        Args:
            test_size: Fraction (or absolute number) of samples held out.
            random_state: Seed that makes the split reproducible.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` where the ``X`` parts are
            the preprocessed texts and the ``y`` parts are encoded labels.
        """
        if self.df is None:
            self.create_training_data()

        X = self.df['text']
        # Encode with the already-fitted encoder so targets match what
        # inverse_transform() expects downstream.
        y = self.label_encoder.transform(self.df['label'])

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )

        return X_train, X_test, y_train, y_test

    def get_label_mapping(self):
        """Return a ``{tag: encoded_integer}`` dict for the fitted encoder."""
        return {
            tag: index
            for index, tag in enumerate(self.label_encoder.classes_)
        }

    def get_class_distribution(self):
        """Return the per-tag sample counts, or ``None`` before data is built."""
        if self.df is not None:
            return self.df['label'].value_counts()
        return None

# Notebook feedback only: confirms the class definition above was executed.
print("Data Preprocessor class defined successfully!")

# Cell 3: Example usage of Data Preprocessor

In [None]:
# Cell 3: Example usage of Data Preprocessor

# Small in-memory intents dataset used to demonstrate the preprocessor.
sample_intents = {
    "intents": [
        {
            "tag": "greeting",
            "patterns": ["Hello", "Hi", "Hey", "Good morning", "Good afternoon"],
            "responses": ["Hello! How can I help you?", "Hi there!", "Greetings!"]
        },
        {
            "tag": "goodbye",
            "patterns": ["Bye", "Goodbye", "See you later", "Take care"],
            "responses": ["Goodbye!", "See you soon!", "Have a great day!"]
        },
        {
            "tag": "thanks",
            "patterns": ["Thank you", "Thanks", "Thanks a lot", "I appreciate it"],
            "responses": ["You're welcome!", "Happy to help!", "Anytime!"]
        }
    ]
}

# Persist the sample so IntentDataPreprocessor can load it from disk. The
# encoding is set explicitly because the loader reads the file as UTF-8.
with open('sample_intents.json', 'w', encoding='utf-8') as f:
    json.dump(sample_intents, f)

# Build the training data and a stratified train/test split from the file.
preprocessor = IntentDataPreprocessor('sample_intents.json')
X, y = preprocessor.create_training_data()
X_train, X_test, y_train, y_test = preprocessor.split_data()

# Summarise what was produced.
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Number of classes: {len(preprocessor.label_encoder.classes_)}")
print("Class distribution:")
print(preprocessor.get_class_distribution())
print("\nLabel mapping:")
print(preprocessor.get_label_mapping())

# Cell 4: Model Training Class Definition

In [None]:
# Cell 4: Model Training Class Definition
"""
Model Training for Intent Classification
Trains and evaluates multiple ML algorithms
"""

class IntentClassifierTrainer:
    """Train several TF-IDF text classifiers and keep the most accurate one.

    Typical flow: ``train_models`` -> ``evaluate_models`` -> ``get_best_model``,
    followed by the optional reporting / persistence helpers, which all act
    on ``self.best_model``.
    """

    def __init__(self):
        """Start with no pipelines, no results and no selected model."""
        self.models = {}        # name -> unfitted/fitted Pipeline
        self.best_model = None  # Pipeline chosen by get_best_model()
        self.vectorizer = None
        self.results = {}       # name -> dict(model, training_time, accuracy, ...)

    @staticmethod
    def _build_pipeline(classifier):
        """Pair a fresh TF-IDF vectorizer with ``classifier`` in a Pipeline."""
        return Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,
                ngram_range=(1, 2),
                stop_words='english',
                min_df=2,
                max_df=0.8
            )),
            ('classifier', classifier)
        ])

    def create_pipelines(self):
        """Create one pipeline per algorithm and store them in ``self.models``.

        Every pipeline gets its own identically configured TF-IDF step, so
        the models differ only in the final classifier.
        """
        self.models = {
            'naive_bayes': self._build_pipeline(MultinomialNB(alpha=0.1)),

            'svm': self._build_pipeline(SVC(
                kernel='linear',
                C=1.0,
                probability=True,  # needed so predict_proba() is available
                random_state=42
            )),

            # NOTE(review): `multi_class` is deprecated in recent scikit-learn
            # releases - confirm against the installed version.
            'logistic_regression': self._build_pipeline(LogisticRegression(
                C=1.0,
                max_iter=1000,
                random_state=42,
                multi_class='ovr'
            )),

            'random_forest': self._build_pipeline(RandomForestClassifier(
                n_estimators=100,
                random_state=42,
                max_depth=10
            ))
        }

    def train_models(self, X_train, y_train):
        """Fit every pipeline and record how long each fit took.

        Rebuilds the pipelines and clears previous results first.

        Args:
            X_train: Training texts.
            y_train: Training labels aligned with ``X_train``.
        """
        self.create_pipelines()
        self.results = {}

        for name, model in self.models.items():
            print(f"Training {name}...")
            # perf_counter() is monotonic, so the measured duration cannot be
            # distorted by system clock adjustments.
            start_time = time.perf_counter()

            model.fit(X_train, y_train)

            training_time = time.perf_counter() - start_time
            self.results[name] = {
                'model': model,
                'training_time': training_time
            }

            print(f"  {name} trained in {training_time:.2f} seconds")

    def evaluate_models(self, X_test, y_test):
        """Score every trained model on the test data.

        Stores ``accuracy`` and ``predictions`` in each model's result entry
        and prints one accuracy line per model.

        Args:
            X_test: Test texts.
            y_test: True labels aligned with ``X_test``.
        """
        for name, result in self.results.items():
            y_pred = result['model'].predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            result['accuracy'] = accuracy
            result['predictions'] = y_pred

            print(f"{name.upper():<20} Accuracy: {accuracy:.4f}")

    def get_best_model(self):
        """Select the evaluated model with the highest accuracy.

        Ties go to the model that was trained first. A model is selected
        even when every accuracy is 0.0; models that were never evaluated
        are ignored.

        Returns:
            The best pipeline (also stored in ``self.best_model``), or
            ``None`` when no evaluated model exists.
        """
        evaluated = [
            (name, result)
            for name, result in self.results.items()
            if 'accuracy' in result
        ]
        if not evaluated:
            return None

        # max() returns the first maximal element, preserving the original
        # "first best wins" tie-breaking.
        best_model_name, best_result = max(
            evaluated, key=lambda item: item[1]['accuracy']
        )
        best_accuracy = best_result['accuracy']

        self.best_model = best_result['model']
        print(f"\nBest model: {best_model_name} with accuracy: {best_accuracy:.4f}")
        return self.best_model

    def detailed_classification_report(self, X_test, y_test, label_encoder):
        """Print and return a per-class report for the best model.

        Args:
            X_test: Test texts.
            y_test: True labels in encoded form; they are decoded with
                ``label_encoder.inverse_transform``.
            label_encoder: Fitted encoder used to decode labels to tag names.

        Returns:
            The report as a dict, or ``None`` if no best model is selected.
        """
        if self.best_model:
            y_pred = self.best_model.predict(X_test)

            # Report on readable tag names instead of integer codes.
            y_test_labels = label_encoder.inverse_transform(y_test)
            y_pred_labels = label_encoder.inverse_transform(y_pred)

            print("\nDetailed Classification Report:")
            print(classification_report(y_test_labels, y_pred_labels))

            return classification_report(y_test_labels, y_pred_labels, output_dict=True)
        return None

    def plot_confusion_matrix(self, X_test, y_test, label_encoder, figsize=(12, 10)):
        """Draw the best model's confusion matrix as a heatmap.

        Does nothing when no best model is selected.

        Args:
            X_test: Test texts.
            y_test: True labels in encoded form (decoded via ``label_encoder``).
            label_encoder: Fitted encoder; its classes define the axis order.
            figsize: Matplotlib figure size.
        """
        if self.best_model:
            y_pred = self.best_model.predict(X_test)
            y_test_labels = label_encoder.inverse_transform(y_test)
            y_pred_labels = label_encoder.inverse_transform(y_pred)

            # Fix the label order so rows/columns match the tick labels.
            cm = confusion_matrix(y_test_labels, y_pred_labels,
                                  labels=label_encoder.classes_)

            plt.figure(figsize=figsize)
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                        xticklabels=label_encoder.classes_,
                        yticklabels=label_encoder.classes_)
            plt.title('Confusion Matrix')
            plt.xlabel('Predicted')
            plt.ylabel('Actual')
            plt.xticks(rotation=45)
            plt.yticks(rotation=0)
            plt.tight_layout()
            plt.show()

    def save_model(self, filepath):
        """Write the best model to ``filepath`` with joblib, if one exists."""
        if self.best_model:
            joblib.dump(self.best_model, filepath)
            print(f"Model saved to {filepath}")
        else:
            print("No model to save. Train a model first.")

    def save_vectorizer(self, filepath):
        """Write the best model's TF-IDF step to ``filepath`` with joblib."""
        if self.best_model:
            vectorizer = self.best_model.named_steps['tfidf']
            joblib.dump(vectorizer, filepath)
            print(f"Vectorizer saved to {filepath}")
        else:
            # Mirror save_model() instead of failing silently.
            print("No vectorizer to save. Train a model first.")

# Notebook feedback only: confirms the class definition above was executed.
print("Model Trainer class defined successfully!")

# Cell 5: Example usage of Model Trainer

In [None]:
# Cell 5: Example usage of Model Trainer
# Reuses X_train / X_test / y_train / y_test and `preprocessor` created by the
# data-preprocessor example cell.

# Fit every candidate pipeline, score each one on the held-out split, then
# select the most accurate as trainer.best_model.
trainer = IntentClassifierTrainer()
trainer.train_models(X_train, y_train)
trainer.evaluate_models(X_test, y_test)
trainer.get_best_model()

# Per-class metrics and a confusion-matrix heatmap for the selected model;
# the label encoder is passed so results are shown with intent tag names.
trainer.detailed_classification_report(X_test, y_test, preprocessor.label_encoder)
trainer.plot_confusion_matrix(X_test, y_test, preprocessor.label_encoder)

# Persist the selected pipeline and, separately, its TF-IDF step.
trainer.save_model('best_intent_classifier.joblib')
trainer.save_vectorizer('tfidf_vectorizer.joblib')

# Cell 6: Chatbot Application Class Definition

In [None]:
# Cell 6: Chatbot Application Class Definition
"""
ML-Powered Chatbot
Main chatbot application using trained ML model
"""

class MLChatbot:
    """Intent-based chatbot backed by a trained text-classification pipeline.

    User input is preprocessed, classified into an intent tag, and answered
    with a random response configured for that tag. Low-confidence
    predictions get a generic fallback response instead.
    """

    # Replies used when the intent is unknown or the prediction is too uncertain.
    FALLBACK_RESPONSES = [
        "I'm not sure I understand. Could you rephrase that?",
        "That's interesting! Could you tell me more?",
        "I'm still learning. Could you try asking in a different way?",
        "I want to make sure I understand correctly. Could you elaborate?",
        "That's outside my current knowledge. Maybe ask me something else?",
        "I'm designed to help with various topics. Could you try rephrasing?",
        "I appreciate your message! Could you provide more context?",
        "I'm here to assist you. Could you clarify what you mean?",
        "That's given me something to think about! Want to try another topic?",
        "I'm constantly learning. Could you ask me something different?"
    ]

    def __init__(self, intents_file, model_file=None, vectorizer_file=None):
        """Load the intents and obtain a model (from disk or by training).

        Args:
            intents_file: Path to the intents JSON file.
            model_file: Optional path to a joblib-saved pipeline.
            vectorizer_file: Optional path to a joblib-saved vectorizer.
                A saved model is only loaded when both paths are given;
                otherwise a new model is trained.
        """
        self.intents_file = intents_file
        self.model_file = model_file
        self.vectorizer_file = vectorizer_file

        # Load intents data
        self.preprocessor = IntentDataPreprocessor(intents_file)
        self.intents_data = self.preprocessor.load_data()
        self.intents = self.intents_data['intents']

        # Initialize model and vectorizer
        self.model = None
        self.vectorizer = None
        self.label_encoder = None

        # Context tracking
        self.context = {}
        self.conversation_history = []
        # Confidence of the most recent predict_intent() call; lets
        # get_response() apply its threshold when no confidence is passed.
        self.last_confidence = None

        # Load or train model
        if model_file and vectorizer_file:
            self.load_model(model_file, vectorizer_file)
        else:
            self.train_model()

    def train_model(self):
        """Train all candidate classifiers and keep the best one.

        Raises:
            RuntimeError: If training produced no usable model.
        """
        print("Training ML model...")

        # Prepare data (also fits the label encoder).
        self.preprocessor.create_training_data()
        X_train, X_test, y_train, y_test = self.preprocessor.split_data()
        self.label_encoder = self.preprocessor.label_encoder

        # Train models
        trainer = IntentClassifierTrainer()
        trainer.train_models(X_train, y_train)
        trainer.evaluate_models(X_test, y_test)
        best_model = trainer.get_best_model()

        if best_model is None:
            # Fail with a clear message instead of an AttributeError below.
            raise RuntimeError("Model training did not produce a usable model")

        self.model = best_model
        self.vectorizer = self.model.named_steps['tfidf']

        print("Model training completed!")

    def load_model(self, model_file, vectorizer_file):
        """Load a saved model and vectorizer, training a new model on failure.

        The label encoder is fitted from the intents file so that encoded
        predictions can be decoded; an unfitted encoder would make every
        ``inverse_transform`` call fail.

        Args:
            model_file: Path to the joblib-saved pipeline.
            vectorizer_file: Path to the joblib-saved vectorizer.
        """
        try:
            # SECURITY: joblib.load unpickles the file and can execute
            # arbitrary code - only load artefacts from trusted sources.
            model = joblib.load(model_file)
            vectorizer = joblib.load(vectorizer_file)
        except Exception as e:
            print(f"Error loading model: {e}")
            print("Training new model instead...")
            self.train_model()
            return

        # Fit the encoder on the intent tags before handing it out.
        self.preprocessor.create_training_data()
        self.model = model
        self.vectorizer = vectorizer
        self.label_encoder = self.preprocessor.label_encoder
        print("Model loaded successfully!")

    def preprocess_input(self, text):
        """Clean user input with the same routine used for training data."""
        return self.preprocessor.preprocess_text(text)

    def _decode_labels(self, labels):
        """Return intent tags for ``labels``.

        Labels that are already text are returned unchanged; anything else
        is treated as encoded and decoded with the label encoder. This keeps
        the chatbot working with models trained on either representation.
        """
        labels = np.asarray(labels)
        if labels.dtype.kind in ('U', 'S', 'O'):
            return labels
        return self.label_encoder.inverse_transform(labels)

    def predict_intent(self, user_input):
        """Classify ``user_input``.

        Args:
            user_input: Raw text typed by the user.

        Returns:
            ``(intent_tag, confidence, processed_input)`` where confidence
            is the highest class probability reported by the model.

        Raises:
            ValueError: If no model is available.
        """
        if not self.model:
            raise ValueError("Model not loaded or trained")

        # Preprocess input
        processed_input = self.preprocess_input(user_input)

        # Predict intent
        prediction = self.model.predict([processed_input])[0]
        confidence = np.max(self.model.predict_proba([processed_input]))
        self.last_confidence = confidence

        # Convert back to intent tag
        intent_tag = self._decode_labels([prediction])[0]

        return intent_tag, confidence, processed_input

    def get_response(self, intent_tag, confidence_threshold=0.6, confidence=None):
        """Pick a response for ``intent_tag``.

        Args:
            intent_tag: Predicted intent tag.
            confidence_threshold: Minimum confidence required to answer with
                an intent response; a falsy value disables the check.
            confidence: Confidence of the prediction. When omitted, the
                confidence of the most recent ``predict_intent`` call is
                used; if there is none, the threshold check is skipped.

        Returns:
            A random response of the matching intent, or a fallback response
            when the confidence is too low or the tag is unknown.
        """
        if confidence is None:
            confidence = getattr(self, 'last_confidence', None)

        if (confidence_threshold
                and confidence is not None
                and confidence < confidence_threshold):
            return self.get_fallback_response()

        for intent in self.intents:
            if intent['tag'] == intent_tag:
                return random.choice(intent['responses'])

        return self.get_fallback_response()

    def get_fallback_response(self):
        """Return a random generic reply for unrecognised input."""
        return random.choice(self.FALLBACK_RESPONSES)

    def update_context(self, user_input, intent_tag, response):
        """Append one exchange to the history, keeping only the last 10."""
        self.conversation_history.append({
            'user_input': user_input,
            'intent': intent_tag,
            'response': response,
            'timestamp': np.datetime64('now')
        })

        # Keep only last 10 messages
        if len(self.conversation_history) > 10:
            self.conversation_history.pop(0)

    def chat(self):
        """Run an interactive console chat until the user quits."""
        print("🤖 ML-Powered Chatbot: Hello! I'm now using Machine Learning!")
        print("💡 I can understand your intent and respond appropriately")
        print("💬 Type 'quit' to end our conversation\n")

        while True:
            try:
                user_input = input("You: ").strip()

                if not user_input:
                    print("Bot: I notice you didn't type anything. Is everything okay?")
                    continue

                if user_input.lower() in ['quit', 'exit', 'bye', 'goodbye']:
                    print("Bot: Thank you for chatting! I'm learning from every conversation!")
                    break

                # Predict intent and get response
                intent_tag, confidence, processed_input = self.predict_intent(user_input)
                response = self.get_response(intent_tag, confidence=confidence)

                # Update context
                self.update_context(user_input, intent_tag, response)

                # Display response with confidence (for educational purposes)
                print(f"Bot: {response}")
                print(f"    [Detected: {intent_tag} | Confidence: {confidence:.2f}]")

            except (KeyboardInterrupt, EOFError):
                # EOFError: stdin was closed. Without this, input() would keep
                # raising and the generic handler below would loop forever.
                print("\n\nBot: Thanks for the conversation! Come back anytime!")
                break
            except Exception as e:
                print(f"Bot: I encountered an error: {str(e)}")
                print("Let's continue our conversation!")

    def evaluate_on_test_set(self):
        """Print accuracy and a classification report for the test split.

        Returns:
            The accuracy of the current model on the test split.

        Raises:
            ValueError: If no model is available.
        """
        from sklearn.metrics import accuracy_score, classification_report

        if not self.model:
            raise ValueError("Model not loaded or trained")

        X_train, X_test, y_train, y_test = self.preprocessor.split_data()
        y_pred = self.model.predict(X_test)

        # Compare on tag names so targets and predictions share one
        # representation regardless of how the model's labels were encoded.
        y_test_labels = self._decode_labels(y_test)
        y_pred_labels = self._decode_labels(y_pred)
        accuracy = accuracy_score(y_test_labels, y_pred_labels)

        print(f"Model Accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test_labels, y_pred_labels))

        return accuracy

# Notebook feedback only: confirms the class definition above was executed.
print("ML Chatbot class defined successfully!")

# Cell 7: Example usage of the Chatbot

In [None]:
# Cell 7: Example usage of the Chatbot
# Initialize chatbot with our sample data and trained model.
# Because both a model file and a vectorizer file are given, the constructor
# tries to load them from disk rather than training a new model.
chatbot = MLChatbot(
    intents_file='sample_intents.json',
    model_file='best_intent_classifier.joblib',
    vectorizer_file='tfidf_vectorizer.joblib'
)

# Test the chatbot with some sample inputs
test_inputs = [
    "Hello there!",
    "Thank you for your help",
    "Goodbye for now",
    "What's the weather like?"  # This should trigger fallback response
]

# For each input: classify it, pick a response for the predicted tag, and
# show the detected intent together with the model's confidence.
print("Testing chatbot with sample inputs:\n")
for input_text in test_inputs:
    print(f"You: {input_text}")
    intent_tag, confidence, processed_input = chatbot.predict_intent(input_text)
    response = chatbot.get_response(intent_tag)
    print(f"Bot: {response}")
    print(f"    [Detected: {intent_tag} | Confidence: {confidence:.2f}]\n")

# Cell 8: Interactive Chat Session

In [None]:
# Cell 8: Interactive Chat Session
# Uncomment the following line to start an interactive chat session
# chatbot.chat()

# Cell 9: Model Evaluation

In [None]:
# Cell 9: Model Evaluation
# Evaluate the model performance on the test set.
# evaluate_on_test_set() prints the accuracy and a classification report
# itself and returns the accuracy, which is printed once more below.
print("Evaluating model performance...")
accuracy = chatbot.evaluate_on_test_set()
print(f"\nOverall Model Accuracy: {accuracy:.4f}")