In [None]:
# Chattea Chatbot - TF-IDF + Cosine Similarity (From Scratch)
# Educational AI Model for Customer Support

import json
import math
import re
from collections import Counter, defaultdict

In [None]:
# ============================================================================
# SECTION 1: PREPROCESSING
# ============================================================================

def preprocess_text(text):
    """
    Normalize a raw string and split it into word tokens.

    The text is lowercased, every character that is neither a word
    character (letters, digits, underscore) nor whitespace is deleted
    outright (not replaced by a space), and the result is split on
    whitespace. Tokens consisting of a single character are discarded.

    Args:
        text (str): Raw input text
    Returns:
        list: Tokens (strings of length >= 2) in their original order
    """
    # Lowercase first, then strip punctuation / special characters in one pass
    cleaned = re.sub(r'[^\w\s]', '', text.lower())

    # Whitespace tokenization, dropping one-character tokens
    return [word for word in cleaned.split() if len(word) > 1]

In [None]:
# ============================================================================
# SECTION 2: FUZZY MATCHING (LEVENSHTEIN DISTANCE)
# ============================================================================

def levenshtein_distance(s1, s2):
    """
    Compute the Levenshtein (edit) distance between two strings.

    Uses the two-row dynamic-programming formulation: only the previous
    and the current row of the edit matrix are kept in memory.

    Args:
        s1, s2 (str): Two strings to compare
    Returns:
        int: Minimum number of single-character insertions, deletions
             and substitutions needed to turn one string into the other
    """
    # Put the longer string on the outer loop so each row is as short as possible.
    if len(s1) < len(s2):
        longer, shorter = s2, s1
    else:
        longer, shorter = s1, s2

    # Against an empty string, every character of the other one is an edit.
    if len(shorter) == 0:
        return len(longer)

    prev_row = list(range(len(shorter) + 1))

    for row, ch_long in enumerate(longer, start=1):
        # First cell: cost of deleting the first `row` characters of `longer`.
        curr_row = [row]
        for col, ch_short in enumerate(shorter, start=1):
            curr_row.append(min(
                prev_row[col] + 1,                         # insertion
                curr_row[col - 1] + 1,                     # deletion
                prev_row[col - 1] + (ch_long != ch_short)  # substitution (free on match)
            ))
        prev_row = curr_row

    return prev_row[-1]


def fuzzy_match(word, candidates, threshold=2):
    """
    Pick the candidate with the smallest edit distance to a word.

    Comparison is case-insensitive. When several candidates share the
    smallest distance, the one that appears first in `candidates` wins.

    Args:
        word (str): Word to match
        candidates (list): List of possible matches
        threshold (int): Maximum edit distance allowed
    Returns:
        str or None: The closest candidate (as given, not lowercased) if
                     its distance is within `threshold`, otherwise None
    """
    # Lazily pair every candidate with its distance to the word.
    scored = (
        (levenshtein_distance(word.lower(), option.lower()), option)
        for option in candidates
    )

    # min() returns the first minimal element, which preserves the
    # "earliest candidate wins ties" rule; default covers an empty list.
    closest = min(scored, key=lambda pair: pair[0], default=None)

    if closest is None or closest[0] > threshold:
        return None
    return closest[1]

In [None]:
# ============================================================================
# SECTION 3: TF-IDF FROM SCRATCH
# ============================================================================

class TFIDFVectorizer:
    """
    TF-IDF Vectorizer built from scratch

    Weights are computed as ``tf * idf`` with

        tf(word, doc) = count(word in doc) / len(doc)
        idf(word)     = ln((N + 1) / (df(word) + 1)) + 1

    where N is the number of fitted documents and df(word) the number of
    fitted documents containing the word. The "+1" terms smooth the ratio
    so the IDF is always defined and strictly positive.
    """

    def __init__(self):
        self.vocabulary = {}  # word -> index mapping
        self.idf = {}  # word -> IDF score
        self.documents = []

    def fit(self, documents):
        """
        Learn vocabulary and IDF scores from documents

        Calling fit again replaces the previous vocabulary and IDF table
        completely.

        Args:
            documents (list): List of tokenized documents (list of lists).
                Any iterable of token sequences is accepted.
        """
        # The corpus is stored and measured with len(), so a one-shot
        # iterable (e.g. a generator) has to be materialised first.
        if not isinstance(documents, (list, tuple)):
            documents = list(documents)

        self.documents = documents
        n_documents = len(documents)

        # Document frequency: number of documents each word appears in.
        # set(doc) makes repeated words within a document count once.
        document_frequency = Counter()
        for doc in documents:
            document_frequency.update(set(doc))

        # Every word seen in the corpus has a document frequency, so the
        # counter's keys are exactly the vocabulary. Sorting gives stable indices.
        self.vocabulary = {word: idx for idx, word in enumerate(sorted(document_frequency))}

        # Smoothed IDF: ln((N + 1) / (df + 1)) + 1.
        # Built as a fresh dict so a refit does not keep stale entries for
        # words that are no longer part of the vocabulary.
        self.idf = {
            word: math.log((n_documents + 1) / (document_frequency[word] + 1)) + 1
            for word in self.vocabulary
        }

    def transform(self, documents):
        """
        Transform documents to TF-IDF vectors

        Words that are not in the fitted vocabulary are ignored. They still
        count towards the document length used in the TF denominator.

        Args:
            documents (list): List of tokenized documents
        Returns:
            list: List of TF-IDF vectors (dictionaries mapping word -> weight;
                  each is a defaultdict(float), empty for an empty document)
        """
        vectors = []

        for doc in documents:
            vector = defaultdict(float)
            doc_length = len(doc)

            # An empty document has no terms; also avoids dividing by zero below.
            if doc_length == 0:
                vectors.append(vector)
                continue

            # Calculate term frequency for this document
            term_freq = Counter(doc)

            # Calculate TF-IDF for each term
            for word, count in term_freq.items():
                if word in self.vocabulary:
                    # TF = count / total_words_in_doc
                    tf = count / doc_length
                    # TF-IDF = TF * IDF
                    vector[word] = tf * self.idf[word]

            vectors.append(vector)

        return vectors

    def fit_transform(self, documents):
        """
        Fit and transform in one step

        Args:
            documents (list): List of tokenized documents (any iterable works)
        Returns:
            list: TF-IDF vectors for the same documents, in order
        """
        # The corpus is traversed by both fit() and transform(); materialise
        # a one-shot iterable so the second traversal is not empty.
        if not isinstance(documents, (list, tuple)):
            documents = list(documents)

        self.fit(documents)
        return self.transform(documents)


In [None]:
# ============================================================================
# SECTION 4: COSINE SIMILARITY
# ============================================================================

def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two sparse vectors

    Args:
        vec1, vec2 (dict): TF-IDF vectors as dictionaries (word -> weight);
            missing words are treated as weight 0
    Returns:
        float: Similarity score, guaranteed to lie in [-1, 1]. For vectors
               with non-negative weights (such as TF-IDF) this is [0, 1].
               Returns 0.0 when the vectors share no words or either one
               has zero magnitude.
    """
    # Only words present in both vectors contribute to the dot product.
    # Dict key views support set intersection directly.
    common_words = vec1.keys() & vec2.keys()

    if not common_words:
        return 0.0

    # Calculate dot product
    dot_product = sum(vec1[word] * vec2[word] for word in common_words)

    # Calculate magnitudes (Euclidean norms over all entries, not just shared ones)
    magnitude1 = math.sqrt(sum(val ** 2 for val in vec1.values()))
    magnitude2 = math.sqrt(sum(val ** 2 for val in vec2.values()))

    # Guard the actual divisor: covers a zero vector as well as a product
    # that underflows to zero.
    denominator = magnitude1 * magnitude2
    if denominator == 0:
        return 0.0

    similarity = dot_product / denominator

    # Floating-point rounding can land a hair outside the mathematical
    # range (e.g. 1.0000000000000002 for identical vectors); clamp it.
    return max(-1.0, min(1.0, similarity))

In [None]:

# ============================================================================
# SECTION 5: CHATBOT CLASS
# ============================================================================

class ChatteaChatbot:
    """
    Main chatbot class using TF-IDF and Cosine Similarity

    Each training example is converted into a TF-IDF vector. A user message
    is tokenized, typo-corrected against a list of key terms, vectorized
    with the same vocabulary and compared with every example by cosine
    similarity. The intent of the closest example selects the response.
    """

    # Feature keywords that misspelled tokens are corrected towards when the
    # caller does not supply its own list.
    DEFAULT_KEY_TERMS = (
        'blast', 'message', 'schedule', 'filter', 'whatsapp',
        'number', 'panasin', 'chatbot', 'help', 'kirim', 'pesan'
    )

    def __init__(self, intents_data, responses_data, confidence_threshold=0.3, key_terms=None):
        """
        Initialize chatbot and train it on the given examples
        Args:
            intents_data (list): List of dicts with 'intent' and 'text' examples
            responses_data (dict): Intent -> response mapping
            confidence_threshold (float): Minimum similarity score to accept match
            key_terms (iterable of str or None): Words that typos are corrected
                towards. None selects DEFAULT_KEY_TERMS.
        """
        self.intents_data = intents_data
        self.responses_data = responses_data
        self.confidence_threshold = confidence_threshold
        self.vectorizer = TFIDFVectorizer()

        # Key terms for fuzzy matching (Chattea features)
        self.key_terms = list(self.DEFAULT_KEY_TERMS if key_terms is None else key_terms)

        # Prepare training data
        self._prepare_data()

    def _prepare_data(self):
        """
        Prepare and vectorize training data

        Populates self.intents and self.intent_texts (parallel lists, one
        entry per training example) and self.intent_vectors (the TF-IDF
        vector of each example), then prints a short training summary.
        """
        # Extract intents and their example texts
        self.intents = [item['intent'] for item in self.intents_data]
        self.intent_texts = [item['text'] for item in self.intents_data]

        # Preprocess all training texts
        tokenized_texts = [preprocess_text(text) for text in self.intent_texts]

        # Fit TF-IDF vectorizer
        self.intent_vectors = self.vectorizer.fit_transform(tokenized_texts)

        print(f"âœ“ Trained on {len(self.intents)} intent examples")
        print(f"âœ“ Vocabulary size: {len(self.vectorizer.vocabulary)}")

    def _apply_fuzzy_matching(self, tokens):
        """
        Apply fuzzy matching to tokens to catch typos

        Tokens that already appear in the trained vocabulary are kept as
        they are: they are valid words, and "correcting" them would change
        the meaning of the message (for example "halo" is only two edits
        away from the key term "help"). Every other token is replaced by
        the closest key term within edit distance 2, if there is one.

        Args:
            tokens (list): Preprocessed tokens of the user message
        Returns:
            list: Tokens with recognized typos replaced by key terms
        """
        known_words = self.vectorizer.vocabulary
        corrected_tokens = []
        for token in tokens:
            if token in known_words:
                corrected_tokens.append(token)
                continue
            match = fuzzy_match(token, self.key_terms, threshold=2)
            corrected_tokens.append(match if match else token)
        return corrected_tokens

    def predict(self, user_input, top_k=3):
        """
        Predict intent for user input
        Args:
            user_input (str): User's message
            top_k (int): Maximum number of distinct intents to return
        Returns:
            list: List of tuples (intent, confidence, response), best first.
                  Each intent appears at most once, with the score of its
                  best-matching example. If nothing reaches the confidence
                  threshold, a single ("unknown", 0.0, fallback) tuple.
        """
        # Preprocess user input
        tokens = preprocess_text(user_input)

        # Apply fuzzy matching
        tokens = self._apply_fuzzy_matching(tokens)

        # Vectorize user input
        user_vector = self.vectorizer.transform([tokens])[0]

        # Calculate similarity with all training examples
        similarities = [
            (intent, cosine_similarity(user_vector, intent_vector))
            for intent, intent_vector in zip(self.intents, self.intent_vectors)
        ]

        # Sort by similarity (highest first); the sort is stable, so equal
        # scores keep the order of the training data.
        similarities.sort(key=lambda x: x[1], reverse=True)

        # Walk down the ranking and keep the first (= best) hit per intent.
        # Several examples share an intent, so without this the top K could
        # be the same intent repeated K times.
        results = []
        seen_intents = set()
        for intent, confidence in similarities:
            # Scores are descending: once one is too low, all the rest are.
            if len(results) >= top_k or confidence < self.confidence_threshold:
                break
            if intent in seen_intents:
                continue
            seen_intents.add(intent)
            response = self.responses_data.get(intent, "Maaf, saya tidak mengerti.")
            results.append((intent, confidence, response))

        # If no confident match, return fallback
        if not results:
            return [("unknown", 0.0, "Maaf, saya tidak mengerti pertanyaan Anda. Bisa tolong diperjelas?")]

        return results

    def chat(self, user_input):
        """
        Get chatbot response for user input

        Also prints a debug line with the detected intent and its confidence.

        Args:
            user_input (str): User's message
        Returns:
            str: Bot's response
        """
        results = self.predict(user_input, top_k=1)
        intent, confidence, response = results[0]

        print(f"\n[DEBUG] Detected Intent: {intent} (confidence: {confidence:.3f})")

        return response


In [None]:
# ============================================================================
# SECTION 6: DEMO & TESTING
# ============================================================================

# Sample dataset for Chattea
# Training examples, grouped by intent. Flattened into one
# {"intent": ..., "text": ...} record per example, in the order written here.
intents_dataset = [
    {"intent": intent_name, "text": example}
    for intent_name, examples in (
        # Blast message intents
        ("blast_message", (
            "cara blast message",
            "kirim pesan massal",
            "bagaimana mengirim blast",
            "blast pesan ke banyak orang",
        )),
        # Schedule message intents
        ("schedule_message", (
            "jadwalkan pesan",
            "cara schedule message",
            "kirim pesan terjadwal",
            "atur waktu kirim pesan",
        )),
        # Filter WhatsApp number
        ("filter_number", (
            "filter nomor whatsapp",
            "cek nomor terdaftar",
            "validasi nomor whatsapp",
            "nomor aktif whatsapp",
        )),
        # Panasin WhatsApp
        ("panasin_wa", (
            "panasin whatsapp",
            "cara panasin wa",
            "hindari blokir whatsapp",
            "warming up whatsapp",
        )),
        # General help
        ("help", (
            "bantuan",
            "tolong saya",
            "help",
            "apa yang bisa dilakukan",
        )),
        # Greeting
        ("greeting", (
            "halo",
            "hi",
            "hai",
            "selamat pagi",
        )),
    )
    for example in examples
]

# One canned answer per intent.
responses_dataset = {
    "blast_message": "Untuk blast message, buka menu 'Blast' â†’ Pilih kontak â†’ Tulis pesan â†’ Klik 'Kirim'. Anda bisa mengirim hingga 1000 pesan sekaligus!",
    "schedule_message": "Untuk schedule message, buka 'Schedule' â†’ Pilih kontak â†’ Tulis pesan â†’ Atur tanggal & waktu â†’ Simpan. Pesan akan terkirim otomatis!",
    "filter_number": "Fitur Filter Number membantu Anda memeriksa nomor mana yang terdaftar di WhatsApp. Buka 'Filter' â†’ Upload daftar nomor â†’ Sistem akan validasi secara otomatis.",
    "panasin_wa": "Panasin WhatsApp adalah fitur untuk mengurangi risiko banned. Sistem membuat beberapa instance chat saling berkomunikasi. Aktifkan di menu 'Settings' â†’ 'Warming Up'.",
    "help": "Saya bisa bantu Anda dengan:\n1. Blast Message\n2. Schedule Message\n3. Filter WhatsApp Number\n4. Panasin WhatsApp\n\nSilakan tanyakan fitur yang ingin Anda pelajari!",
    "greeting": "Halo! Saya Chattea Bot. Saya siap membantu Anda menggunakan aplikasi Chattea. Ada yang bisa saya bantu?",
}

# Title banner, then train the bot on the sample data
print("=" * 60, "CHATTEA CHATBOT - TF-IDF FROM SCRATCH", "=" * 60, sep="\n")

bot = ChatteaChatbot(intents_dataset, responses_dataset, confidence_threshold=0.25)

# Sample user messages; several contain deliberate misspellings
test_inputs = [
    "cara blst message",            # blst   -> blast
    "jadwalkan pesan saya",
    "gimana filter nomor wa",
    "pnasin whatsapp",              # pnasin -> panasin
    "tolong bantu saya",
    "halo bot",
    "kirim pesam massal",           # pesam  -> pesan
    "schedule mesage ke customer",  # mesage -> message
]

print("\n" + "=" * 60, "TESTING CHATBOT WITH TYPOS", "=" * 60, sep="\n")

# Echo each message, then the bot's answer (chat() prints its debug line in between)
for test_input in test_inputs:
    print(f"\nðŸ‘¤ User: {test_input}")
    response = bot.chat(test_input)
    print(f"ðŸ¤– Bot: {response}")


In [None]:

# print("\n" + "=" * 60)
# print("INTERACTIVE MODE (type 'quit' to exit)")
# print("=" * 60)

# while True:
#     user_input = input("\nðŸ‘¤ You: ")
#     if user_input.lower() in ['quit', 'exit', 'keluar']:
#         print("ðŸ‘‹ Terima kasih! Sampai jumpa!")
#         break
    
#     response = bot.chat(user_input)
#     print(f"ðŸ¤– Bot: {response}")