In [607]:
import os
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, roc_curve, auc
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
from flask import Flask, render_template, request, jsonify
from joblib import load  # Ensure joblib is imported for model loading

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /home/lerato/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lerato/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lerato/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [608]:
# Load dataset
current_dir = os.getcwd()
df = pd.read_csv(os.path.join(current_dir, 'chatbot_data.csv'))

# Handle missing values
df['Query'] = df['Query'].fillna('')
df['Intent'] = df['Intent'].fillna('unknown')
df['Response'] = df['Response'].fillna('No response')

# Verify no missing values remain
df.info()

# Initialize NLTK utilities
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Query     121 non-null    object
 1   Response  121 non-null    object
 2   Intent    121 non-null    object
dtypes: object(3)
memory usage: 3.0+ KB


In [609]:
def preprocess(text):
    """
    Preprocesses the input text by removing non-alphabetic characters, tokenizing, removing stopwords,
    and applying lemmatization.
    """
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())  # Remove non-alphabetic characters
    tokens = word_tokenize(text)                     # Tokenization
    tokens = [token for token in tokens if token not in stop_words]  # Remove stopwords
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatization

    return ' '.join(lemmatized_tokens)
# Apply preprocessing to queries
df['Processed_Query'] = df['Query'].apply(preprocess)

# TF-IDF Vectorizer setup
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
vectorizer.fit(df['Processed_Query'])

In [610]:
knowledge_base = {
    "hello": ["Hi! How can I help you regarding Sol Plaatje University?", "Hello! What can I assist you with about SPU?", "Greetings! How may I assist you today regarding Sol Plaatje University?"],
    "hey": ["Hey there! How can I help you with information about Sol Plaatje University?", "Hey! What would you like to know about SPU?", "Hello! How may I assist you with SPU queries?"],
    "hi": ["Hi! How can I assist you with Sol Plaatje University today?", "Hello! What can I help you with regarding SPU?", "Greetings! How may I assist you with information on Sol Plaatje University?"],
    "greetings": ["Greetings! How can I assist you with Sol Plaatje University?", "Hello! What information can I provide about SPU?"],
    "courses": ["Sol Plaatje University offers various courses across faculties like Education, Humanities, Economic and Management Sciences, and Natural and Applied Sciences.", "SPU offers undergraduate and postgraduate programs. You can check the official website for details on specific courses."],
    "application_fee": ["No, you don't need to pay an application fee when applying for admission to Sol Plaatje University."],
    "faculties": ["Sol Plaatje University has four faculties: Education, Humanities, Economic and Management Sciences, and Natural and Applied Sciences.", "SPU has four faculties offering a range of programs. For detailed information, please visit the university's website."],
    "apply_admission": ["To apply for admission at Sol Plaatje University, you need to visit the official admissions portal and complete the online application form."],
    "accommodation": ["Yes, Sol Plaatje University does provide accommodation for students. For more details, please check the accommodation section on the university's website."],
    "apply_accommodation": ["To apply for accommodation at SPU, you should fill out the accommodation application form available on their website once you have received your admission offer."],
    "goodbye": ["Goodbye! Feel free to ask more questions about Sol Plaatje University anytime.", "Bye! Don't hesitate to reach out for more information about SPU.", "Take care! Let me know if you need further assistance with SPU."],
    "contact": ["You can contact Sol Plaatje University via their official website or call their administrative office @ +27 677 8765 for further information.", "For direct inquiries, visit SPU's contact page on their website."],
    "location": ["Sol Plaatje University is located in Kimberley, Northern Cape, South Africa."],
    "apply": ["You can apply to Sol Plaatje University through their online application portal on the official website.", "To apply for a course at SPU, visit their admissions portal and complete the online application form."],
    "establishment": ["Sol Plaatje University was established in 2014."],
    "library": ["Yes Sol Plaatje University has a library, it's library offers both physical and digital resources. It is open to students and provides access to research materials, e-books, and journals."],
    "name": ["My name is Gemmies!", "I am Gemmies i'm designed to assist you about SPU!"]
}

def rule_based_response(query):
    """Check for predefined responses from the knowledge base and return a single response."""
    query_lower = query.lower().strip()
    
    # Get the list of responses from the knowledge base
    responses = knowledge_base.get(query_lower)
    
    if responses:
        # Randomly select one response from the list
        return random.choice(responses)
    
    return None

def ml_get_response(query):
    """Get response using the machine learning model."""
    new_query_processed = [preprocess(query)]
    new_query_tfidf = vectorizer.transform(new_query_processed)
    existing_queries_tfidf = vectorizer.transform(df['Processed_Query'])
    
    # Compute cosine similarity between new and existing queries
    similarity_scores = cosine_similarity(new_query_tfidf, existing_queries_tfidf)
    index = similarity_scores.argmax()
    
    # Threshold for similarity to respond with default message if unclear
    if similarity_scores.max() < 0.1:
        return "I'm sorry, I don't understand that. Please try rephrasing or add a question mark."
    
    return df.iloc[index]['Response']

In [611]:
def extract_entities(tokens):
    entities = [token for token in tokens if token in knowledge_base]
    return entities

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return synonyms

# Define a pattern-matching function for detecting intents
def pattern_matching(user_input):
    """
    Matches user input against pre-defined patterns to identify intent.
    Returns the matched intent if found, else returns None.
    """
    patterns = {
        "application_fee": r"\b(application fee|is there an application fee|application costs?|any fees to apply?)\b",
        "faculties": r"\b(how many faculties does spu have|number of faculties|faculties|what faculties are offered)\b",
        "apply_admission": r"\b(how do I apply for admission|how to apply for admission|apply for admission|application process)\b",
        "accommodation": r"\b(does it have accommodation|accommodation|housing|residence|res|is housing available)\b",
        "apply_accommodation": r"\b(how do I apply for accommodation|apply for accommodation|apply for residence|apply for res|housing application)\b",
        "courses": r"\bcourses?\b|\bsubjects?\b|what\s*courses\b|course\s*offerings?",
        "library": r"\blibrary\b|\bresources\b|library\s*information",
        "location": r"\blocation\b|\blocated\b|\bwhere\s*is\b|find\s*us",
        "contact": r"\bcontact\b|\bget\s*in\s*touch\b|reach\s*out",
        "apply": r"\bapply\b|\bapplication\b|how\s*to\s*apply",
        "hello": r"\b(hi|hello|hey|greetings)\b",
        "goodbye": r"\b(goodbye|bye|exit|quit)\b",
        "establishment": r"\b(when was spu established|establishment|founding year|established)\b",
        "name": r"\b(what is your name|what's your name|who are you|name)\b",
    }

    for key, pattern in patterns.items():
        if re.search(pattern, user_input, re.IGNORECASE):
            return key
    return None

In [612]:
def get_response(user_input, model=None):
    """Get response based on user input using pattern matching and ML model."""
    normalized_input = user_input.lower().replace("sol plaatje university", "spu").replace("spu", "sol plaatje university")
    
    # Extract entities
    entities = extract_entities(word_tokenize(normalized_input))
    if entities:
        print("Extracted Entities:", entities)
    
    # Pattern matching for rule-based responses
    matched_intent = pattern_matching(normalized_input)
    if matched_intent:
        return random.choice(knowledge_base.get(matched_intent, ["I'm sorry, I don't understand that."]))
    
    # Use ML model for responses if no pattern matched
    if model:
        return ml_get_response(user_input)  # Use the ML model to get the response

    return "I'm sorry, I don't understand that."

In [613]:
def evaluate_model():
    accuracies = []
    
    # Remove low-frequency classes
    df_filtered = df[df['Intent'].isin(df['Intent'].value_counts()[df['Intent'].value_counts() > 1].index)]
    
    # Adjust n_splits for K-Fold
    n_splits = min(5, df_filtered['Intent'].value_counts().min())
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    model = LogisticRegression()
    model_pipeline = make_pipeline(vectorizer, model)

    for train_index, test_index in skf.split(df_filtered['Query'], df_filtered['Intent']):
        X_train, X_test = df_filtered['Query'].iloc[train_index], df_filtered['Query'].iloc[test_index]
        y_train, y_test = df_filtered['Response'].iloc[train_index], df_filtered['Response'].iloc[test_index]

        # Fit the model
        model_pipeline.fit(X_train, y_train)

        correct_predictions = 0
        
        # Make predictions and calculate accuracy
        for test_query, expected_response in zip(X_test, y_test):
            predicted_response = get_response(test_query)
            
            if predicted_response == expected_response:
                correct_predictions += 1

        accuracy = correct_predictions / len(X_test) * 6
        accuracies.append(accuracy)

    # Output the average accuracy
    average_accuracy = np.mean(accuracies)
    # print(f'Average accuracy: {average_accuracy * 100:.2f}%')

    return model_pipeline, accuracies  # Return both the model and accuracies
  # Return both the model and accuracies



# Save and load model functionality
def save_model(model_pipeline):
    joblib.dump(model_pipeline, os.path.join(current_dir, 'chatbot_model.pkl'))

def load_model():
    return joblib.load(os.path.join(current_dir, 'chatbot_model.pkl'))

# Evaluate the model and save it
model, accuracies = evaluate_model()
save_model(model)

Extracted Entities: ['accommodation']
Extracted Entities: ['apply']
Extracted Entities: ['accommodation']
Extracted Entities: ['contact']
Extracted Entities: ['library']
Extracted Entities: ['library']
Extracted Entities: ['apply']
Extracted Entities: ['courses']
Extracted Entities: ['courses']
Extracted Entities: ['apply']
Extracted Entities: ['courses']
Extracted Entities: ['apply']
Extracted Entities: ['courses']
Average accuracy: 5.08%


In [614]:
# Load the saved machine learning model
def load_ml_model():
    """Loads the trained machine learning model."""
    return load_model()

In [615]:
def chatbot():
    print("Chatbot: Hello! How can I assist you with university information today?")
    model = load_ml_model()  # Load ML model
    
    while True:
        user_input = input("You: ")
        
        if user_input.lower() in ["exit", "quit", "bye", "goodbye"]:
            print("Chatbot: Goodbye!")
            break
        
        response = get_response(user_input, model)
        print(f"Chatbot: {response}")
        
if __name__ == "__main__":
    chatbot()


Chatbot: Hello! How can I assist you with university information today?


Chatbot: I'm sorry, I don't understand that. Please try rephrasing or add a question mark.
Chatbot: Goodbye!
