# AIDI 2004 - GROUP 3

# FINAL PROJECT

ONTARIO LANDLOARD AND TENANT TRIBUNAL CHATBOT

1) Import Libraries.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import gradio as gr
import joblib

2) Load data and split dataset.

In [2]:
# Load data from CSV
data = pd.read_csv("landlord_data.csv")

# Split data into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(data["instruction"], data["output"], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

3) Define Multinomial Naive Bayes model.

In [3]:
# Define a vectoriser
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)

# Train the Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

4) Preprocess data.

In [4]:
# Define Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Define max_len
max_len = 100  # Adjust as needed

# Tokenize and pad sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Combine training and test sets for label encoding
combined_labels = pd.concat([y_train, y_test])

# Initialize LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(combined_labels)

# Fit label encoder and transform labels
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

num_unique_labels = len(set(combined_labels))

5) Define and train LSTM model.

In [5]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len),
    LSTM(units=100),
    Dense(units=num_unique_labels, activation='softmax')  # Adjust output units based on the number of unique labels
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train_pad, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test_encoded))



Epoch 1/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 111ms/step - accuracy: 0.0000e+00 - loss: 7.8000 - val_accuracy: 0.0000e+00 - val_loss: 7.7898
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 109ms/step - accuracy: 0.0000e+00 - loss: 7.8100 - val_accuracy: 0.0000e+00 - val_loss: 7.6898
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 111ms/step - accuracy: 0.0016 - loss: 7.7759 - val_accuracy: 0.0050 - val_loss: 8.6564
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 111ms/step - accuracy: 0.0028 - loss: 7.6570 - val_accuracy: 0.0050 - val_loss: 8.4921
Epoch 5/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 111ms/step - accuracy: 0.0025 - loss: 7.6311 - val_accuracy: 0.0050 - val_loss: 9.5063
Epoch 6/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 106ms/step - accuracy: 0.0033 - loss: 7.5848 - val_accuracy: 0.0050 - val_loss: 9.6814
Epoch 7/10


<keras.src.callbacks.history.History at 0x21c8949dac0>

6) Save local instances of the models

In [6]:
# Save the Naive Bayes classifier
joblib.dump(clf, "naive_bayes_model.joblib")

# Load the Naive Bayes classifier
clf = joblib.load("naive_bayes_model.joblib")

# Save LSTM Model
model.save("lstm_model.keras")

7) Evaluate Both model's performance

In [7]:
# Evaluate Naive Bayes model
X_val_counts = vectorizer.transform(X_val)
y_val_pred_nb = clf.predict(X_val_counts)
accuracy = accuracy_score(y_val, y_val_pred_nb)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.012578616352201259


In [8]:
# Evaluate LSTM model
X_val_pad = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=max_len, padding='post')
y_val_pred_lstm = model.predict(X_val_pad)
y_val_pred_lstm_classes = label_encoder.inverse_transform(y_val_pred_lstm.argmax(axis=-1))
loss, accuracy = model.evaluate(X_test_pad, y_test_encoded)
print("LSTM Model Performance:")
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - accuracy: 0.0042 - loss: 10.4107
LSTM Model Performance:
Test Loss: 10.423933982849121
Test Accuracy: 0.005025125574320555


8) Define the function to get the most similar question from the training set.

In [9]:
def get_most_similar_question(input_question):
    input_question_vector = vectorizer.transform([input_question])
    similarity = input_question_vector.dot(X_train_counts.T)
    most_similar_index = similarity.argmax()
    max_similarity = similarity[0, most_similar_index]
    if max_similarity > 0.7:  # Adjust threshold as needed
        return data.iloc[most_similar_index]["instruction"]
    else:
        return None

9) Define the Naive Bayes-based chatbot function.

In [10]:
def chatbot_nb(question):
    # Vectorize the input question using the same CountVectorizer object
    input_question_vector = vectorizer.transform([question])
    
    # Calculate the probability scores for each class (answer) using the trained Naive Bayes classifier
    probabilities = clf.predict_proba(input_question_vector)
    
    # Find the index of the class (answer) with the highest probability
    most_probable_index = np.argmax(probabilities)
    
    # Retrieve the corresponding answer from the classes
    answer_from_nb = clf.classes_[most_probable_index]
    
    # Calculate the similarity between the input question and the training questions
    similarity = input_question_vector.dot(X_train_counts.T)
    most_similar_index = similarity.argmax()
    max_similarity = similarity[0, most_similar_index]
    
    # If both the Naive Bayes prediction and similarity-based prediction agree or if the similarity is above a certain threshold, return the answer
    if (answer_from_nb == data.iloc[most_similar_index]["instruction"]) or (max_similarity > 0.7):
        return answer_from_nb
    else:
        return None

10) Define the LSTM-based chatbot function.

In [11]:
def chatbot_lstm(question):
    input_sequence = tokenizer.texts_to_sequences([question])
    input_sequence_pad = pad_sequences(input_sequence, maxlen=max_len, padding='post')
    prediction = model.predict(input_sequence_pad)
    predicted_class = label_encoder.inverse_transform([prediction.argmax()])[0]
    return predicted_class

11) Fine tune both models

In [12]:
# Fine-tune Naive Bayes model
def fine_tune_naive_bayes(X_train_new, y_train_new):
    # Load the saved Naive Bayes classifier
    clf = joblib.load("naive_bayes_model.joblib")
    
    # # Update the model with new data
    # X_train_new_counts = vectorizer.transform(X_train_new)
    clf.partial_fit(X_train_new, y_train_new, classes=np.unique(y_train_new))
    
    # Save the fine-tuned model
    joblib.dump(clf, "naive_bayes_fine.joblib")

In [13]:
# Fine-tune the Naive Bayes model with additional data
fine_tune_naive_bayes(X_train_counts, y_train)

In [14]:
def fine_tune_lstm(X_train_new, y_train_new):
    # Load the saved LSTM model
    model = tf.keras.models.load_model("lstm_model.keras")
    
    # # Preprocess the new data
    # X_train_new_seq = tokenizer.texts_to_sequences(X_train_new)
    # X_train_new_pad = pad_sequences(X_train_new_seq, maxlen=max_len, padding='post')
    
    # Fine-tune the model
    model.fit(X_train_new, y_train_new, epochs=5, batch_size=32, validation_split=0.1)
    
    # Save the fine-tuned model
    model.save("lstm_model_fine.keras")

In [15]:
# Fine-tune the LSTM model with additional data
fine_tune_lstm(X_test_pad, y_test_encoded)

Epoch 1/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 137ms/step - accuracy: 0.0043 - loss: 9.2459 - val_accuracy: 0.0000e+00 - val_loss: 6.8516
Epoch 2/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 105ms/step - accuracy: 0.0000e+00 - loss: 6.8568 - val_accuracy: 0.0000e+00 - val_loss: 7.8648
Epoch 3/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 101ms/step - accuracy: 0.0000e+00 - loss: 6.6055 - val_accuracy: 0.0000e+00 - val_loss: 8.4996
Epoch 4/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 112ms/step - accuracy: 0.0000e+00 - loss: 6.5174 - val_accuracy: 0.0000e+00 - val_loss: 9.1993
Epoch 5/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 109ms/step - accuracy: 0.0024 - loss: 6.4810 - val_accuracy: 0.0000e+00 - val_loss: 9.5804


12) Evaluate Finetuned models

In [16]:
# Evaluate Naive Bayes model
X_val_counts = vectorizer.transform(X_val)
y_val_pred_nb = clf.predict(X_val_counts)
accuracy = accuracy_score(y_val, y_val_pred_nb)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.012578616352201259


In [17]:
# Evaluate LSTM model
X_val_pad = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=max_len, padding='post')
y_val_pred_lstm = model.predict(X_val_pad)
y_val_pred_lstm_classes = label_encoder.inverse_transform(y_val_pred_lstm.argmax(axis=-1))
loss, accuracy = model.evaluate(X_test_pad, y_test_encoded)
print("LSTM Model Performance:")
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.0042 - loss: 10.4107
LSTM Model Performance:
Test Loss: 10.423933982849121
Test Accuracy: 0.005025125574320555


13) Define user interface.

In [18]:
# Create a Gradio interface for Naive Bayes-based chatbot
chatbot_nb_interface = gr.Interface(fn=chatbot_nb, inputs="text", outputs="text", title="Naive Bayes Chatbot")

# Create a Gradio interface for LSTM-based chatbot
chatbot_lstm_interface = gr.Interface(fn=chatbot_lstm, inputs="text", outputs="text", title="LSTM Chatbot")

In [19]:
chatbot_nb_interface.launch(share=False)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [20]:
chatbot_lstm_interface.launch(share=False)

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
