In [None]:
# Core data-handling and numeric libraries.
import pandas as pd
import random
import numpy as np

# NLTK: download the 'punkt' and 'wordnet' resources, then create the
# WordNet lemmatizer instance.
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# NOTE(review): Sequential, Activation, Dropout and SGD are not referenced by
# any visible cell (the model below is built with the tensorflow.keras
# functional API) -- confirm whether these imports are still needed.
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


# Import Data

In [None]:
# Location of the question/answer workbook (absolute path in the workspace).
DATA_PATH = '/workspaces/Willy_Code_Chatbot/mafupe_mokone_life_experiences.xlsx'

# Read the workbook into a DataFrame.
df = pd.read_excel(DATA_PATH)

In [None]:
# Preview the first rows to check that the spreadsheet loaded as expected.
df.head()

Unnamed: 0,Question,Answer
0,What inspired you to become a good student?,I aimed to overcome criticism and prove to mys...
1,What made you switch schools?,I switched schools to take my academic goals m...
2,What challenges did you face in high school?,I struggled with self-doubt and facing critici...
3,How did you excel in mathematics after struggl...,"I worked hard, especially during term 3 of gra..."
4,What is your experience as a sports and recrea...,As Sports and Recreation Officer for ASABA NWU...


In [None]:
# Show the column labels exactly as they were read from the spreadsheet.
print(df.columns)


Index(['Question', 'Answer'], dtype='object')


In [None]:
# Lowercase every column label so later cells can address df['question'] and df['answer'].
df.columns = list(map(str.lower, df.columns))


# Preprocessing Text

In [None]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data files used by the preprocessing below; 'punkt_tab' is
# downloaded in addition to 'punkt' to resolve the punkt_tab issue.
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# English stop-word set and WordNet lemmatizer shared by preprocess_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise raw text into a space-joined string of lemmatized content words.

    The text is lowercased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic (punctuation, numbers) or that are English stop
    words are dropped; the remaining tokens are lemmatized.

    Args:
        text: The raw text. Non-string values (for example the NaN that pandas
            produces for an empty spreadsheet cell, or None) are treated as
            empty text instead of raising.

    Returns:
        str: The kept tokens joined by single spaces, or ``""`` when the input
        is not a string or no token survives the filtering.
    """
    # Guard: Series.apply passes NaN for missing cells and float has no .lower().
    if not isinstance(text, str):
        return ""

    tokens = word_tokenize(text.lower())

    # The stop-word / alphabetic filter is applied to the raw token; only the
    # tokens that pass are lemmatized.
    kept = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

    return " ".join(kept)



# Make sure the column labels are lowercase, then show them.
df.columns = [label.lower() for label in df.columns]
print(df.columns)

# Add a cleaned '<name>_processed' column next to each raw text column.
for source_col in ('question', 'answer'):
    df[f'{source_col}_processed'] = df[source_col].apply(preprocess_text)

# Print the raw and cleaned columns side by side for the first rows.
preview_cols = ['question', 'answer', 'question_processed', 'answer_processed']
print(df[preview_cols].head())


[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Index(['question', 'answer'], dtype='object')
                                            question  \
0        What inspired you to become a good student?   
1                      What made you switch schools?   
2       What challenges did you face in high school?   
3  How did you excel in mathematics after struggl...   
4  What is your experience as a sports and recrea...   

                                              answer  \
0  I aimed to overcome criticism and prove to mys...   
1  I switched schools to take my academic goals m...   
2  I struggled with self-doubt and facing critici...   
3  I worked hard, especially during term 3 of gra...   
4  As Sports and Recreation Officer for ASABA NWU...   

                          question_processed  \
0               inspired become good student   
1                         made switch school   
2                 challenge face high school   
3  excel mathematics struggling earlier year   
4        experience sport recreation off

# Initializing Training Data

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed length every sequence is padded/truncated to (adjust based on your data).
maxlen = 30

# Fit one tokenizer on the questions and answers together so both share a vocabulary.
texts = [*df['question_processed'], *df['answer_processed']]
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

# Map each cleaned text to its list of token ids.
question_seq, answer_seq = (
    tokenizer.texts_to_sequences(df[column])
    for column in ('question_processed', 'answer_processed')
)

# Pad at the end of each sequence up to maxlen.
question_pad, answer_pad = (
    pad_sequences(sequence, maxlen=maxlen, padding='post')
    for sequence in (question_seq, answer_seq)
)


In [None]:
import random

import numpy as np

# Fixed seed so the negative sampling and the shuffle are reproducible.
SEED = 42
random.seed(SEED)

# Positive pairs: every question with its own answer (label 1).
answers = df['answer_processed'].tolist()
n_rows = len(answers)

# Negative pairs: every question with an answer whose processed text differs
# from its own (label 0). Sampling only among differing answers yields one
# negative per question instead of dropping it on an unlucky draw.
neg_q_rows = []
neg_a_rows = []
for i in range(n_rows):
    candidates = [j for j in range(n_rows) if answers[j] != answers[i]]
    if candidates:  # empty only when every processed answer is identical
        neg_q_rows.append(i)
        neg_a_rows.append(random.choice(candidates))

# Integer-array indexing keeps the 2-D shape even when no negatives exist,
# so the concatenations below cannot fail on an empty list.
Xq_neg = question_pad[np.asarray(neg_q_rows, dtype=int)]
Xa_neg = answer_pad[np.asarray(neg_a_rows, dtype=int)]

# Stack positives and negatives; labels follow the same order.
X_question = np.concatenate([question_pad, Xq_neg])
X_answer = np.concatenate([answer_pad, Xa_neg])
y = np.concatenate([np.ones(n_rows), np.zeros(len(Xq_neg))])

# Shuffle positives and negatives together. Keras' validation_split takes the
# LAST fraction of the arrays as given, so without this the validation set
# would consist solely of label-0 pairs.
shuffle_order = np.random.default_rng(SEED).permutation(len(y))
X_question = X_question[shuffle_order]
X_answer = X_answer[shuffle_order]
y = y[shuffle_order]


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate

# One embedding row per token id known to the tokenizer, plus one for id 0
# (the value pad_sequences fills with by default).
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 64

# Two inputs of padded token ids: the question and the candidate answer.
q_input = Input(shape=(maxlen,))
a_input = Input(shape=(maxlen,))

# Shared encoder: the same Embedding and LSTM instances process both inputs.
# `input_length` is not passed: it is deprecated in Keras 3, and the sequence
# length is already fixed by the Input layers above.
embedding_layer = Embedding(vocab_size, embedding_dim)
lstm_layer = LSTM(64)

q_embed = lstm_layer(embedding_layer(q_input))
a_embed = lstm_layer(embedding_layer(a_input))

# Concatenate both encodings and classify the pair: the sigmoid output is the
# predicted probability that the answer belongs to the question.
merged = Concatenate()([q_embed, a_embed])
dense = Dense(64, activation='relu')(merged)
output = Dense(1, activation='sigmoid')(dense)

model = Model(inputs=[q_input, a_input], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


2025-04-04 10:15:32.936846: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [None]:
# Train on the (question, answer) pairs, holding out 10% for validation.
# NOTE: Keras takes the validation slice from the end of the arrays before any
# shuffling, so the inputs should already be in shuffled order.
model.fit(
    x=[X_question, X_answer],
    y=y,
    batch_size=32,
    epochs=5,
    validation_split=0.1,
)


Epoch 1/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 431ms/step - accuracy: 0.5100 - loss: 0.6926 - val_accuracy: 0.0000e+00 - val_loss: 0.7297
Epoch 2/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.5789 - loss: 0.6880 - val_accuracy: 0.0000e+00 - val_loss: 0.7661
Epoch 3/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.5685 - loss: 0.6855 - val_accuracy: 0.0000e+00 - val_loss: 0.8119
Epoch 4/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.5789 - loss: 0.6821 - val_accuracy: 0.0000e+00 - val_loss: 0.8628
Epoch 5/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.5476 - loss: 0.6906 - val_accuracy: 0.0000e+00 - val_loss: 0.8742


<keras.src.callbacks.history.History at 0x7d3daf6735c0>

In [None]:
def predict_match(q, a):
    """Score how well answer ``a`` matches question ``q``.

    Both texts go through ``preprocess_text``, the fitted ``tokenizer`` and
    ``pad_sequences`` before being passed to ``model``.

    Args:
        q: Raw question text.
        a: Raw candidate answer text.

    Returns:
        The model's sigmoid output for the pair (a NumPy float scalar). The
        model is trained with label 1 for matching pairs, so values nearer 1
        indicate a match.
    """
    def _encode(text):
        # padding='post' mirrors how the training sequences were padded; the
        # pad_sequences default ('pre') would place the padding on the other
        # side and feed the model a layout it was not trained on.
        token_ids = tokenizer.texts_to_sequences([preprocess_text(text)])
        return pad_sequences(token_ids, maxlen=maxlen, padding='post')

    # predict returns one row per input pair with a single output column.
    score = model.predict([_encode(q), _encode(a)])[0][0]
    return score

# Try it: print the model's match score for a sample question/answer pair.
print(predict_match("Why do you love travel?", "I always loved exploring cultures."))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
0.56435984


In [None]:
# Save the model after inference
def save_model_after_inference(model, model_filename="faq_model_after_inference.h5"):
    """Write ``model`` to disk and print a confirmation line.

    Args:
        model: Object exposing a ``save(path)`` method (the trained model).
        model_filename: Destination passed to ``model.save``.

    Returns:
        None.
    """
    model.save(model_filename)
    confirmation = f"Model saved as {model_filename}"
    print(confirmation)


In [None]:
# Call this after inference: saves the model under the default file name
# (faq_model_after_inference.h5).
save_model_after_inference(model)




Model saved as faq_model_after_inference.h5
