**Data preprocessing**

In [1]:
import pandas as pd

data = pd.read_csv('/content/drive/MyDrive/topical_chat.csv')

In [2]:
data

Unnamed: 0,conversation_id,message,sentiment
0,1,Are you a fan of Google or Microsoft?,Curious to dive deeper
1,1,Both are excellent technology they are helpfu...,Curious to dive deeper
2,1,"I'm not a huge fan of Google, but I use it a...",Curious to dive deeper
3,1,Google provides online related services and p...,Curious to dive deeper
4,1,"Yeah, their services are good. I'm just not a...",Curious to dive deeper
...,...,...,...
188373,8628,"Wow, it does not seem like that long. Since I...",Surprised
188374,8628,"I havent seen that episode, I might google it...",Curious to dive deeper
188375,8628,I don't think I have either. That's an insane...,Curious to dive deeper
188376,8628,"I did, my little brother used to love Thomas ...",Happy


In [3]:
# Group messages by conversation ID
grouped_data = data.groupby('conversation_id')['message'].apply(list)

# Prepare pairs of questions and answers for each conversation
conversations = []
for _, messages in grouped_data.items():
    questions = messages[::2]  # Assuming questions are at even indices
    answers = messages[1::2]   # Assuming answers are at odd indices

    for question, answer in zip(questions, answers):
        conversations.append({'question': question, 'answer': answer})

# Print the first few conversations for verification
for i, conv in enumerate(conversations):
    print(f"Conversation {i + 1}:")
    print(f"Question: {conv['question']}")
    print(f"Answer: {conv['answer']}\n")
    if i >= 40:
        break

Conversation 1:
Question:  Are you a fan of Google or Microsoft?
Answer:  Both are excellent technology they are helpful in many ways. For the security purpose both are super.

Conversation 2:
Question:  I'm not  a huge fan of Google, but I use it a lot because I have to. I think they are a monopoly in some sense. 
Answer:  Google provides online related services and products, which includes online ads, search engine and cloud computing.

Conversation 3:
Question:  Yeah, their services are good. I'm just not a fan of intrusive they can be on our personal lives. 
Answer:  Google is leading the alphabet subsidiary and will continue to be the Umbrella company for Alphabet internet interest.

Conversation 4:
Question:  Did you know Google had hundreds of live goats to cut the grass in the past? 
Answer:  It is very interesting. Google provide "Chrome OS" which is a light weight OS. Google provided a lot of hardware mainly in 2010 to 2015. 

Conversation 5:
Question:  I like Google Chrome. 

In [4]:
import nltk
from nltk.tokenize import word_tokenize
import string

# Download necessary resources if not already downloaded
nltk.download('punkt')

# Define a function to clean and preprocess the text
def clean_and_preprocess(text):
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = remove_punctuation(text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Join tokens back into a cleaned sentence
    cleaned_text = ' '.join(tokens)

    return cleaned_text

def remove_punctuation(text):
    # Create a translation table to remove punctuation
    translator = str.maketrans('', '', string.punctuation)

    # Remove punctuation using the translation table
    text_without_punctuation = text.translate(translator)

    return text_without_punctuation

# Clean and preprocess the questions and answers in conversations
cleaned_conversations = []
for conv in conversations:
    cleaned_question = clean_and_preprocess(conv['question'])
    cleaned_answer = clean_and_preprocess(conv['answer'])
    cleaned_conversations.append({'question': cleaned_question, 'answer': cleaned_answer})

# Print the first few cleaned conversations for verification
for i, conv in enumerate(cleaned_conversations):
    print(f"Conversation {i + 1}:")
    print(f"Cleaned Question: {conv['question']}")
    print(f"Cleaned Answer: {conv['answer']}\n")
    if i >= 20:  # Print the first 5 cleaned conversations
        break

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Conversation 1:
Cleaned Question: are you a fan of google or microsoft
Cleaned Answer: both are excellent technology they are helpful in many ways for the security purpose both are super

Conversation 2:
Cleaned Question: im not a huge fan of google but i use it a lot because i have to i think they are a monopoly in some sense
Cleaned Answer: google provides online related services and products which includes online ads search engine and cloud computing

Conversation 3:
Cleaned Question: yeah their services are good im just not a fan of intrusive they can be on our personal lives
Cleaned Answer: google is leading the alphabet subsidiary and will continue to be the umbrella company for alphabet internet interest

Conversation 4:
Cleaned Question: did you know google had hundreds of live goats to cut the grass in the past
Cleaned Answer: it is very interesting google provide chrome os which is a light weight os google provided a lot of hardware mainly in 2010 to 2015

Conversation 5:
Cle

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Combine all cleaned questions and answers into a single list
all_text = [conv['question'] + ' ' + conv['answer'] for conv in cleaned_conversations]

# Initialize a Tokenizer
tokenizer = Tokenizer()  # Use <OOV> for out-of-vocabulary words

# Fit the tokenizer on the text
tokenizer.fit_on_texts(all_text)

# Add '<start>' and '<end>' tokens to the tokenizer's word_index
tokenizer.word_index['<start>'] = len(tokenizer.word_index) + 1
tokenizer.word_index['<end>'] = len(tokenizer.word_index) + 2

# Convert text to sequences of word indices
sequences = tokenizer.texts_to_sequences(all_text)

# Find the maximum sequence length
max_seq_length = max(len(seq) for seq in sequences)

# Pad sequences to make them of the same length
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post', truncating='post')

# Create input-output pairs for the encoder-decoder model
input_data = padded_sequences[:, :-1]  # Input is the question (remove the last token)
output_data = padded_sequences[:, 1:]   # Output is the answer (remove the first token)

# Convert input and output sequences to numpy arrays
input_data = np.array(input_data)
output_data = np.array(output_data)

# Print the vocabulary size
vocab_size = len(tokenizer.word_index)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 41913


In [6]:
max_seq_length

187

In [7]:
len(padded_sequences)
pad_max_seq_length = max(len(seq) for seq in padded_sequences)
pad_max_seq_length

187

In [8]:
len(sequences)

91174

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Masking
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import math
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Enable mixed precision training
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Define hyperparameters
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for the OOV token
embedding_dim = 128
latent_dim = 256
batch_size = 48
epochs = 5

# Split the data into train and test sets
train_input, test_input, train_output, test_output = train_test_split(input_data, output_data, test_size=0.2, random_state=42)

def create_model(vocab_size, embedding_dim, latent_dim):
    encoder_inputs = Input(shape=(max_seq_length - 1,))
    encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
    encoder_masked = Masking(mask_value=0)(encoder_embedding)
    encoder_lstm = LSTM(latent_dim, return_state=True, dtype='float32')  # Specify dtype
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_masked)
    encoder_states = [state_h, state_c]

    decoder_inputs = Input(shape=(max_seq_length - 1,))
    decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(decoder_inputs)
    decoder_masked = Masking(mask_value=0)(decoder_embedding)
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dtype='float32')  # Specify dtype
    decoder_outputs, _, _ = decoder_lstm(decoder_masked, initial_state=encoder_states)

    decoder_dense = Dense(vocab_size, activation='softmax')
    output = decoder_dense(decoder_outputs)

    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Create the model
model = create_model(vocab_size, embedding_dim, latent_dim)

# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    history = model.fit(
        [train_input, train_output], train_output,
        validation_data=([test_input, test_output], test_output),
        batch_size=batch_size,
        epochs=1
    )

    # Evaluate the model on the test set
    test_loss, test_accuracy = model.evaluate([test_input, test_output], test_output)

    # Calculate perplexity
    perplexity = math.exp(test_loss)

    # Calculate BLEU score and recall
    reference = ["both are excellent technology they are helpful in many ways for the security purpose both are super"]
    prediction = ["are you a fan of google or microsoft"]
    bleu_score = sentence_bleu([reference], prediction)

    def calculate_recall(reference, prediction):
        reference_words = set(reference.split())
        prediction_words = set(prediction.split())

        common_words = reference_words.intersection(prediction_words)

        recall = len(common_words) / len(reference_words) if len(reference_words) > 0 else 0.0

        return recall

    recall = calculate_recall(reference[0], prediction[0])

    # Print metrics after each epoch
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Perplexity: {perplexity:.4f}")
    print(f"BLEU Score: {bleu_score:.4f}")
    print(f"Recall: {recall:.4f}")

# Define the encoder model
encoder_inputs = Input(shape=(max_seq_length - 1,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_masked = Masking(mask_value=0)(encoder_embedding)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_masked)
encoder_states = [state_h, state_c]
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Define the decoder model
decoder_input_h = Input(shape=(latent_dim,))
decoder_input_c = Input(shape=(latent_dim,))
decoder_input_sequence = Input(shape=(1,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(decoder_input_sequence)
decoder_lstm_output, decoder_output_h, decoder_output_c = LSTM(latent_dim, return_sequences=True, return_state=True)(
    decoder_embedding, initial_state=[decoder_input_h, decoder_input_c]
)
decoder_output_sequence = Dense(vocab_size, activation='softmax')(decoder_lstm_output)
decoder_model = Model(
    inputs=[decoder_input_sequence, decoder_input_h, decoder_input_c],
    outputs=[decoder_output_sequence, decoder_output_h, decoder_output_c]
)

# Save the decoder weights
decoder_model.save_weights('decoder_weights.h5')

# Save the encoder weights
encoder_model.save_weights('encoder_weights.h5')


# Optionally, save the trained model
model.save('/content/drive/MyDrive/chatbot_model.h5')

Epoch 1/5
Test Loss: 0.4443
Test Accuracy: 0.9449
Perplexity: 1.5593
BLEU Score: 0.0000
Recall: 0.0714
Epoch 2/5
Test Loss: 0.1524
Test Accuracy: 0.9843
Perplexity: 1.1647
BLEU Score: 0.0000
Recall: 0.0714
Epoch 3/5
Test Loss: 0.0727
Test Accuracy: 0.9931
Perplexity: 1.0754
BLEU Score: 0.0000
Recall: 0.0714
Epoch 4/5

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load your tokenizer (assuming you have it saved)
tokenizer = Tokenizer()

# Define max sequence length
max_seq_length = 187  # Define the same max_seq_length as used during training

# Function to preprocess input questions
def preprocess_input(question):
    question = question.lower()  # Convert to lowercase (you may need more preprocessing)
    question = tokenizer.texts_to_sequences([question])
    question = pad_sequences(question, maxlen=max_seq_length - 1, padding='post')
    return question


In [None]:
# Load the encoder model
encoder_inputs = tf.keras.layers.Input(shape=(max_seq_length - 1,))
encoder_embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_masked = tf.keras.layers.Masking(mask_value=0)(encoder_embedding)
encoder_lstm = tf.keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_masked)
encoder_states = [state_h, state_c]
encoder_model = tf.keras.models.Model(inputs=encoder_inputs, outputs=encoder_states)
encoder_model.load_weights('encoder_weights.h5')

# Load the decoder model
decoder_input_h = tf.keras.layers.Input(shape=(latent_dim,))
decoder_input_c = tf.keras.layers.Input(shape=(latent_dim,))
decoder_input_sequence = tf.keras.layers.Input(shape=(1,))
decoder_embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(decoder_input_sequence)
decoder_lstm_output, decoder_output_h, decoder_output_c = tf.keras.layers.LSTM(
    latent_dim, return_sequences=True, return_state=True
)(decoder_embedding, initial_state=[decoder_input_h, decoder_input_c])
decoder_output_sequence = tf.keras.layers.Dense(vocab_size, activation='softmax')(decoder_lstm_output)
decoder_model = tf.keras.models.Model(
    inputs=[decoder_input_sequence, decoder_input_h, decoder_input_c],
    outputs=[decoder_output_sequence, decoder_output_h, decoder_output_c]
)
decoder_model.load_weights('decoder_weights.h5')


In [None]:
def generate_response(input_question):
    input_question = preprocess_input(input_question)

    # Encode the input question
    encoder_states = encoder_model.predict(input_question)

    # Initialize the decoder input sequence with a start token (assuming 1 is the start token)
    target_seq = np.array([1])

    stop_condition = False
    response = ''

    while not stop_condition:
        # Predict the next word in the sequence
        output_tokens, h, c = decoder_model.predict([target_seq] + encoder_states)

        # Sample a word index from the output distribution
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Convert the index to a word
        sampled_word = tokenizer.index_word.get(sampled_token_index, '')

        # Break the loop if the sequence is too long or if the end token is generated
        if sampled_word == '' or len(response.split()) >= max_seq_length - 1:
            stop_condition = True
        else:
            response += sampled_word + ' '

        # Update the target sequence
        target_seq = np.array([sampled_token_index])

        # Update states for the next iteration
        encoder_states = [h, c]

    return response.strip()


In [None]:
user_question = "What is the weather today?"
response = generate_response(user_question)
print(response)


In [None]:
def postprocess_response(generated_response):
    # Convert the numerical sequence back to text using the vocabulary
    generated_text = ' '.join([tokenizer.index_word[index] for index in generated_response])

    # Perform postprocessing, such as capitalization and joining words
    generated_text = generated_text.capitalize()  # Capitalize the first letter
    generated_text = generated_text.replace(" i ", " I ")  # Capitalize the word "i"

    return generated_text

# Generate and postprocess a response
question = "How does this chatbot work?"
response = generate_response(question)
postprocessed_response = postprocess_response(response)

print("Question:", question)
print("Generated Response:", response)
print("Postprocessed Response:", postprocessed_response)

TypeError: ignored

In [None]:
def chatbot_interaction():
    print("Chatbot: Hi! How can I help you? (Type 'exit' to end the conversation)")

    while True:
        user_input = input("You: ")

        if user_input.lower() == 'exit':
            print("Chatbot: Goodbye!")
            break

        response = generate_response(user_input)
        postprocessed_response = postprocess_response(response)

        print("Chatbot:", postprocessed_response)

# Start the chatbot interaction
chatbot_interaction()

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def calculate_bleu_score(reference_corpus, generated_corpus):
    bleu_score = corpus_bleu(reference_corpus, generated_corpus)
    return bleu_score

# Example reference and generated responses for evaluation
reference_responses = [['this', 'is', 'a', 'reference', 'response']]
generated_responses = [['this', 'is', 'a', 'generated', 'response']]

# Calculate BLEU score
bleu_score = calculate_bleu_score(reference_responses, generated_responses)
print(f"BLEU Score: {bleu_score:.2f}")