In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [None]:
# Step 1: Load and Preprocess Data
# Location of the conversation CSV; kept in one named constant so the path
# is easy to spot and change.
DATA_PATH = '/content/drive/MyDrive/topical_chat.csv'

# Read the whole file into a DataFrame and preview the first rows.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,conversation_id,message,sentiment
0,1,Are you a fan of Google or Microsoft?,Curious to dive deeper
1,1,Both are excellent technology they are helpfu...,Curious to dive deeper
2,1,"I'm not a huge fan of Google, but I use it a...",Curious to dive deeper
3,1,Google provides online related services and p...,Curious to dive deeper
4,1,"Yeah, their services are good. I'm just not a...",Curious to dive deeper


In [None]:
# Clean and preprocess the text data
import string

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize a message for tokenization.

    Lower-cases the text, removes every character in ``string.punctuation``
    and collapses runs of whitespace into single spaces (also trimming the
    ends).

    Args:
        text: The raw message. ``None`` and float NaN (how pandas represents
            a missing CSV cell) are treated as an empty message; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned message as a ``str`` (possibly empty).
    """
    # Missing values would otherwise crash on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower().translate(_PUNCTUATION_TABLE)
    # split() with no argument drops leading/trailing and repeated whitespace.
    return ' '.join(text.split())

# Replace every raw message with its normalized form, element by element.
data['message'] = data['message'].map(preprocess_text)

In [None]:
# Create input-output pairs based on conversation_id
# For each conversation the first message becomes the input and the second
# message becomes the expected output.
input_messages = []
output_messages = []

# A single groupby pass replaces re-filtering the whole frame once per
# conversation id; sort=False keeps conversations in order of first appearance.
for conv_id, conv_data in data.groupby('conversation_id', sort=False):
    # A conversation with fewer than two messages has no reply to learn from
    # (indexing its second row would raise IndexError), so skip it.
    if len(conv_data) < 2:
        continue
    input_msg, output_msg = conv_data['message'].iloc[:2].tolist()
    input_messages.append(input_msg)
    output_messages.append(output_msg)

In [None]:
# Step 2: Tokenize and Pad Sequences
# Build one shared vocabulary from both sides of every pair.
all_messages = input_messages + output_messages
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_messages)

# Encode each message as a list of integer word ids.
input_ids = tokenizer.texts_to_sequences(input_messages)
output_ids = tokenizer.texts_to_sequences(output_messages)

# The longest encoded message (input or output) sets the common length.
max_seq_length = max(map(len, input_ids + output_ids))

# Pad at the end of each sequence up to max_seq_length.
input_seq = pad_sequences(input_ids, maxlen=max_seq_length, padding='post')
output_seq = np.array(
    pad_sequences(output_ids, maxlen=max_seq_length, padding='post')
)

# After padding, input_seq and output_seq each have shape (num_samples, max_seq_length),
# where num_samples is the number of input/output message pairs.


In [None]:
# Step 3: Build the Chatbot Model
# +1 leaves room for index 0, the value pad_sequences fills padding with.
vocab_size = len(tokenizer.word_index) + 1
EMBEDDING_DIM = 128
LSTM_UNITS = 128

# Padded id sequence -> embeddings -> LSTM -> softmax over the vocabulary.
# The LSTM is used without return_sequences, so the model produces a single
# vocabulary distribution per input sequence.
# NOTE(review): padding zeros are not masked here; consider whether
# Embedding(mask_zero=True) is wanted.
input_layer = Input(shape=(max_seq_length,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM)(input_layer)
lstm_layer = LSTM(LSTM_UNITS)(embedding_layer)
output_layer = Dense(vocab_size, activation='softmax')(lstm_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
import numpy as np

# Train the model.
#
# The model ends in Dense(vocab_size, softmax) applied to the LSTM's single
# output vector, so it predicts ONE vocabulary distribution per sample, and
# it is compiled with sparse_categorical_crossentropy, which takes integer
# class ids as targets. A one-hot tensor of shape
# (num_samples, max_seq_length, vocab_size) matches neither the output shape
# nor the loss, and needs num_samples * max_seq_length * vocab_size floats.
# Use one integer label per sample instead: the id of the first reply token.
output_seq = np.asarray(output_seq)

# Shape (num_samples,); a reply with no tokens contributes the padding id 0.
first_reply_token = output_seq[:, 0]

model.fit(input_seq, first_reply_token, epochs=5, batch_size=64)


In [None]:
# Step 5: Create a Chatbot Interface (simplified example)
# Typing one of these (case-insensitive) ends the session.
EXIT_COMMANDS = {'quit', 'exit'}

while True:
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel also ends the chat cleanly.
        break
    if user_input.strip().lower() in EXIT_COMMANDS:
        break

    # Apply the same normalization the training messages received so that
    # words map to the same token ids (e.g. apostrophes are stripped).
    cleaned_input = preprocess_text(user_input)
    user_input_seq = tokenizer.texts_to_sequences([cleaned_input])
    user_input_seq = pad_sequences(user_input_seq, maxlen=max_seq_length, padding='post')

    # verbose=0 keeps Keras' progress bar out of the conversation.
    response_seq = model.predict(user_input_seq, verbose=0)

    # Most likely token id(s) for the single input, as one list of plain
    # ints; this works for one prediction per input and for one per timestep.
    predicted_ids = response_seq.argmax(axis=-1).reshape(1, -1).tolist()
    response = "Bot: " + tokenizer.sequences_to_texts(predicted_ids)[0]

    print(response)

Bot: treatment
Bot: treatment
Bot: treatment
