In [1]:
import pandas as pd
import re

In [2]:
# Read the txt file
with open("human_chat.txt", "r", encoding = "utf-8") as file:
    lines = file.readlines()

# Initialize lists to store human 1 and human 2 messages
human_1 = []
human_2 = []

# Iterate through lines and separate dialogues
for line in lines:
    line = line.strip()
    if line.startswith("Human 1:"):
        human_1.append(line.replace("Human 1:", "").strip())
        human_2.append("")
    elif line.startswith("Human 2:"):
        human_2[-1] = line.replace("Human 2:", "").strip()
# Create a DataFrame
data = pd.DataFrame({"Human_1": human_1, "Human_2": human_2})
# Save to csv
data.to_csv("human_chat.csv", index = False)

print("CSV file saved successfully..!")

CSV file saved successfully..!


In [3]:
def wrangle(filepath):
    data = pd.read_csv(filepath)

    data.drop_duplicates(inplace = True)

    # Fill missing responses with "No response"
    data.fillna("No response", inplace = True)
    # Drop rows where both columns are empty
    data = data[~((data["Human_1"] == "") & (data["Human_2"] == ""))]

    return data

In [4]:
data = wrangle('human_chat.csv')
data.head(10)

Unnamed: 0,Human_1,Human_2
0,Hi!,What is your favorite holiday?
1,one where I get to meet lots of different people.,What was the most number of people you have ev...
2,Hard to keep a count. Maybe 25.,Which holiday was that?
3,I think it was Australia,Do you still talk to the people you met?
4,Not really. The interactions are usually short...,"Yea, me too. I feel like God often puts strang..."
5,what do you mean?,"I think it's like a 6th sense, often seen as ""..."
6,"Wow! That's interesting, borderline spooky","There's this practice called ""Treasure Hunting..."
7,"So, do you do treasure hunting often?",I did more when I was in grad school (and had ...
8,Hi,Any plans for the weekend?
9,my friends are gonna visit me this weekend. we...,That's great! How's the weather over the weeke...


In [5]:
print(data.shape)
print(data.duplicated().sum())
print(data.isnull().sum())

(747, 2)
0
Human_1    0
Human_2    0
dtype: int64


In [6]:
shortcut_map = {
    "i'm": "i am", "we've": "we have",
    "i've": "i have", "didn't": "did not",
    "it's": "it is", "shouldn't": "should not",
    "weren't": "were not", "wouldn't": "would not",
    "that's": "that is", "he's": "he is",
    "there's": "there is", "she's": "she is",
    "how's": "how is", "couldn't": "could not",
    "what's": "what is", "doesn't": "does not",
    "it'd": "it would", "it'll": "it will",
    "let's": "let us", "can't": "can not",
    "i'd": "i would", "they're": "they are",
    "don't": "do not", "you're": "you are",
    "won't": "will not", "haven't": "have not",
    "isn't": "is not", "i'll": "i will",
    "wasn't": "was not",
}

In [7]:
# Function to replace shortcuts in a text
def replace_shortcuts(text, shortcut_map):
    # Create a regex pattern to match keys in the dictionary
    pattern = re.compile(r'\b(' + '|'.join(map(re.escape, shortcut_map.keys())) + r')\b', flags = re.IGNORECASE)

    # Replace using the dictionary mapping
    def replace_match(match):
        word = match.group(0).lower()
        return shortcut_map.get(word, word)
    
    return pattern.sub(replace_match, text)
# Test the function
text = "I'm happy that you're here. It's a great day!"
new_text = replace_shortcuts(text, shortcut_map)
print(new_text)

i am happy that you are here. it is a great day!


In [8]:
# Apply the shortcut map to the DataFrame
# data["Human 1"] = data["Human 1"].apply(lambda x: replace_shortcuts(x, shortcut_map))
# data["Human 2"] = data["Human 2"].apply(lambda x: replace_shortcuts(x, shortcut_map))

In [9]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    
    text = text.lower()
    text = replace_shortcuts(text, shortcut_map)
    text = re.sub(r"[^a-z0-9\s.,!?]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Apply cleaning to DataFrame Columns
data["Human_1"] = data["Human_1"].apply(clean_text)
data["Human_2"] = data["Human_2"].apply(clean_text)

In [10]:
data.head(10)

Unnamed: 0,Human_1,Human_2
0,hi!,what is your favorite holiday?
1,one where i get to meet lots of different people.,what was the most number of people you have ev...
2,hard to keep a count. maybe 25.,which holiday was that?
3,i think it was australia,do you still talk to the people you met?
4,not really. the interactions are usually short...,"yea, me too. i feel like god often puts strang..."
5,what do you mean?,"i think it is like a 6th sense, often seen as ..."
6,"wow! that is interesting, borderline spooky",there is this practice called treasure hunting...
7,"so, do you do treasure hunting often?",i did more when i was in grad school and had m...
8,hi,any plans for the weekend?
9,my friends are gonna visit me this weekend. we...,that is great! how is the weather over the wee...


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize vectorizer
vectorizer = CountVectorizer()

# Transform both columns separately
X_human1 = vectorizer.fit_transform(data["Human_1"])
X_human2 = vectorizer.transform(data["Human_2"])

# Convert to DataFrame
bow_human1_data = pd.DataFrame(X_human1.toarray(), columns = vectorizer.get_feature_names_out())
bow_human2_data = pd.DataFrame(X_human2.toarray(), columns = vectorizer.get_feature_names_out())

print(bow_human1_data.head())
print(bow_human2_data.head())

   10  11  114  12  20  2019  23  23rd  24  25  ...  yours  yourself  youtube  \
0   0   0    0   0   0     0   0     0   0   0  ...      0         0        0   
1   0   0    0   0   0     0   0     0   0   0  ...      0         0        0   
2   0   0    0   0   0     0   0     0   0   1  ...      0         0        0   
3   0   0    0   0   0     0   0     0   0   0  ...      0         0        0   
4   0   0    0   0   0     0   0     0   0   0  ...      0         0        0   

   yrs  yum  yup  zero  zipline  ziplining  zucchini  
0    0    0    0     0        0          0         0  
1    0    0    0     0        0          0         0  
2    0    0    0     0        0          0         0  
3    0    0    0     0        0          0         0  
4    0    0    0     0        0          0         0  

[5 rows x 1800 columns]
   10  11  114  12  20  2019  23  23rd  24  25  ...  yours  yourself  youtube  \
0   0   0    0   0   0     0   0     0   0   0  ...      0         0        0

In [12]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential

# Define constants
VOCAB_SIZE = 10000
EMBEDDING_DIM = 256
SEQUENCE_LENGTH = 20

# Define a simple Seq2Seq model
model = Sequential([
    Embedding(input_dim = VOCAB_SIZE, output_dim = EMBEDDING_DIM, input_length = SEQUENCE_LENGTH),
    LSTM(512, return_sequences = True),
    LSTM(512),
    Dense(VOCAB_SIZE, activation = "softmax")
])

model.build(input_shape = (None, SEQUENCE_LENGTH))

model.compile(loss = "categorical_crossentropy", optimizer = "adam")
model.summary()

2025-01-30 02:33:41.705003: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# # Example dataset
# Human_2 = ["Any plans for the weekend?", "Right, very anxious! where do you plan to go for a hike?"]
# Human_1 = ["my friends are gonna visit me this weekend. we might go hiking!", "I am going to Diablo!"]

# Initialize Tokenizer
VOCAB_SIZE = 10000
tokenizer = Tokenizer(num_words = VOCAB_SIZE)
tokenizer.fit_on_texts(data['Human_1'] + data['Human_2'])

# Convert Text to Sequences
X_train = tokenizer.texts_to_sequences(data['Human_1'])
y_train = tokenizer.texts_to_sequences(data['Human_2'])

# Padding
SEQUENCE_LENGTH = 20
X_train = pad_sequences(X_train, maxlen = SEQUENCE_LENGTH, padding = "post")
y_train = pad_sequences(y_train, maxlen = SEQUENCE_LENGTH, padding = "post")

print("Example Encoded Input:", X_train[0])


Example Encoded Input: [31  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]


In [20]:
def debug_tokenization(input_text):
    # Check tokenization step
    input_seq = tokenizer.texts_to_sequences([input_text])
    print("Tokenized Input:", input_seq)

    # Ensure padding works correctly
    input_seq_padded = pad_sequences(input_seq, maxlen = SEQUENCE_LENGTH, padding = 'post')
    print('Padded Input:', input_seq_padded)

    return input_seq_padded

debug_tokenization("Hello, how are you?")

Tokenized Input: [[149, 20, 18, 2]]
Padded Input: [[149  20  18   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]]


array([[149,  20,  18,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]], dtype=int32)

In [14]:
from tensorflow.keras.utils import to_categorical
import numpy as np

# Convert y_train to NumPy
y_train = np.array(y_train)
# Fix the shape
y_train = y_train[:, -1]

# Convert to one-hot encoding
y_train = to_categorical(y_train, num_classes = VOCAB_SIZE)
print(y_train.shape)

(747, 10000)


In [15]:
# Train the model
model.fit(X_train, y_train, epochs = 50, batch_size = 32, verbose = 1)

Epoch 1/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 426ms/step - loss: 5.9369
Epoch 2/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 363ms/step - loss: 2.1241
Epoch 3/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 347ms/step - loss: 1.9205
Epoch 4/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 314ms/step - loss: 1.7696
Epoch 5/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 311ms/step - loss: 1.6795
Epoch 6/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 345ms/step - loss: 1.8283
Epoch 7/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 329ms/step - loss: 1.6193
Epoch 8/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 324ms/step - loss: 1.7060
Epoch 9/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 374ms/step - loss: 1.6254
Epoch 10/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 328ms/step - l

<keras.src.callbacks.history.History at 0x7f7a3373ffd0>

In [21]:
def debug_model_output(input_seq_padded):
    # Get prediction
    predicted_probs = model.predict(input_seq_padded)
    print("Predicted Probabilities:", predicted_probs)

    return predicted_probs

input_seq_padded = debug_tokenization("Hello, how are you?")
debug_model_output(input_seq_padded)

Tokenized Input: [[149, 20, 18, 2]]
Padded Input: [[149  20  18   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step
Predicted Probabilities: [[7.9757100e-01 9.3082164e-09 1.5308521e-02 ... 1.2180631e-08
  9.7248325e-09 1.0302393e-08]]


array([[7.9757100e-01, 9.3082164e-09, 1.5308521e-02, ..., 1.2180631e-08,
        9.7248325e-09, 1.0302393e-08]], dtype=float32)

In [23]:
def debug_index_to_word(predicted_index):
    word_index = tokenizer.word_index
    index_word = {v: k for k, v in word_index.items()}
    print("Index to Word Mapping:", index_word)

    predicted_word = index_word.get(predicted_index[0], "")
    print("Predicted Word:", predicted_word)

    return predicted_word

predicted_probs = debug_model_output(input_seq_padded)
predicted_index = np.argmax(predicted_probs, axis = 1)
debug_index_to_word(predicted_index)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
Predicted Probabilities: [[7.9757100e-01 9.3082164e-09 1.5308521e-02 ... 1.2180631e-08
  9.7248325e-09 1.0302393e-08]]
Index to Word Mapping: {1: 'i', 2: 'you', 3: 'to', 4: 'the', 5: 'a', 6: 'is', 7: 'it', 8: 'of', 9: 'do', 10: 'have', 11: 'that', 12: 'and', 13: 'not', 14: 'what', 15: 'in', 16: 'for', 17: 'am', 18: 'are', 19: 'like', 20: 'how', 21: 'but', 22: 'so', 23: 'good', 24: 'about', 25: 'my', 26: 'your', 27: 'be', 28: 'was', 29: 'with', 30: 'on', 31: 'hi', 32: 'think', 33: 'going', 34: 'just', 35: 'can', 36: 'up', 37: 'this', 38: 'there', 39: 'would', 40: 'time', 41: 'we', 42: 'at', 43: 'me', 44: 'will', 45: 'oh', 46: 'too', 47: 'some', 48: 'work', 49: 'no', 50: 'very', 51: 'did', 52: 'nice', 53: 'all', 54: 'really', 55: 'as', 56: 'been', 57: 'day', 58: 'lot', 59: 'haha', 60: 'great', 61: 'fun', 62: 'any', 63: 'yes', 64: 'they', 65: 'pretty', 66: 'out', 67: 'get', 68: 'if', 69: 'know', 70: 'well', 71: 'one',

''

In [17]:
import numpy as np

def generate_response(input_text):
    # Convert user input to sequence
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen = SEQUENCE_LENGTH, padding = "post")

    # Predict response
    predicted_seq = model.predict(input_seq)
    predicted_word_index = np.argmax(predicted_seq, axis = -1)

    # Convert numbers back to words
    word_index = tokenizer.word_index
    index_word = {v: k for k, v in word_index.items()}
    predicted_word = index_word.get(predicted_word_index[0], "")

    return predicted_word

# Test the Chatbot
user_input = "Hello, how are you?"
response = generate_response(user_input)
print("Chatbot Response:", response)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Chatbot Response: 


In [24]:
def generate_response(input_text):
    print("User Input", input_text)

    # Tokenization
    input_seq_padded = debug_tokenization(input_text)

    # Make Predictions
    predicted_probs = debug_model_output(input_seq_padded)

    # Extract word
    predicted_index = np.argmax(predicted_probs, axis = -1)
    print("Predicted Index:", predicted_index)

    predicted_word = debug_index_to_word(predicted_index)

    return predicted_word

# Test with a user input
response = generate_response("Hello, how are you?")
print("Chatbot Response:", response)

User Input Hello, how are you?
Tokenized Input: [[149, 20, 18, 2]]
Padded Input: [[149  20  18   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Predicted Probabilities: [[7.9757100e-01 9.3082164e-09 1.5308521e-02 ... 1.2180631e-08
  9.7248325e-09 1.0302393e-08]]
Predicted Index: [0]
Index to Word Mapping: {1: 'i', 2: 'you', 3: 'to', 4: 'the', 5: 'a', 6: 'is', 7: 'it', 8: 'of', 9: 'do', 10: 'have', 11: 'that', 12: 'and', 13: 'not', 14: 'what', 15: 'in', 16: 'for', 17: 'am', 18: 'are', 19: 'like', 20: 'how', 21: 'but', 22: 'so', 23: 'good', 24: 'about', 25: 'my', 26: 'your', 27: 'be', 28: 'was', 29: 'with', 30: 'on', 31: 'hi', 32: 'think', 33: 'going', 34: 'just', 35: 'can', 36: 'up', 37: 'this', 38: 'there', 39: 'would', 40: 'time', 41: 'we', 42: 'at', 43: 'me', 44: 'will', 45: 'oh', 46: 'too', 47: 'some', 48: 'work', 49: 'no', 50: 'very', 51: 'did', 52: 'nice', 53: 'all', 54: 'really', 55: 'a