In [None]:
# Import libraries

import os
import sys
import pandas as pd
import numpy as np
import nltk
from nltk.stem import  WordNetLemmatizer
import random
import pickle

# The lemmatization cell uses nltk.word_tokenize, nltk.pos_tag and WordNetLemmatizer,
# and each of them needs its own NLTK data package. Downloading only 'wordnet' makes a
# fresh environment fail with LookupError as soon as the text is tokenized.
# The '*_tab' / '*_eng' names are the packages used by newer NLTK releases, while the
# plain names serve older ones; nltk.download() just reports (without raising) when a
# name is unknown to the installed version, so requesting both is harmless.
for resource in (
    'wordnet',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

In [None]:
# Load the prepared conversations DataFrame

# Both data directories are resolved relative to the directory the notebook runs from.
DATA_INTERIM_PATH = os.path.join(os.getcwd(), '../data/interim')
DATA_PROCESSED_PATH = os.path.join(os.getcwd(), '../data/processed')

# DATA_INTERIM_PATH is already absolute, so the file name can be joined onto it directly.
preprocessed_csv_path = os.path.join(DATA_INTERIM_PATH, 'preprocessed_conversations.csv')

# Read the file and discard every row that has a missing value in any column.
conv_df = pd.read_csv(preprocessed_csv_path).dropna()

In [None]:
# Sanity check: after dropna() every column should report zero missing values
missing_per_column = conv_df.isna().sum()
print(missing_per_column)

# Preview the first few rows of the DataFrame
conv_df.head()

In [None]:
# Lemmatize the text in the 'text1' and 'text2' columns of the DataFrame

lemma = WordNetLemmatizer()

# nltk.pos_tag returns Penn Treebank tags, whereas WordNetLemmatizer expects WordNet POS
# codes. The first letter of the Penn tag identifies the word class; note that adjectives
# are tagged JJ/JJR/JJS, so they start with 'J' (not 'A') and must be mapped explicitly.
_PENN_TO_WORDNET_POS = {'J': 'a', 'N': 'n', 'V': 'v'}

def lemmatize_text(text):
    """Lemmatize the input text.

    The text is tokenized and POS-tagged with NLTK. Adjectives, nouns and verbs are
    replaced by their WordNet lemma; every other token is kept unchanged.

    Parameters
    ----------
    text : str
        Sentence (or longer text) to lemmatize.

    Returns
    -------
    str
        The resulting tokens joined by single spaces.

    Notes
    -----
    Requires the NLTK tokenizer, POS-tagger and WordNet data to be installed.
    """
    tokens = nltk.word_tokenize(text)
    lemmatized_tokens = []
    for word, tag in nltk.pos_tag(tokens):
        # tag[:1] (rather than tag[0]) stays safe even if a tag were an empty string.
        wntag = _PENN_TO_WORDNET_POS.get(tag[:1])
        if wntag:
            lemmatized_tokens.append(lemma.lemmatize(word, pos=wntag))
        else:
            lemmatized_tokens.append(word)
    return ' '.join(lemmatized_tokens)

# Replace both utterance columns with their lemmatized versions, one column at a time.
for text_column in ("text1", "text2"):
    conv_df[text_column] = conv_df[text_column].apply(lemmatize_text)

In [None]:
# Now we create a Vocabulary of the words in the conversations

# Indices reserved for the special tokens; real words are numbered from 3 upwards.
PAD_token = 0
SOS_token = 1
EOS_token = 2

class Vocabulary:
    """Bidirectional word <-> index mapping with per-word occurrence counts.

    Attributes are plain dictionaries:
      * word2index: word -> integer index
      * word2count: word -> number of times the word was added
      * index2word: integer index -> word (also holds the PAD/SOS/EOS entries)
      * num_words:  total number of indices in use, special tokens included
      * trimmed:    True once trim() has been applied
    """

    def __init__(self):
        self.trimmed = False
        self.reset_vocab()

    def reset_vocab(self):
        """Drop every word and restore the three special tokens (PAD, SOS, EOS)."""
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3

    def add_word(self, word):
        """Register one occurrence of `word`, assigning the next free index if it is new."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def add_sentence(self, sentence):
        """Register every whitespace-separated word of `sentence`.

        str.split() without an argument is used so that empty sentences and runs of
        consecutive spaces never register the empty string '' as a word.
        """
        for word in sentence.split():
            self.add_word(word)

    def trim(self, min_count=1):
        """Keep only the words seen at least `min_count` times and re-index them.

        Only the first call has an effect; once `self.trimmed` is set, later calls
        return immediately. The surviving words keep their real occurrence counts.
        """
        if self.trimmed:
            return
        self.trimmed = True

        kept_counts = {
            word: count
            for word, count in self.word2count.items()
            if count >= min_count
        }

        self.reset_vocab()
        for word, count in kept_counts.items():
            self.add_word(word)
            # add_word() restarts the count at 1; restore the true frequency.
            self.word2count[word] = count

    def __len__(self):
        """Number of indices in use, including the three special tokens."""
        return self.num_words

    def __getitem__(self, item):
        """Look up an index by word (str) or a word by index (int); None when absent.

        Raises
        ------
        TypeError
            If `item` is neither a str nor an int.
        """
        if isinstance(item, str):
            return self.word2index.get(item, None)
        elif isinstance(item, int):
            return self.index2word.get(item, None)
        else:
            raise TypeError("Item must be either a string or an integer.")

    def __contains__(self, item):
        """Membership test for a word (str) or an index (int).

        Raises
        ------
        TypeError
            If `item` is neither a str nor an int.
        """
        if isinstance(item, str):
            return item in self.word2index
        elif isinstance(item, int):
            return item in self.index2word
        else:
            raise TypeError("Item must be either a string or an integer.")

vocab = Vocabulary()
print("Vocabulary initialized with {} words.".format(len(vocab)))
print("Vocabulary contains the following special tokens:", vocab[0], vocab[1], vocab[2])

In [None]:
# Add the text from the DataFrame to the vocabulary

def add_text_to_vocab(text):
    """
    Registers every word of the given text in the module-level vocabulary.
    """
    vocab.add_sentence(text)

# Every 'text1' entry is processed before any 'text2' entry, so word indices are
# handed out in that order.
for text_column in ("text1", "text2"):
    for utterance in conv_df[text_column]:
        add_text_to_vocab(utterance)

vocab_size = len(vocab)
print("Vocabulary size after adding text:", vocab_size)

# Spot-check: look up one randomly chosen index (special tokens included).
idx = random.randint(0, vocab_size - 1)
print("Random word in vocabulary:", vocab[idx])

In [None]:
# Trim the vocabulary to only include words that appear at least MIN_COUNT times

MIN_COUNT = 3
def trim_vocab(vocab, min_count=MIN_COUNT, df=None):
    """
    Trims the vocabulary and returns a DataFrame restricted to the surviving words.

    Parameters
    ----------
    vocab : vocabulary object
        Must provide trim(min_count), len() and `word in vocab`. It is trimmed in place.
    min_count : int
        Minimum number of occurrences a word needs in order to be kept.
    df : pandas.DataFrame, optional
        Frame with 'text1' and 'text2' string columns. When omitted, the notebook-level
        `conv_df` is used. The input frame is never modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy in which words missing from the trimmed vocabulary have
        been removed from 'text1' and 'text2', and rows where either text ended up
        empty have been dropped.
    """
    source_df = conv_df if df is None else df

    vocab.trim(min_count)
    print("Vocabulary trimmed to {} words.".format(len(vocab)))

    # Now remove the words that are not in the trimmed vocabulary
    def filter_text(text):
        return ' '.join(word for word in text.split() if word in vocab)

    filtered_df = source_df.copy()
    filtered_df["text1"] = filtered_df["text1"].apply(filter_text)
    filtered_df["text2"] = filtered_df["text2"].apply(filter_text)

    both_non_empty = (filtered_df["text1"].str.strip() != "") & (filtered_df["text2"].str.strip() != "")

    # .copy() detaches the result from `filtered_df`, so columns can be added to it
    # later without triggering pandas' SettingWithCopyWarning.
    return filtered_df[both_non_empty].copy()

# Trim the vocabulary (in place) and build the filtered DataFrame.
# trim_vocab works on a copy, so conv_df itself is left unchanged.
conv_df_trimmed = trim_vocab(vocab, MIN_COUNT)

# Vocabulary size after trimming: 23570
# NOTE(review): presumably recorded from an earlier run; re-check after re-running.

In [None]:
# Convert the text in the DataFrame to vectors of indices based on the vocabulary

def vectorize_text(text, vocab):
    """
    Maps each whitespace-separated word of `text` to its vocabulary index.

    Words that are not in `vocab` are skipped, so the result can be shorter than
    the number of words in the text.
    """
    indices = []
    for token in text.split():
        if token in vocab:
            indices.append(vocab[token])
    return indices

# Source text column -> name of the column that receives its list of word indices.
vectorized_column_names = {"text1": "text1_vectorized", "text2": "text2_vectorized"}

for source_column, target_column in vectorized_column_names.items():
    conv_df_trimmed[target_column] = conv_df_trimmed[source_column].apply(vectorize_text, args=(vocab,))

# Keep only the index sequences; .copy() makes the new frame independent of conv_df_trimmed.
conv_df_vectorized = conv_df_trimmed[list(vectorized_column_names.values())].copy()

In [None]:
# Pad sequences to a fixed length (for training purposes)

def pad_sequence(sequence, max_length, pad_value=PAD_token):
    """
    Returns a new list: `sequence` followed by enough `pad_value` entries to reach
    `max_length`. A sequence that is already that long (or longer) comes back
    with nothing appended; it is not truncated.
    """
    missing = max_length - len(sequence)
    padding = [pad_value] * missing  # empty when missing <= 0
    return sequence + padding

# The padding target is the longest sequence found in either column.
longest_per_column = [
    conv_df_vectorized[column].apply(len).max()
    for column in ("text1_vectorized", "text2_vectorized")
]
max_length = max(longest_per_column)

print("Maximum sequence length for padding:", max_length)

for column in ("text1_vectorized", "text2_vectorized"):
    conv_df_vectorized[column] = conv_df_vectorized[column].apply(pad_sequence, args=(max_length,))

In [None]:
# Rename the columns for clarity
conv_df_vectorized = conv_df_vectorized.rename(
    columns={"text1_vectorized": "seq1", "text2_vectorized": "seq2"}
)

# Before saving, each list of indices is turned into one comma-separated string,
# because a CSV cell cannot hold a Python list directly.
def _join_indices(sequence):
    return ','.join(str(index) for index in sequence)

for sequence_column in ("seq1", "seq2"):
    conv_df_vectorized[sequence_column] = conv_df_vectorized[sequence_column].apply(_join_indices)

conv_df_vectorized.head()

In [None]:
# Save the processed DataFrame and vocabulary to files

# to_csv() does not create missing directories, so make sure the target folder exists.
os.makedirs(DATA_PROCESSED_PATH, exist_ok=True)

# DATA_PROCESSED_PATH is already absolute; joining the file name onto it is sufficient.
conv_df_vectorized.to_csv(os.path.join(DATA_PROCESSED_PATH, 'conversations_vectorized.csv'), index=False)

# The vocabulary is written to the current working directory (not to DATA_PROCESSED_PATH).
# NOTE: pickle stores class instances by reference to their class, so whoever loads this
# file needs a compatible Vocabulary class importable under the same module name.
with open("vocab.pkl", "wb") as f:
    pickle.dump(vocab, f)