Data preprocessing

In [1]:
#Libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import pickle

from src.utils.process_text import clean_text
from src.transformers.tokenizer import tokenize

2023-09-26 22:21:36.131717: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#Load the data
# Each line of dialogs.txt is expected to be "<question>\t<answer>\n".

#data
questions  =[]
answers = []
# Pin the encoding so parsing does not depend on the platform's default locale.
with open("../../../../../data/raw/chatbot/dialogs.txt",'r', encoding="utf-8") as f :
    for line in f :
        # A blank line (e.g. a trailing newline at end of file) has no tab-separated
        # answer and would otherwise raise IndexError below.
        if not line.strip():
            continue
        fields = line.split('\t')
        questions.append(fields[0])
        # The answer keeps its trailing "\n" here; it is removed in a later cell.
        answers.append(fields[1])

In [3]:
# Inspect the parsed questions (a bare expression displays the whole list).
questions

['hi, how are you doing?',
 "i'm fine. how about yourself?",
 "i'm pretty good. thanks for asking.",
 'no problem. so how have you been?',
 "i've been great. what about you?",
 "i've been good. i'm in school right now.",
 'what school do you go to?',
 'i go to pcc.',
 'do you like it there?',
 "it's okay. it's a really big campus.",
 'good luck with school.',
 "how's it going?",
 "i'm doing well. how about you?",
 'never better, thanks.',
 'so how have you been lately?',
 "i've actually been pretty good. you?",
 "i'm actually in school right now.",
 'which school do you attend?',
 "i'm attending pcc right now.",
 'are you enjoying it there?',
 "it's not bad. there are a lot of people there.",
 'good luck with that.',
 'how are you doing today?',
 "i'm doing great. what about you?",
 "i'm absolutely lovely, thank you.",
 "everything's been good with you?",
 "i haven't been better. how about yourself?",
 'i started school recently.',
 'where are you going to school?',
 "i'm going to pcc.

In [4]:
# Inspect the parsed answers; at this point each one still ends with the "\n" read from the file.
answers

["i'm fine. how about yourself?\n",
 "i'm pretty good. thanks for asking.\n",
 'no problem. so how have you been?\n',
 "i've been great. what about you?\n",
 "i've been good. i'm in school right now.\n",
 'what school do you go to?\n',
 'i go to pcc.\n',
 'do you like it there?\n',
 "it's okay. it's a really big campus.\n",
 'good luck with school.\n',
 'thank you very much.\n',
 "i'm doing well. how about you?\n",
 'never better, thanks.\n',
 'so how have you been lately?\n',
 "i've actually been pretty good. you?\n",
 "i'm actually in school right now.\n",
 'which school do you attend?\n',
 "i'm attending pcc right now.\n",
 'are you enjoying it there?\n',
 "it's not bad. there are a lot of people there.\n",
 'good luck with that.\n',
 'thanks.\n',
 "i'm doing great. what about you?\n",
 "i'm absolutely lovely, thank you.\n",
 "everything's been good with you?\n",
 "i haven't been better. how about yourself?\n",
 'i started school recently.\n',
 'where are you going to school?\n',
 "

In [5]:
# Drop every newline character from the answers (they were read with the line terminator attached).
answers = [answer.replace("\n", "") for answer in answers]

In [6]:
# Pair each question with its answer in a two-column frame, then preview the first rows.
data = pd.DataFrame(dict(question=questions, answer=answers))
data.head()

Unnamed: 0,question,answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [7]:
# Spot-check a single question/answer pair by row label.
for column in ('question', 'answer'):
    print(data[column][200])

i hope you feel better.
thank you.


In [8]:
# Run the project's clean_text helper over both text columns, overwriting them in place.
for column in ("question", "answer"):
    data[column] = data[column].apply(clean_text)

In [9]:
# Build one subword tokenizer per column (questions first, then answers),
# each targeting a vocabulary of 2**13 entries.
tokenizer_q, tokenizer_a = (
    tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        data[column], target_vocab_size=2**13
    )
    for column in ('question', 'answer')
)

In [10]:
#Define vocabulary size
# Two extra ids are reserved past each tokenizer's own vocabulary: the encoding
# cell prepends id VOCAB_SIZE-2 and appends id VOCAB_SIZE-1 to every sentence.
VOCAB_SIZE_Q = 2 + tokenizer_q.vocab_size
VOCAB_SIZE_A = 2 + tokenizer_a.vocab_size

In [11]:
# Encode every sentence and wrap it with the two reserved ids:
# VOCAB_SIZE-2 in front, VOCAB_SIZE-1 at the end.
input_data = [
    [VOCAB_SIZE_Q - 2, *tokenizer_q.encode(question), VOCAB_SIZE_Q - 1]
    for question in data['question']
]
target_data = [
    [VOCAB_SIZE_A - 2, *tokenizer_a.encode(answer), VOCAB_SIZE_A - 1]
    for answer in data['answer']
]

In [13]:
# Pool questions and answers so both sides are measured against one common length.
# NOTE: run this before the padding cell; once input_data/target_data have been
# rebound to padded arrays, `+` no longer concatenates two Python lists.
all_encoded_sequences = input_data + target_data

# Length of the longest encoded sequence (reserved start/end ids included).
MAX_LENGTH = max(map(len, all_encoded_sequences))

In [14]:
# Show the longest encoded sequence length; it is used as the padding length below.
MAX_LENGTH

22

In [15]:
#Padding
input_data = tf.keras.preprocessing.sequence.pad_sequences(input_data,
                                                       value=0,
                                                       padding='post',
                                                       maxlen=MAX_LENGTH)
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=MAX_LENGTH)

In [16]:
#Shuffle data
BATCH_SIZE = 64
# Shuffle buffer as large as the dataset, i.e. every example is in the buffer at once.
BUFFER_SIZE = len(input_data)

# (question, answer) pairs -> cache -> shuffle -> batch -> prefetch
dataset = (
    tf.data.Dataset.from_tensor_slices((input_data, target_data))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [17]:
# Display the dataset's element spec as a sanity check of the batch shapes and dtypes.
dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 22), dtype=tf.int32, name=None), TensorSpec(shape=(None, 22), dtype=tf.int32, name=None))>

In [19]:
# Persist the batched dataset in tf.data's native save format.
# NOTE(review): despite the ".tfrecord" suffix, the tf.data save API is documented to
# write a directory of files, not a single TFRecord file; reload it with
# tf.data.Dataset.load(SAVE_PATH). The path is left unchanged for existing consumers.
SAVE_PATH = "../../../../../data/processed/chatbot/english/dataset.tfrecord"
# Per-batch shape: open batch dimension, sequence axis padded to MAX_LENGTH.
# Derived from MAX_LENGTH so it cannot drift from the padding length if the corpus changes.
SHAPE = (None, MAX_LENGTH)

# Dataset.save replaces the deprecated tf.data.experimental.save (available from TF 2.10).
dataset.save(SAVE_PATH)

In [22]:
#Save tokenizers
# Pickle each tokenizer to its own file: questions first, then answers.
for pickle_path, encoder in (
    ('../../../../../exports/chatbot/english/tokenizer_q.pkl', tokenizer_q),
    ('../../../../../exports/chatbot/english/tokenizer_a.pkl', tokenizer_a),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(encoder, f)