<a href="https://colab.research.google.com/github/Jiaweihu08/Chatbot/blob/master/dataset_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extracting utterances from datasets

These are the dialog datasets from which the training and evaluation utterances are extracted.

1. **Daily Dialogues**
2. **ConvAI**
3. **Empathetic Dialogues**
4. **Persona Chat**
5. **Cornell Movies Dataset**

Conversations are stored in different ways in different datasets, and a function is defined for each to extract the conversations in the form of **[l1, l2, l3, l4, ..., ln]**, where li represents a particular utterance.

The obtained conversations are then converted into utterance pairs for training and evaluation. For a given conversation [l1, l2, l3, ..., ln], we extract utterance pairs of the form **[l1, l2], [l2, l3], ..., [ln-1, ln]**. In each pair, the first line is called the **message** and is used as the model input, and the second line is called the **response** and is used as the correct model output.

The message and response pairs extracted from each dataset are combined at the end. The **tokenizer** is fit to all the unique utterances obtained (**set(messages + responses)**). **220,426** utterance pairs are obtained from the above datasets, **10,000** of which are used as the evaluation set.

Both training and evaluation sets are stored in separate txt files.

(CCPE and Holl-E are datasets that mainly focus on movie reviews, thus are excluded.)

In [None]:
import tensorflow as tf

import os
import time
import json
import re

from sklearn.model_selection import train_test_split

# Upper bound on whitespace-separated tokens per utterance; default `max_len`
# of the loader functions below. The loaders count tokens after
# preprocess_text has run, so the '<start>' and '<end>' markers are included.
MAX_LEN = 14
# NOTE(review): not referenced in the visible cells - presumably consumed by
# the training code; confirm before changing.
BUFFER_SIZE = 150000
VOCAB_SIZE = 13199 # Tokenizer `num_words` cap, chosen to eliminate words that appear fewer than 3 times
# NOTE(review): not referenced in the visible cells - presumably the training
# batch size; confirm.
BATCH_SIZE = 32

# Project root; the tokenizer JSON is written directly under it.
root_path = '/content/drive/MyDrive/Colab Notebooks/Chatbots/version-2'
# Folder holding the raw datasets and the generated train/test utterance files.
path_to_datasets = os.path.join(root_path, 'datasets')

In [None]:
# Human-readable split labels, zipped with each dataset's
# [train, test, validation] path list when printing loading progress.
file_names = ['training file', 'test file', 'validation file']

def preprocess_text(text):
    """Normalize a raw utterance and wrap it in sequence markers.

    Steps, in order:
      1. pad each of ``. , ? !`` with a space on both sides,
      2. replace every character outside ``a-zA-Z0-9:,.?!`` with a space
         (so apostrophes become spaces, e.g. "let's" -> "let s"),
      3. collapse whitespace runs, trim, and lowercase,
      4. surround the result with '<start> ' and ' <end>'.

    Args:
        text: raw utterance string.

    Returns:
        The normalized string. An empty input yields '<start>  <end>'.
    """
    spaced = re.sub(r'([.,?!])', r' \1 ', text)
    cleaned = re.sub(r"[^a-zA-Z0-9:,.?!]", ' ', spaced)
    normalized = re.sub(r"\s+", ' ', cleaned).strip().lower()
    return f'<start> {normalized} <end>'

In [None]:
# -------------------- Daily Dialogues --------------------
def load_process_and_filter_DD(path_to_file, max_len=MAX_LEN):
    """Extract (message, response) pairs from a Daily Dialogues text file.

    Each line of the file is treated as one dialogue whose turns are
    separated by '__eou__'; the piece after the final separator is dropped.
    Every turn goes through preprocess_text, and each pair of consecutive
    turns is kept when both have more than 2 and at most ``max_len``
    whitespace-separated tokens (the '<start>'/'<end>' markers count).

    Args:
        path_to_file: path to the dialogues text file.
        max_len: maximum number of tokens allowed per utterance.

    Returns:
        Two parallel lists ``(messages, responses)``.
    """
    with open(path_to_file, 'r') as handle:
        dialogues = handle.read().strip().split('\n')

    messages, responses = [], []
    for dialogue in dialogues:
        turns = [preprocess_text(turn) for turn in dialogue.strip().split('__eou__')[:-1]]
        # Pair every turn with the one that follows it.
        for m, r in zip(turns, turns[1:]):
            if not (m and r):
                continue
            if 2 < len(m.split()) <= max_len and 2 < len(r.split()) <= max_len:
                messages.append(m)
                responses.append(r)

    assert len(messages) == len(responses)
    print(f'- number of utterances: {len(messages)}\n')

    return messages, responses


# -------------------- ConvAI --------------------
def load_process_and_filter_CA(file_path, max_len=MAX_LEN):
    """Extract (message, response) pairs from a ConvAI JSON file.

    A dialog is used only when none of its ``users`` has ``userType == 'Bot'``
    and its ``thread`` holds more than one entry. The ``text`` of every thread
    entry goes through preprocess_text, and each pair of consecutive
    utterances is kept when both have more than 2 and at most ``max_len``
    whitespace-separated tokens (the '<start>'/'<end>' markers count).

    Args:
        file_path: path to the JSON file.
        max_len: maximum number of tokens allowed per utterance.

    Returns:
        Two parallel lists ``(messages, responses)``.
    """
    with open(file_path, 'r') as handle:
        dialogs = json.load(handle)

    conversations = []
    for dialog in dialogs:
        has_bot = any(user['userType'] == 'Bot' for user in dialog['users'])
        if has_bot or len(dialog['thread']) <= 1:
            continue
        conversations.append([preprocess_text(turn['text']) for turn in dialog['thread']])

    print(f'- number of conversations between human users: {len(conversations)}')

    messages = []
    responses = []
    for turns in conversations:
        # Pair every utterance with the one that follows it.
        for m, r in zip(turns, turns[1:]):
            if not (m and r):
                continue
            if 2 < len(m.split()) <= max_len and 2 < len(r.split()) <= max_len:
                messages.append(m)
                responses.append(r)

    assert len(messages) == len(responses)

    print(f'- number of utterances: {len(messages)}\n')
    return messages, responses


# -------------------- Empathetic Dialogues --------------------
def load_process_and_filter_ED(file_path, max_len=MAX_LEN):
    """Extract (message, response) pairs from an Empathetic Dialogues CSV file.

    The first line of the file is skipped as a header. Each remaining row is
    split on ',': column 1 is read as the integer utterance index and column
    5 as the utterance text. Rows whose index continues the previous one
    (previous + 1) belong to the same conversation; any other index starts a
    new conversation.

    Fixes over the previous version:
      * the row that opens a new conversation is now kept as that
        conversation's first utterance (it used to be discarded);
      * the final conversation of the file is now included (it used to be
        dropped because nothing flushed it after the loop);
      * the builtin ``id`` is no longer shadowed.

    The printed conversation count therefore now includes the last
    conversation.

    Args:
        file_path: path to the CSV file.
        max_len: maximum number of whitespace-separated tokens allowed per
            utterance (the '<start>'/'<end>' markers count).

    Returns:
        Two parallel lists ``(messages, responses)``.

    Raises:
        IndexError / ValueError: if a row has fewer than 6 comma-separated
            fields or a non-integer utterance index.
    """
    with open(file_path, 'r') as f:
        rows = f.read().strip().split('\n')

    # NOTE(review): a plain split(',') assumes no field contains a raw comma.
    # If the dataset encodes commas inside utterances with a placeholder
    # token, that token passes through preprocess_text as ordinary words;
    # confirm against the dataset.
    conversations = []
    conversation = []
    prev_idx = 0
    for row in rows[1:]:
        fields = row.split(',')
        utterance_idx, utterance = int(fields[1]), preprocess_text(fields[5])
        if utterance_idx != prev_idx + 1:
            # The index did not continue the running sequence, so this row
            # opens a new conversation: flush the one collected so far.
            if conversation:
                conversations.append(conversation)
            conversation = []
        conversation.append(utterance)
        prev_idx = utterance_idx

    # Flush the conversation still being collected when the file ends.
    if conversation:
        conversations.append(conversation)

    messages = []
    responses = []
    for conv in conversations:
        # Pair every utterance with the one that follows it.
        for m, r in zip(conv, conv[1:]):
            if m and r and 2 < len(m.split()) <= max_len and 2 < len(r.split()) <= max_len:
                messages.append(m)
                responses.append(r)

    assert len(messages) == len(responses)

    print(f'- number of conversations: {len(conversations)}')
    print(f'- number of utterances: {len(messages)}\n')

    return messages, responses


# -------------------- Persona Chat --------------------
def load_process_and_filter_PC(path_to_file, max_len=MAX_LEN):
    """Extract (message, response) pairs from a Persona Chat JSON file.

    The 'valid' dialogs are appended to the 'train' dialogs and both are
    processed together. For each dialog, the ``history`` list of its last
    ``utterances`` entry is taken as the conversation; every line goes
    through preprocess_text, and each pair of consecutive lines is kept when
    both have more than 2 and at most ``max_len`` whitespace-separated tokens
    (the '<start>'/'<end>' markers count).

    Args:
        path_to_file: path to the JSON file.
        max_len: maximum number of tokens allowed per utterance.

    Returns:
        Two parallel lists ``(messages, responses)``.
    """
    with open(path_to_file) as handle:
        corpus = json.load(handle)

    dialogs = corpus['train']
    dialogs.extend(corpus['valid'])
    print(f"- number of conversations: {len(dialogs)}")

    messages, responses = [], []
    for dialog in dialogs:
        history = dialog['utterances'][-1]['history']
        turns = list(map(preprocess_text, history))
        for m, r in zip(turns, turns[1:]):
            if not (m and r):
                continue
            if 2 < len(m.split()) <= max_len and 2 < len(r.split()) <= max_len:
                messages.append(m)
                responses.append(r)

    assert len(messages) == len(responses)

    print(f'- number of utterances: {len(messages)}\n')

    return messages, responses


# -------------------- Cornell Movies Dataset --------------------
def load_source_data(path_to_convs, path_to_lines):
    """Load the Cornell conversation index and the line-id -> text lookup.

    Both files are read as ISO-8859-1 text.

    Args:
        path_to_convs: file with one conversation per line; every ``L<digits>``
            token found on a line is collected, in order, as that
            conversation's line ids.
        path_to_lines: file with one utterance per line, fields separated by
            ' +++$+++ '; the first field is used as the line id and the last
            field as the text.

    Returns:
        ``(convs, line_dict)`` where ``convs`` is a list of line-id lists and
        ``line_dict`` maps each line id to its text after preprocess_text.
    """
    with open(path_to_convs, encoding='iso-8859-1') as handle:
        raw_convs = handle.read().strip().split('\n')
    convs = [re.findall(r'L\d+', raw) for raw in raw_convs]

    with open(path_to_lines, encoding='iso-8859-1') as handle:
        raw_lines = handle.read().strip().split('\n')

    split_lines = (raw.split(' +++$+++ ') for raw in raw_lines)
    line_dict = {fields[0]: preprocess_text(fields[-1]) for fields in split_lines}

    return convs, line_dict


def create_conv_pairs(path_to_convs, path_to_lines, max_len=MAX_LEN):
    """Build (message, response) pairs from the Cornell Movies files.

    Conversations and the line lookup come from load_source_data. For each
    pair of consecutive line ids in a conversation, the two texts are looked
    up and kept when both have more than 2 and at most ``max_len``
    whitespace-separated tokens (the '<start>'/'<end>' markers count).

    Args:
        path_to_convs: path to the conversations file.
        path_to_lines: path to the lines file.
        max_len: maximum number of tokens allowed per utterance.

    Returns:
        Two parallel lists ``(messages, responses)``.

    Raises:
        KeyError: if a conversation references a line id that is missing
            from the lines file.
    """
    convs, line_dict = load_source_data(path_to_convs, path_to_lines)

    messages, responses = [], []
    for turns in convs:
        for first_id, second_id in zip(turns, turns[1:]):
            m = line_dict[first_id]
            r = line_dict[second_id]
            if not (m and r):
                continue
            if 2 < len(m.split()) <= max_len and 2 < len(r.split()) <= max_len:
                messages.append(m)
                responses.append(r)

    assert len(messages) == len(responses)
    print(f'- number of utterances: {len(messages)}')
    return messages, responses


# # -------------------- Holl-E --------------------
# def load_process_and_filter_HE(path_to_file, max_len=MAX_LEN):
#     with open(path_to_file) as f:
#         data = json.load(f)

#     print(f'- number of conversations: {len(data)}')

#     messages = []
#     responses = []
#     for chat in data:
#         conv = list(map(preprocess_text, chat['chat']))
#         for i in range(len(conv) - 1):
#             m, r = conv[i], conv[i+1]
#             if m and r and 2 < len(m.split()) <= max_len and 2 < len(r.split()) <= max_len:
#                 messages.append(m)
#                 responses.append(r)
    
#     assert len(messages) == len(responses)
    
#     print(f'- number of utterances: {len(messages)}\n')
    
#     return messages, responses


# # -------------------- CCPE --------------------
# def load_process_and_filter_CCPE(path_to_file, max_len=MAX_LEN):
#     with open(path_to_file) as f:
#         data = json.load(f)
    
#     print(f'- number of conversations: {len(data)}')
#     messages = []
#     responses = []
#     for row in data:
#         convs = [preprocess_text(utter['text']) for utter in row['utterances']]
#         for i in range(len(convs) - 1):
#             m , r = convs[i], convs[i + 1]
#             if m and r and 2 < len(m.split()) <= max_len and 2 < len(r.split()) <= max_len:
#                 messages.append(m)
#                 responses.append(r)
#     assert len(messages) == len(responses)

#     print(f'- number of utterances: {len(messages)}\n')

#     return messages, responses

### Extracting utterances and saving to files

In [None]:
# Daily Dialogues: collect pairs from the train, test and validation splits.
print('-------------------- Daily Dialogues --------------------')
file_paths_DD = [
    os.path.join(path_to_datasets, relative_path)
    for relative_path in (
        'EMNLP_dataset/train/dialogues_train.txt',
        'EMNLP_dataset/test/dialogues_test.txt',
        'EMNLP_dataset/validation/dialogues_validation.txt',
    )
]
train_path_DD, test_path_DD, valid_path_DD = file_paths_DD

DD_messages = []
DD_responses = []
for file_name, file_path in zip(file_names, file_paths_DD):
    print(f'Loading from {file_name}...')
    messages, responses = load_process_and_filter_DD(file_path)
    DD_messages += messages
    DD_responses += responses

assert len(DD_messages) == len(DD_responses)
print(f'Total number of utterances from Daily Dialogues: {len(DD_messages)}\n')

# Preview the first five pairs.
for m, r in zip(DD_messages[:5], DD_responses[:5]):
    print(m, ' +++ ', r)

-------------------- Daily Dialogues --------------------
Loading from training file...
- number of utterances: 26731

Loading from test file...
- number of utterances: 2286

Loading from validation file...
- number of utterances: 2467

Total number of utterances from Daily Dialogues: 31484

<start> good . let s go now . <end>  +++  <start> all right . <end>
<start> really ? i think that s impossible ! <end>  +++  <start> you mean 30 push ups ? <end>
<start> you mean 30 push ups ? <end>  +++  <start> yeah ! <end>
<start> can you study with the radio on ? <end>  +++  <start> no , i listen to background music . <end>
<start> no , i listen to background music . <end>  +++  <start> what is the difference ? <end>


In [None]:
# ConvAI: a single JSON file; the loader keeps only dialogs in which no
# user is marked as a bot.
print('-------------------- ConvAI --------------------')
path_to_convai = os.path.join(path_to_datasets, 'ConvAI/train_full.json')

CA_messages, CA_responses = load_process_and_filter_CA(path_to_convai)

# Preview the first five pairs.
for m, r in zip(CA_messages[:5], CA_responses[:5]):
    print(m, ' +++ ', r)

-------------------- ConvAI --------------------
- number of conversations between human users: 404
- number of utterances: 3911

<start> hi <end>  +++  <start> hi <end>
<start> hi <end>  +++  <start> what do you think abouy it ? <end>
<start> what do you think abouy it ? <end>  +++  <start> about what ? <end>
<start> about what ? <end>  +++  <start> about text <end>
<start> what is pipa <end>  +++  <start> ? <end>


In [None]:
# Empathetic Dialogues: collect pairs from the train, test and validation CSVs.
print('-------------------- Empathetic Dialogues --------------------')
file_paths_ED = [
    os.path.join(path_to_datasets, relative_path)
    for relative_path in (
        'EmpatheticDialogues/train.csv',
        'EmpatheticDialogues/test.csv',
        'EmpatheticDialogues/valid.csv',
    )
]
train_path_ED, test_path_ED, valid_path_ED = file_paths_ED

ED_messages = []
ED_responses = []
for file_name, file_path in zip(file_names, file_paths_ED):
    print(f'Loading from {file_name}...')
    messages, responses = load_process_and_filter_ED(file_path)
    ED_messages += messages
    ED_responses += responses

assert len(ED_messages) == len(ED_responses)

print(f'Total number of utterances from Empathetic Dialogues: {len(ED_messages)}\n')

# Preview the first five pairs.
for m, r in zip(ED_messages[:5], ED_responses[:5]):
    print(m, ' +++ ', r)

-------------------- Empathetic Dialogues --------------------
Loading from training file...
- number of conversations: 19532
- number of utterances: 10163

Loading from test file...
- number of conversations: 2546
- number of utterances: 658

Loading from validation file...
- number of conversations: 2769
- number of utterances: 1012

Total number of utterances from Empathetic Dialogues: 11833

<start> this was a best friend . i miss her . <end>  +++  <start> where has she gone ? <end>
<start> where has she gone ? <end>  +++  <start> we no longer talk . <end>
<start> we no longer talk . <end>  +++  <start> oh was this something that happened because of an argument ? <end>
<start> oh ya ? i don t really see how <end>  +++  <start> dont you feel so . . its a wonder <end>
<start> i virtually thought so . . and i used to get sweatings <end>  +++  <start> wait what are sweatings <end>


In [None]:
# Persona Chat: a single JSON file; the loader merges its 'train' and
# 'valid' dialogs before extracting pairs.
print('-------------------- Persona Chat --------------------')
path_to_PC = os.path.join(path_to_datasets, 'Persona_Chat/personachat_original.json')

print('Loading Persona Chat data...')
PC_messages, PC_responses = load_process_and_filter_PC(path_to_PC)

# Preview the first five pairs.
for m, r in zip(PC_messages[:5], PC_responses[:5]):
    print(m, ' +++ ', r)

-------------------- Persona Chat --------------------
Loading Persona Chat data...
- number of conversations: 18878
- number of utterances: 83908

<start> wow , four sisters . just watching game of thrones . <end>  +++  <start> that is a good show i watch that while drinking iced tea <end>
<start> that is a good show i watch that while drinking iced tea <end>  +++  <start> i agree . what do you do for a living ? <end>
<start> i enjoy hanging with my mother she s my best friend <end>  +++  <start> that s nice . moms are pretty cool too . <end>
<start> hi ! i work as a gourmet cook . <end>  +++  <start> i don t like carrots . i throw them away . <end>
<start> i don t like carrots . i throw them away . <end>  +++  <start> really . but , i can sing pitch perfect . <end>


In [None]:
# print('-------------------- Holl-E --------------------')
# train_path_HE = os.path.join(path_to_datasets, 'Holl-E/train_data.json')
# test_path_HE = os.path.join(path_to_datasets, 'Holl-E/test_data.json')
# valid_path_HE = os.path.join(path_to_datasets, 'Holl-E/dev_data.json')

# file_paths_HE = [train_path_HE, test_path_HE, valid_path_HE]

# HE_messages, HE_responses = [], []
# for file_name, file_path in zip(file_names, file_paths_HE):
#     print(f'Loading from {file_name}...')
#     messages, responses = load_process_and_filter_HE(file_path)
#     HE_messages.extend(messages)
#     HE_responses.extend(responses)

# assert len(HE_messages) == len(HE_responses)

# print(f'Total number of utterances from Holl-E: {len(HE_messages)}\n')

# for m, r in zip(HE_messages[:5], HE_responses[:5]):
#     print(m, ' +++ ', r)


# print('-------------------- CCPE --------------------')
# path_to_ccpe = os.path.join(path_to_datasets, 'Coached Conversational Preference Elicitation (CCPE)/data.json')

# print('Loading CCPE data...')
# CCPE_messages, CCPE_responses = load_process_and_filter_CCPE(path_to_ccpe)

# for m, r in zip(CCPE_messages[:5], CCPE_responses[:5]):
#     print(m, ' +++ ', r)

In [None]:
# Cornell Movies: pairs are built from the conversation index and the lines file.
print('-------------------- Cornell Movies Dataset --------------------')
# NOTE(review): this folder is hard-coded under '/content/drive/My Drive/...'
# and 'version-1', whereas root_path uses '/content/drive/MyDrive/...' and
# 'version-2' - confirm both mount paths resolve in the target environment.
cornell_folder = '/content/drive/My Drive/Colab Notebooks/Chatbots/version-1/cornell movie-dialogs corpus'

path_to_convs = os.path.join(cornell_folder, 'movie_conversations.txt')
path_to_lines = os.path.join(cornell_folder, 'movie_lines.txt')

cornell_messages, cornell_responses = create_conv_pairs(path_to_convs, path_to_lines)

-------------------- Cornell Movies Dataset --------------------
- number of utterances: 89290


In [None]:
# Concatenate the per-dataset lists in one fixed order so that
# all_messages[i] and all_responses[i] still form a pair.
all_messages = [
    *DD_messages,
    *CA_messages,
    *ED_messages,
    *PC_messages,
    *cornell_messages,
]
all_responses = [
    *DD_responses,
    *CA_responses,
    *ED_responses,
    *PC_responses,
    *cornell_responses,
]

assert len(all_messages) == len(all_responses)

print(f'Total number of utterances: {len(all_messages)}')

Total number of utterances: 220426


In [None]:
def save_tokenizer(tokenizer):
    """Write the tokenizer's JSON form to ``<root_path>/tokenizer.json``.

    The value returned by ``tokenizer.to_json()`` is passed through
    ``json.dumps`` before being written as UTF-8.

    NOTE(review): if ``to_json()`` already returns a JSON string, the file
    holds a JSON-encoded string of that JSON, so readers must ``json.load``
    it first - confirm against the loading code before changing this.

    Args:
        tokenizer: object exposing a ``to_json()`` method.
    """
    target_path = os.path.join(root_path, 'tokenizer.json')
    payload = json.dumps(tokenizer.to_json(), ensure_ascii=False)
    with open(target_path, 'w', encoding='utf-8') as out:
        out.write(payload)


def save_utterances(messages, responses, vocab_size=VOCAB_SIZE):
    """Fit and save the tokenizer, then write train/eval utterance pairs.

    A Keras ``Tokenizer`` (``num_words=vocab_size``, no character filters) is
    fit on the unique utterances of ``messages + responses`` and stored via
    save_tokenizer. The pairs are then split with ``train_test_split``
    (10000 evaluation pairs, ``random_state=42``) and written to
    'train_utters.txt' and 'test_utters.txt' under ``path_to_datasets``, one
    pair per line in the form ``message _+++_ response``.

    Args:
        messages: list of message strings.
        responses: list of response strings, parallel to ``messages``.
        vocab_size: value passed to the tokenizer as ``num_words``.
    """
    train_utters_path = os.path.join(path_to_datasets, 'train_utters.txt')
    eval_utters_path = os.path.join(path_to_datasets, 'test_utters.txt')

    # Sort the unique utterances so the tokenizer sees them in a reproducible
    # order: iteration order of a set of strings can differ between
    # interpreter runs, which may change how equally frequent words are
    # ranked in the fitted vocabulary.
    all_utterances = sorted(set(messages + responses))
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, filters='')
    tokenizer.fit_on_texts(all_utterances)
    save_tokenizer(tokenizer)

    train_m, eval_m, train_r, eval_r = train_test_split(
        messages, responses, test_size=10000, random_state=42)

    breaker = ' _+++_ '
    # Explicit encoding, matching save_tokenizer, so the output does not
    # depend on the platform's default.
    with open(train_utters_path, 'w', encoding='utf-8') as f:
        f.writelines(m + breaker + r + '\n' for m, r in zip(train_m, train_r))

    with open(eval_utters_path, 'w', encoding='utf-8') as f:
        f.writelines(m + breaker + r + '\n' for m, r in zip(eval_m, eval_r))

    print(f'- number of training instances: {len(train_m)}')
    print(f'- number of evaluation instances: {len(eval_m)}')

In [None]:
# Fit/save the tokenizer and write the train/eval pair files for the combined data.
save_utterances(all_messages, all_responses)

- number of training instances: 210426
- number of evaluation instances: 10000
