In [1]:
import re
import csv
import random
from os import path, getcwd

import nltk
from nltk.corpus import nps_chat
from collections import defaultdict

In [2]:
# Fetch the NPS Chat corpus into the local NLTK data directory. This only has
# to happen once per machine: later runs simply report the package as already
# up-to-date. The call's return value is shown as the cell output.
nltk.download('nps_chat')

[nltk_data] Downloading package nps_chat to
[nltk_data]     /Users/lzfelix/nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!


True

In [3]:
# The corpus files are read through NLTK's nps_chat reader, so no local glob of
# the raw XML is needed (formerly: path.join(getcwd(), './nps_chat/*.xml')).

# Destination of each cleaned partition, relative to the working directory.
TRAIN_PATH = path.join(getcwd(), './clean/nps_train.tsv')
DEV_PATH = path.join(getcwd(), './clean/nps_dev.tsv')
TEST_PATH = path.join(getcwd(), './clean/nps_test.tsv')

# Chat-room files assigned to each partition; a whole file always goes to a
# single partition.
TRAIN_SET = [
    '10-19-adults_706posts.xml',
    '11-09-20s_706posts.xml',
    '11-08-20s_705posts.xml',
    '10-19-40s_686posts.xml',
    '11-09-40s_706posts.xml',
    '10-24-40s_706posts.xml',
    '11-09-teens_706posts.xml',
    '11-08-40s_706posts.xml',
    '10-19-30s_705posts.xml',
    '11-08-adults_705posts.xml',
    '11-06-adults_706posts.xml',
]
DEV_SET = [
    '10-26-teens_706posts.xml',
    '11-09-adults_706posts.xml',
]
TEST_SET = [
    '10-19-20s_706posts.xml',
    '11-08-teens_706posts.xml',
]

In [4]:
class Preprocessor:
    """Normalises raw chat utterances into lowercase, punctuation-free text.

    One instance is tied to one chat room, because anonymised usernames embed
    the room name (``<dd>-<dd>-<room>User<n>``).
    """

    # Placeholders deliberately avoid brackets: the punctuation pass in
    # `preprocess` blanks out '[' and ']', which would destroy tokens such as
    # '[emj]'.
    TOKEN_EMOJI = 'EMOJI_WORD'
    TOKEN_USERNAME = 'user'
    TOKEN_NUMBER = ''
    TOKEN_EMPTY = 'XXX'

    def __init__(self, room_name, replace_emoji=False):
        """Compiles the patterns used by `preprocess`.

        Args:
            room_name: Room identifier embedded in the anonymised usernames.
                It is matched literally.
            replace_emoji: When False (default) every emoticon is normalised
                to ':)'. When True the emoticon placeholder is kept instead.
        """
        # The room name is escaped so that regex metacharacters in it cannot
        # alter the pattern.
        self.username_regex = re.compile(
            r"(\d{1,2}\-){2}" + re.escape(room_name) + r"(User\d+)"
        )
        self.emoji_regex = re.compile(r'(>?[:;=\+]-?[P\)\(@\*o>]|<3|o_0|0o|o0|>_>|o_o)')
        self.url_regex = re.compile(r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')
        self.replace_emoji = replace_emoji

    def preprocess(self, text):
        """Returns the cleaned, lowercased version of a single utterance.

        Steps, in order: URLs become 'url'; emoticons become a placeholder;
        username mentions become `TOKEN_USERNAME`; a leading '.ACTION' gets the
        username token back; punctuation is blanked; runs of '?' collapse to
        one; digits are replaced by `TOKEN_NUMBER`; whitespace is squeezed;
        underscores are dropped; the result is lowercased and stripped.

        Note that with `replace_emoji=True` the placeholder itself goes through
        the underscore removal and lowercasing, so it appears as 'emojiword'.

        Args:
            text: The raw utterance.

        Returns:
            The cleaned string, or `TOKEN_EMPTY` if nothing is left.
        """
        text = self.url_regex.sub('url', text)

        # Emoticons are protected by a placeholder before punctuation is
        # stripped; username mentions are replaced by a fixed token.
        text = self.emoji_regex.sub(self.TOKEN_EMOJI, text)
        text = self.username_regex.sub(self.TOKEN_USERNAME, text)

        # It seems that there's a bug on the dataset. During the anonymization
        # phase sentences such as 'USER ACTION ...' had the user part replaced
        # by '.'. The dot is escaped so only that literal prefix is rewritten.
        text = re.sub(r'^\.ACTION', self.TOKEN_USERNAME + ' ACTION', text)

        # Blanks out usual punctuation. Every '.' and '!' disappears here, so
        # ellipses and repeated exclamation marks need no separate handling.
        text = re.sub(r'[\[\]\'!\^\\\.\(\)\*\/%,\-"#@]', ' ', text)
        text = re.sub(r'\?{2,}', '?', text)

        # Numbers are handled after the username pass, since usernames contain
        # digits that the username pattern relies on.
        text = re.sub(r'\d+', self.TOKEN_NUMBER, text)

        # Removing redundant spaces
        text = re.sub(r'\s{2,}', ' ', text)

        # Adding the emoji back, if this is the case. A plain string replace is
        # used because the placeholder is a literal, not a pattern.
        if not self.replace_emoji:
            text = text.replace(self.TOKEN_EMOJI, ':)')

        text = re.sub(r'_+', '', text)
        text = text.lower().strip()

        return text if text else self.TOKEN_EMPTY

In [5]:
def load_split_data(split_files):
    """Loads and preprocesses every post of the given corpus files.

    Args:
        split_files: Iterable of NPS Chat file ids, named like
            '<mm>-<dd>-<room>_<n>posts.xml'.

    Returns:
        A list with one dict per post, in file order, with the keys:
        'id' ('<file_id>_<1-based position>'), 'label' (the post's 'class'
        attribute), 'text' (the raw text) and 'clean' (the preprocessed text).
    """
    samples = []

    for file_id in split_files:
        # All usernames on this chatroom contain this substring, e.g.
        # '10-19-adults_706posts.xml' -> 'adults'. It is used to replace all
        # mentions by a specific token.
        username_substring = file_id.split('-')[2].split('_')[0]
        pre = Preprocessor(username_substring)

        for position, utterance in enumerate(nps_chat.xml_posts(file_id), start=1):
            # An element without text content would yield None here, which the
            # regex-based preprocessing cannot handle; treat it as empty.
            text = utterance.text or ''

            samples.append({
                'id': '{}_{}'.format(file_id, position),
                'label': utterance.attrib['class'],
                'text':  text,
                'clean': pre.preprocess(text)
            })

    return samples


# Build the three partitions in train/dev/test order.
train_set, dev_set, test_set = (
    load_split_data(file_ids) for file_ids in (TRAIN_SET, DEV_SET, TEST_SET)
)

# Total number of loaded samples across all partitions.
n_conversations = sum(map(len, (train_set, dev_set, test_set)))

In [6]:
# Each partition is reported with its size and its share of all samples.
summary_rows = (
    (' - Train: {}\t{:.4}%', train_set),
    (' - Dev:   {}\t{:.4}%', dev_set),
    (' - Test:  {}\t{:.4}%', test_set),
)

print('Split summary:')
for row_template, partition in summary_rows:
    partition_size = len(partition)
    print(row_template.format(partition_size, partition_size / n_conversations * 100))
print(' - Total: {}'.format(n_conversations))

Split summary:
 - Train: 7743	73.28%
 - Dev:   1412	13.36%
 - Test:  1412	13.36%
 - Total: 10567


In [7]:
# Persisting data

def write_partition_to_disk(dataset, destination_file):
    """Creates the train/dev/test file with the utterances from all dialogues in <dataset>.

    The output is a tab-separated, UTF-8 encoded file with the header
    'id', 'label', 'text', 'clean' followed by one row per sample.

    Args:
        dataset: Iterable of dicts holding the keys 'id', 'label', 'text' and
            'clean'.
        destination_file: Path of the file to (over)write. Missing parent
            directories are created.
    """
    import os

    # A fresh checkout has no output folder yet; create it instead of failing.
    target_dir = os.path.dirname(destination_file)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # newline='' is required by the csv module so that it controls the line
    # endings itself; the explicit encoding keeps the output independent of
    # the platform default.
    with open(destination_file, 'w', newline='', encoding='utf-8') as tsv_file:
        writer = csv.writer(tsv_file, delimiter='\t')
        writer.writerow(['id', 'label', 'text', 'clean'])
        writer.writerows(
            [sample['id'], sample['label'], sample['text'], sample['clean']]
            for sample in dataset
        )


# Persist each partition to its own TSV file, in train/dev/test order.
for partition, target_path in (
    (train_set, TRAIN_PATH),
    (dev_set, DEV_PATH),
    (test_set, TEST_PATH),
):
    write_partition_to_disk(partition, target_path)


In [8]:
# Data peeking: print a random handful of cleaned training utterances.

AMOUNT_PEEKS = 50
PEEK_SEED = 42

# The header and the rows share one template so the columns always line up
# (label truncated/padded to 12 chars, clean text to 90 chars).
ROW_TEMPLATE = '{:12.12}\t{:90.90}\t'

# A dedicated, seeded generator makes the peek identical on every run without
# touching the global random state. Sampling without replacement avoids
# showing the same utterance twice and copes with an empty or small train set.
peek_rng = random.Random(PEEK_SEED)
sampled_utterances = peek_rng.sample(
    range(len(train_set)),
    min(AMOUNT_PEEKS, len(train_set)),
)

print(ROW_TEMPLATE.format('-label-', '-clean text (90 chars)-'))
for i in sampled_utterances:
    sample = train_set[i]
    print(ROW_TEMPLATE.format(sample['label'], sample['clean']))
    

-label-   	-clean text (40 chars)-                 
Greet       	hello user                                                                                	
Statement   	look at us babi look at us now                                                            	
System      	join                                                                                      	
System      	join                                                                                      	
Emotion     	lol                                                                                       	
Greet       	user                                                                                      	
Statement   	nevermind                                                                                 	
Other       	ntmn                                                                                      	
System      	join                                                                                      	
Sta