In [2]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296181 sha256=3d1b9612e1a30bc281de49d211052cab94d81a8a2d24a12592e36e0163cf09ef
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

In [3]:
import os
import zipfile
import requests
import re
import itertools
import random
import statistics
import time
import string
from collections import defaultdict
from functools import reduce

import numpy as np
import fasttext
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.sonority_sequencing import SyllableTokenizer
from nltk import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

In [4]:
def get_swda(py_url, zip_url, subdir):
    # Ensure `swda.py` exists
    py_filename = "swda.py"
    if not os.path.exists(py_filename):
        print(f'{py_filename} not found. Downloading from {py_url}...')
        response = requests.get(py_url)
        with open(py_filename, 'wb') as f:
            f.write(response.content)

    # Ensure the zip exists. If 'swda/swda-metadata.csv' does not exist, then
    # this indicates the zip has not been extracted, so unzip it now:
    if not os.path.exists(os.path.join(subdir, 'swda-metadata.csv')):
        if not os.path.exists(subdir):
            os.makedirs(subdir)
        zip_filename = "swda.zip"
        if not os.path.exists(zip_filename):
            print(f'{zip_filename} not found. Downloading from {zip_url}...')
            response = requests.get(zip_url)
            with open(zip_filename, 'wb') as f:
                f.write(response.content)

        # Extract zip file into subdir
        print(f'Extracting {zip_filename} into {subdir}...')
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall()

def setup():
    data_dir = "swda"

    # Download `swda.py` and `swda.zip` if necessary
    py_url = "https://github.com/cgpotts/swda/raw/master/swda.py"
    zip_url = "https://github.com/cgpotts/swda/raw/master/swda.zip"
    get_swda(py_url, zip_url, data_dir)
    nltk.download('averaged_perceptron_tagger_eng')
    nltk.download('wordnet')
    try:
      nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
      nltk.download('punkt_tab')

In [5]:
setup()

swda.py not found. Downloading from https://github.com/cgpotts/swda/raw/master/swda.py...
swda.zip not found. Downloading from https://github.com/cgpotts/swda/raw/master/swda.zip...
Extracting swda.zip into swda...


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [6]:
regex_tokenizer = RegexpTokenizer(r'\w+')
syllable_tokenizer = SyllableTokenizer()
lemmatizer = WordNetLemmatizer()

DATA_DIR = "swda"
COUNT = 0
TRAIN_MALE_ENTRIES = 0
TRAIN_FEMALE_ENTRIES = 0
TRAIN_DISCARDED_ENTRIES = 0
VAL_MALE_ENTRIES = 0
VAL_FEMALE_ENTRIES = 0
VAL_DISCARDED_ENTRIES = 0

wordcounts = defaultdict(float)
male_wordcounts = defaultdict(float)
female_wordcounts = defaultdict(float)

DIVISIVE_WORDS = ['husband', 'wonderful', 'wear', 'dress', 'wife', 'huhuh', 'goodness',
                  'dinner', 'tax', 'cook', 'woman', 'gosh', 'color', 'girl', 'god',
                  'neat', 'mother', 'oh', 'vacation', 'flower', 'told', 'men', 'fish',
                  'ours', 'university', 'love',]

def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)  # Default to NOUN if unknown

def get_word_lemma_counts(sent):
    out = defaultdict(float)

    # Get rid of all annotations, make lowercase, and nuke punctuation
    sent = purge_enclosed(sent)
    sent = " ".join(clean_square_brackets(sent).split())
    sent = sent.lower()
    sent = sent.translate(str.maketrans('', '', string.punctuation))

    words = nltk.word_tokenize(sent)

    for word in words:
        pos = get_wordnet_pos(word)
        lemma = lemmatizer.lemmatize(word, pos=pos)
        out[f'{lemma}'] += 1

    return out


def clean_square_brackets(text):
    stack = []
    result = ""
    i = 0

    while i < len(text):
        if text[i] == "[":
            # Start of a new bracketed phrase, push current result to stack
            stack.append(result)
            result = ""
            i += 1
        elif text[i] == "]":
            # End of a bracketed phrase, pop from stack
            if stack:
                before_bracket = stack.pop()
                # Split the current result by '+', and take the part after '+'
                parts = result.split("+")
                if len(parts) > 1:
                    result = before_bracket + parts[1].strip()
                else:
                    result = before_bracket + result
            i += 1
        else:
            # Accumulate characters in the current result
            result += text[i]
            i += 1

    return result.strip()

def purge_enclosed(text):
    stack = []
    result = []
    braces = {"(": ")", "{": "}", "<": ">"}
    open_braces = set(braces.keys())
    close_braces = set(braces.values())

    for char in text:
        if char in open_braces:
            # Start of a new enclosure, push to stack
            stack.append(char)
        elif char in close_braces:
            # End of an enclosure, pop from stack
            if stack and braces[stack[-1]] == char:
                stack.pop()
        elif not stack:
            # Only add to result if we're not inside any braces
            result.append(char)

    return ''.join(result).strip()

def side_to_sentences(text):
    # Convert tuple into paragraph
    text = ''.join(text)

    # Get sentences
    sentences = nltk.tokenize.sent_tokenize(text)
    # nltk doesn't recognize "/" as a sentence divider, so divide it manually
    split_at_slashes = []
    for sent in sentences:
        split_at_slashes.extend(sent.split("/"))

    result = ""
    # Go through and destroy anything too short, and clean up the others
    for sent in split_at_slashes:
        if len(sent) <= 8:
            continue
        sent = purge_enclosed(sent)
        sent = " ".join(clean_square_brackets(sent).split())
        sent = sent.lower()
        result += sent + " "

    sent = sent.strip('#')

    return result, len(split_at_slashes)

MAIN FASTTEXT FEATURES ARE DEFINED HERE

In [7]:
def format_partial_conversation(features, tuple_of_sentences):
    sentences, num_sentences = side_to_sentences(tuple_of_sentences)

    tokens = nltk.word_tokenize(sentences)
    # nontrivial_tokens = [token for token in tokens if len(token) > 1]

    lemmata = []

    for word in tokens:
        pos = get_wordnet_pos(word)
        lemmata.append(lemmatizer.lemmatize(word, pos=pos))

    ## Very effective: 55.14%
    for word in DIVISIVE_WORDS:
        if word in lemmata:
            features[f'has_{word}'] = "True"
        else:
            features[f'has_{word}'] = "False"

    ## Not very effective: 50.26%
    features['interrupted'] = "-" in sentences



    ## Not very effective: 50.27%
    # features['mentions_children'] = reduce(lambda a, b: a or b,
    #                            [word in lemmata for word in ["child",
    #                                                       "kid",
    #                                                       "son",
    #                                                       "daughter",
    #                                                       "baby",
    #                                                       ]])

    ## No effect
    # avg_sentence_length = len(tokens) / num_sentences
    # if avg_sentence_length < 7:
    #     features['avg_length'] = "short"
    # elif avg_sentence_length < 14:
    #     features['avg_length'] = "medium"
    # else:
    #     features['avg_length'] = "long"


    ### Pronoun features: no effect on accuracy
    # # Count pronouns, loop is faster than .count here
    # num_prons, fp_prons, sp_prons = 0,0,0
    # male_tp_prons, female_tp_prons, neutral_tp_prons = 0,0,0
    # for word in tokens:
    #     if word == 'i':
    #         fp_prons += 1
    #         num_prons += 1
    #     elif word == 'you':
    #         sp_prons += 1
    #         num_prons += 1
    #     elif word in ['he', 'him', 'his']:
    #         male_tp_prons += 1
    #         num_prons += 1
    #     elif word in ['she', 'her', 'hers']:
    #         female_tp_prons += 1
    #         num_prons += 1
    #     elif word in ['they', 'them', 'theirs']:
    #         neutral_tp_prons += 1
    #         num_prons += 1

    # tp_prons = male_tp_prons + female_tp_prons + neutral_tp_prons

    # features['primary_fp_vs_sp_prons'] = "na"
    # features['primary_person_prons'] = "na"
    # features['primary_gender_prons'] = "na"

    ## No effect on accuracy
    # if fp_prons > sp_prons:
    #     features['primary_fp_vs_sp_prons'] = "fp"
    #     if fp_prons > tp_prons:
    #         features['primary_person_prons'] = "fp"
    #     elif tp_prons > fp_prons:
    #         features['primary_person_prons'] = "tp"
    # elif sp_prons > fp_prons:
    #     features['primary_fp_vs_sp_prons'] = "sp"
    #     if sp_prons > tp_prons:
    #         features['primary_person_prons'] = "sp"
    #     elif tp_prons > sp_prons:
    #         features['primary_person_prons'] = "tp"

    ## Zero effect on accuracy
    # if neutral_tp_prons > male_tp_prons:
    #     if neutral_tp_prons > female_tp_prons:
    #         features['primary_gender_prons'] = "neutral"
    #     elif female_tp_prons > neutral_tp_prons:
    #         features['primary_gender_prons'] = "female"
    # elif male_tp_prons > neutral_tp_prons:
    #     if male_tp_prons > female_tp_prons:
    #         features['primary_gender_prons'] = "male"
    #     elif female_tp_prons > male_tp_prons:
    #         features['primary_gender_prons'] = "female"

    ## Analyze word lengths slightly more than naively
    # word_lens = list(map(len, nontrivial_tokens))

    ## This should account for skewed data where someone uses lots of small words
    # if len(word_lens) > 0:
    #     avg_word_length = statistics.mean(word_lens) / num_sentences
    # else:
    #     return None

    ## Zero change to accuracy
    # if avg_word_length >= 6:
    #     features['avg_word_length'] = "long"
    # elif avg_word_length >= 3:
    #     features['avg_word_length'] = "medium"
    # else:
    #     features['avg_word_length'] = "short"


    ## Zero change to accuracy
    ## This should be more representative of medium-length words
    # nontrivial_word_lens = [n / num_sentences for n in word_lens if n > 3]
    # if len(nontrivial_word_lens) > 0:
    #     median_nontrivial_word_length = statistics.median(nontrivial_word_lens)
    # else:
    #     median_nontrivial_word_length = 0

    # if median_nontrivial_word_length >= 7:
    #     features['median_nontrivial_word_length'] = "long"
    # elif median_nontrivial_word_length >= 5:
    #     features['median_nontrivial_word_length'] = "medium"
    # else:
    #     features['median_nontrivial_word_length'] = "short"


    ## about ~16% of english words are 8 or longer characters

    # features['has_long_word_by_chars'] = "False"
    # for n in nontrivial_word_lens:
    #     if n >= 8:
    #         features['has_long_word_by_chars'] = "True"
    #         break

    # Analyze syllables, turn each word into the number of syllables it contains
    # This has some small issues with apostrophes, since we split on them before.
    syllable_counts = [len(syllable_tokenizer.tokenize(word)) for word in tokens]

    # avg_sent_syllables = sum(syllable_counts) / num_sentences

    # No change to accuracy
    # if avg_sent_syllables > 30:
    #     features['avg_sent_syllables'] = "many"
    # elif avg_sent_syllables > 15:
    #     features['avg_sent_syllables'] = "medium"
    # else:
    #     features['avg_sent_syllables'] = "few"

    # if len(syllable_counts) > 0:
    #     avg_num_syllables = statistics.mean(syllable_counts) / num_sentences
    # else:
    #     return None

    # nontrivial_syllable_counts = [n for n in syllable_counts if n > 1]

    # if len(nontrivial_syllable_counts) > 0:
    #     median_nontrivial_syllables = statistics.median(nontrivial_syllable_counts)
    # else:
    #     median_nontrivial_syllables = 0

    # if median_nontrivial_syllables > 6:
    #     features['median_nontrivial_syllables'] = "many"
    # elif median_nontrivial_syllables >= 3:
    #     features['median_nontrivial_syllables'] = "medium"
    # else:
    #     features['median_nontrivial_syllables'] = "few"

    # features['has_3_syllable_or_longer'] = "False"
    # features['has_4_syllable_or_longer'] = "False"
    # features['has_5_syllable_or_longer'] = "False"

    for n in syllable_counts:
        if n >= 5:
            features['has_3_syllable_or_longer'] = "True"
            features['has_4_syllable_or_longer'] = "True"
            features['has_5_syllable_or_longer'] = "True"
            break
        if n >= 4:
            features['has_3_syllable_or_longer'] = "True"
            features['has_4_syllable_or_longer'] = "True"
        if n >= 3:
            features['has_3_syllable_or_longer'] = "True"

    # Create the formatted FastText line
    formatted = f"__label__{features['sex']} "
    for feature in features.keys():
        if feature == 'sex':    # Already did this one as __label__ for FastText
            continue
        formatted += f"{feature}:{str(features[feature])} "
    formatted += f'partial_conversation:"{tuple_of_sentences}"'
    return formatted

In [8]:
def add_partial_conversations(output_file, transcript, mode):
    metadata = transcript.metadata
    idx = transcript.conversation_no

    for caller in ["A", "B"]:
        features = defaultdict(str)
        features['sex'] = (metadata[idx]["from_caller_sex"] if caller == "A"
                            else metadata[idx]["to_caller_sex"])

        # Get all sentences spoken by caller A
        interrupts = False
        side = []
        utt_features_to_check = [
            # 'bh', # no effect
            'ba', # huge effect: 56.35% accuracy
            # '^g', # no effect
            # 'fa', # no effect
            # 'ft', # no effect
            'ar', # slight effect: 50.50%
            'nn', # slight effect: 50.82% accuracy
            # 'ng', # no effect
            # 'bf', # no effect
            ]
        for utt in transcript.utterances:
            if utt.caller == caller:
                side.append(utt)
                for feat in utt_features_to_check:
                    if utt.act_tag == feat:
                        features[feat] = True
            elif not interrupts:
                if "-" in utt.text:
                    interrupts = True

        for feat in utt_features_to_check:
            if not features[feat]:
                features[feat] = False

        GRAM_LENGTH = 50
        global COUNT

        # Use itertools.combinations to get every possible combination of 10
        # sentences spoken by caller A
        sentence_ngrams = []
        utterance_strings = [utt.text for utt in side]
        if len(side) < GRAM_LENGTH:
            COUNT += 1

            sentence_ngrams = [utterance_strings]
        else:
            sentence_ngrams = [random.sample(utterance_strings, GRAM_LENGTH) for _ in range(int(3 * (len(side) // GRAM_LENGTH)))]

        # Pre-count total number of each sex's entry
        if mode == 'TRAIN':
            global TRAIN_MALE_ENTRIES
            global TRAIN_FEMALE_ENTRIES
            global TRAIN_DISCARDED_ENTRIES
            for sentence_ngram in sentence_ngrams:
                if features['sex'] == 'FEMALE':
                    TRAIN_FEMALE_ENTRIES += 1
                elif features['sex'] == 'MALE':
                    TRAIN_MALE_ENTRIES += 1
        elif mode == 'VAL':
            global VAL_MALE_ENTRIES
            global VAL_FEMALE_ENTRIES
            global VAL_DISCARDED_ENTRIES
            for sentence_ngram in sentence_ngrams:
                if features['sex'] == 'FEMALE':
                    VAL_FEMALE_ENTRIES += 1
                elif features['sex'] == 'MALE':
                    VAL_MALE_ENTRIES += 1


        # Manually calculated: 1 - F/M, where F and M are the total numbers of
        # sentence ngrams, respectively
        TRAIN_MOST_COMMON_SEX = 'FEMALE'
        VAL_MOST_COMMON_SEX = 'FEMALE'
        TRAIN_DISCARD_PROBABILITY = 1 - (10910.0 / 14310.0)
        VAL_DISCARD_PROBABILITY = 1 - (4685 / 6987)

        features['interrupts'] = str(interrupts)

        # Process every combination of GRAM_LENGTH sentences
        for sentence_ngram in sentence_ngrams:
            sentence_lemmas = get_word_lemma_counts(sentence_ngram)
            for lemma, count in sentence_lemmas.items():
                wordcounts[lemma] += count
                if features["sex"] == 'MALE':
                    male_wordcounts[lemma] += count
                elif features["sex"] == 'FEMALE':
                    female_wordcounts[lemma] += count
            # Discard all excess female entries to balance training data
            if mode == "TRAIN" and features["sex"] == TRAIN_MOST_COMMON_SEX:
                if np.random.random() < TRAIN_DISCARD_PROBABILITY:
                    TRAIN_DISCARDED_ENTRIES += 1
                    continue
            if mode == "VAL" and features["sex"] == VAL_MOST_COMMON_SEX:
                if np.random.random() < VAL_DISCARD_PROBABILITY:
                    VAL_DISCARDED_ENTRIES += 1
                    continue


            formatted = format_partial_conversation(features, sentence_ngram)
            if formatted:
                output_file.write(formatted + "\n")

def make_fasttext(subdirs, output_file, Transcript, mode):
    with open(output_file, 'w') as outfile:
        # Step 1: Process each specified subdirectory
        for subdir_path in subdirs:
            if not os.path.isdir(subdir_path):
                print(f"Skipping invalid directory {subdir_path}")
                continue

            # Step 2: Iterate over .utt.csv files
            print(f"Processing files in {subdir_path}...")
            for filename in os.listdir(subdir_path):
                if filename.endswith(".utt.csv"):
                    filepath = os.path.join(subdir_path, filename)

                    # Step 3: Create a Transcript object
                    try:
                        metadata_file = os.path.join(DATA_DIR, "swda-metadata.csv")
                        transcript = Transcript(filepath, metadata_file)
                    except Exception as e:
                        print(f"Error processing file {filepath}: {e}")
                        continue

                    add_partial_conversations(outfile, transcript, mode = mode)

    print(f"Saved sentences and conversation sides to {output_file}")

def validate(model, VALIDATION_DIRS, Transcript):
    # Preprocess the validation data
    validation_ft = "validation.ft"
    VALIDATION_DIRS = [os.path.join(DATA_DIR, filename) for filename in VALIDATION_DIRS]
    print(f"VALIDATION_DIRS: {VALIDATION_DIRS}")
    make_fasttext(VALIDATION_DIRS, validation_ft, Transcript, mode = "VAL")

    # Measure performance on validation set
    validation_performance = model.test('validation.ft')
    print(f"Performance on validation set "
          f"({validation_performance[0]} entries): "
          f"\t{validation_performance[1]*100:.2f}% accuracy")

def test(model, TEST_DIRS, Transcript):
    # Preprocess the test data
    test_ft = "test.ft"
    TEST_DIRS = [os.path.join(DATA_DIR, filename) for filename in TEST_DIRS]
    print(f"TEST_DIRS: {TEST_DIRS}")
    make_fasttext(TEST_DIRS, test_ft, Transcript, mode = "TEST")

    # Measure performance on test set
    test_performance = model.test('test.ft')
    print(f"Performance on test set "
          f"({test_performance[0]} entries): "
          f"\t{test_performance[1]*100:.2f}% accuracy")

def train():
    start = time.time()

    TRAIN_DIRS = []
    VALIDATION_DIRS = []
    TEST_DIRS = []
    for i in range(0, 13):
        filename = f"sw{i:02}utt"
        if i < 7:#< 7:
            TRAIN_DIRS.append(filename)
        elif i < 10:#< 10:
            VALIDATION_DIRS.append(filename)
        else:
            TEST_DIRS.append(filename)

    # The import needs to be in this function, not in the root namespace,
    # because `setup` needs to import train() without knowing what `Transcript`
    # is
    from swda import Transcript

    # Preprocess the training data: Combine files in TRAIN_DIRS into one big
    # training file in FastText format
    train_ft = "train.ft"
    TRAIN_DIRS = [os.path.join(DATA_DIR, filename) for filename in TRAIN_DIRS]
    print(f"TRAIN_DIRS: {TRAIN_DIRS}")
    make_fasttext(TRAIN_DIRS, train_ft, Transcript, mode = "TRAIN")

    print(f"{TRAIN_MALE_ENTRIES} male partial conversations in training")
    print(f"{TRAIN_FEMALE_ENTRIES} female partial conversations in training")
    print(f"{TRAIN_DISCARDED_ENTRIES} discarded partial conversations in training")

    word_disparities = defaultdict(float)

    print("Total words: " + str(sum(wordcounts.values())))
    print("Unique words: " + str(len(wordcounts.keys())))

    words_filtered = 0

    for lemma, count in wordcounts.items():
        # 72 is about 0.01% of the corpus
        if count > 72:
            disparity = abs((male_wordcounts[lemma]/.545)
                            - (female_wordcounts[lemma]/.455)) / count
            word_disparities[lemma] = disparity
        else:
            words_filtered += 1

    print("Words remaining: " + str(len(word_disparities.keys())))

    print("Words filtered: " + str(words_filtered))

    top_divisive_words = dict(sorted(word_disparities.items(),
                         key=lambda x: x[1],
                         reverse=True)[:30]).keys()
    print("Most divisive words: " + str(top_divisive_words))


    # Train and test the model on training set
    model = fasttext.train_supervised(train_ft,
                                      epoch=35,
                                      )
    train_performance = model.test('train.ft')
    print(f"Performance on train set "
          f"({train_performance[0]} entries): "
          f"\t{train_performance[1]*100:.2f}% accuracy")

    model.save_model("model.bin")

    trained = time.time()
    print(f"Finished training in {trained - start:.2f} seconds")

    print(COUNT)

    validate(model, VALIDATION_DIRS, Transcript)

    print(COUNT)

    print(f"{VAL_MALE_ENTRIES} male partial conversations in validation")
    print(f"{VAL_FEMALE_ENTRIES} female partial conversations in validation")
    print(f"{VAL_DISCARDED_ENTRIES} discarded partial conversations in validation")

    # No peeking
    test(model, TEST_DIRS, Transcript)

    end = time.time()
    print(f"Finished overall in {end - start:.2f} seconds")

In [9]:
train()

TRAIN_DIRS: ['swda/sw00utt', 'swda/sw01utt', 'swda/sw02utt', 'swda/sw03utt', 'swda/sw04utt', 'swda/sw05utt', 'swda/sw06utt']
Processing files in swda/sw00utt...
Processing files in swda/sw01utt...
Processing files in swda/sw02utt...
Processing files in swda/sw03utt...
Processing files in swda/sw04utt...
Processing files in swda/sw05utt...
Processing files in swda/sw06utt...
Saved sentences and conversation sides to train.ft
2499 male partial conversations in training
3377 female partial conversations in training
787 discarded partial conversations in training
Total words: 1688207.0
Unique words: 16588
Words remaining: 1194
Words filtered: 15394
Most divisive words: dict_keys(['husband', 'recipe', 'sauce', 'wear', 'dinner', 'jean', 'salad', 'cooking', 'wonderful', 'brick', 'goodness', 'church', 'ooh', 'busy', 'girl', 'clothes', 'heat', 'dress', 'shell', 'painting', 'plano', 'scary', 'mother', 'woman', 'huhuh', 'dish', 'baby', 'stayed', 'daughter', 'fish'])
Performance on train set (5089