In [1]:
import os
import pickle
import time
import numpy as np
from pandas import read_csv, DataFrame, read_table
from random import shuffle
from nltk.tokenize import TweetTokenizer, word_tokenize
from numba import autojit, prange
from joblib import Parallel, delayed

### Files

In [2]:
CSV_PATH = "../Corpora/TO_CREATE_VALID_UTTERANCE"
EMOTIONS_PATH = CSV_PATH + "/emotions"
GOOGLE_NEWS_PATH = CSV_PATH + "/GOOGLE_1_billion_word/heldout"
df_maps = {}
i = 0

#### TXT

In [None]:
#Emotions
anger = read_table(os.path.join(TXT_PATH, "anger"), header=None, names=["anger"])
fear = read_table(os.path.join(TXT_PATH, "fear"), header=None, names=["fear"])
joy = read_table(os.path.join(TXT_PATH, "joy"), header=None, names=["joy"])
love = read_table(os.path.join(TXT_PATH, "love"), header=None, names=["love"])
sadness = read_table(os.path.join(TXT_PATH, "sadness"), header=None, names=["sadness"])
suprise = read_table(os.path.join(TXT_PATH, "surprise"), header=None, names=["surprise"])

In [None]:
#Emotions
for filename in os.listdir(EMOTIONS_PATH):
    df_maps[i] = read_table(os.path.join(EMOTIONS_PATH, filename), header=None)
    i += 1


In [3]:
#Google News
for filename in os.listdir(GOOGLE_NEWS_PATH):
    print(i, filename)
    df_maps[i] = read_table(os.path.join(GOOGLE_NEWS_PATH, filename), header=None, names="{}".format(i))
    i += 1

0 news.en.heldout-00038-of-00050
1 news.en.heldout-00035-of-00050
2 news.en.heldout-00043-of-00050
3 news.en.heldout-00042-of-00050
4 news.en.heldout-00034-of-00050
5 news.en.heldout-00028-of-00050
6 news.en.heldout-00048-of-00050
7 news.en.heldout-00023-of-00050
8 news.en.heldout-00037-of-00050
9 news.en.heldout-00017-of-00050
10 news.en.heldout-00009-of-00050
11 news.en.heldout-00015-of-00050
12 news.en.heldout-00040-of-00050
13 news.en.heldout-00033-of-00050
14 news.en.heldout-00030-of-00050
15 news.en.heldout-00016-of-00050
16 news.en.heldout-00026-of-00050
17 news.en.heldout-00021-of-00050
18 news.en.heldout-00006-of-00050
19 news.en.heldout-00018-of-00050
20 news.en.heldout-00008-of-00050
21 news.en.heldout-00039-of-00050
22 news.en.heldout-00046-of-00050
23 news.en.heldout-00010-of-00050
24 news.en.heldout-00036-of-00050
25 news.en.heldout-00031-of-00050
26 news.en.heldout-00003-of-00050
27 news.en.heldout-00047-of-00050
28 news.en.heldout-00014-of-00050
29 news.en.heldout-00049

#### CSV

In [None]:
utterance_flights = read_csv(os.path.join(CSV_PATH, "Utterance-Flights-f1197494.csv"))
sentence_pairs = read_csv(os.path.join(CSV_PATH, "1377882923_sentence_pairs.csv"))
headlines = read_csv(os.path.join(CSV_PATH, "examiner-date-text.csv"))
# movie_lines = read_csv(os.path.join(CSV_PATH, "movie_lines.csv"), delimiter="+")
# movie_quoats = read_csv(os.path.join(CSV_PATH, "moviequotes.scripts.csv"), delimiter="+")


In [None]:
chosen_utt = DataFrame(utterance_flights, columns=["response_1", "response_2", "response_3", "scenario"])
chosen_sent = DataFrame(sentence_pairs, columns=["sentenceA", "sentenceB"])
chosen_headlines = DataFrame(headlines, columns=["headline_text"])
# chosen_lines = DataFrame(movie_lines, columns=["They do not!"])
# chosen_quoats = DataFrame(movie_quoats, columns=["Ladies and gentlemen"])


### Preprocessing

In [None]:
pre_data = [
    anger,
    fear,
    joy,
    love,
    sadness,
    suprise,
    chosen_utt,
    chosen_sent,
    chosen_headlines
#     chosen_lines,
#     chosen_quoats
]
data = []

In [None]:
def preprocessing(df, column):
    df = df.dropna()
    df = df.applymap(lambda x: x.rstrip())
    df = df[df[column].str.len() > 6]
    df = df.applymap(lambda x: x.lstrip())
    df = df.applymap(lambda x: x[0].title() + x[1: ])
    df = df.applymap(lambda x: x + "." if x[-1] != "." else "")
    return df

In [None]:
utterances = []

def get_utteranes(df, utterances):
    for column in df.columns:
        utterances += list(df[column].unique())
    return utterances


In [None]:
data = []
for d in pre_data:
    pre = list(pre_data[d])
    data.append(preprocessing(pre, str(d)))

for df in data:
    utterances += get_utteranes(df, utterances)

In [None]:
len(utterances)

In [None]:
shuffle(utterances)

In [None]:
with open("data.pickle", "bw") as f:
    pickle.dump(utterances, f)

In [None]:
with open("data.pickle", "br") as f:
    utterances = pickle.load(f)

### Glossary

In [None]:
british_gloss = read_csv("british-english.csv", header=None)
usa_gloss = read_csv("american-english.csv", header=None)

In [None]:
british_gloss.columns = ["a"]
usa_gloss.columns = ["a"]
words = list(british_gloss.a.unique()) + list(",./';[]<>?:{}!@#$%^&*()_+-=") + list(usa_gloss.a.unique())
words = list(set(words))

### Utterances Filter

In [None]:
def get_valid_utt(data, gloss):
    valid_utt = []
    un_valid = []
    for utt in data:
        oov = False
        try:
            tknzr1 = word_tokenize(utt)
            for token in tknzr1:
                if token.lower() not in gloss:
                    oov = True
                    break
            if not oov:
                valid_utt.append(utt)
            else:
                un_valid.append(utt)
        except:
            print(utt)
#     return valid_utt, un_valid
    return valid_utt
            

### Results

In [None]:
ddd = [utterances[i * 50: (i + 1) * 50] for i in range(2 * len(utterances))]

In [None]:
start = time.time()
validd = Parallel(n_jobs=-1)(delayed(get_valid_utt)(utt, words) for utt in ddd)
stop = time.time()
print(stop - start)

In [None]:
valid = []
for el in validd:
    valid += el

In [None]:
un_valid

### Sava

In [None]:
DataFrame(valid).to_csv("valid_utterances.csv", index_label=False)