In [None]:
import os
import pickle
import time
import numpy as np
import itertools
from pandas import read_csv, DataFrame, read_table
from random import shuffle
from nltk.tokenize import TweetTokenizer, word_tokenize
from numba import autojit, prange
from joblib import Parallel, delayed
from nltk.tokenize import ToktokTokenizer
from collections import Counter
from tqdm import tqdm

### Generator Tools

In [None]:
def count_lenght_of_generator(generator):
    """Print how many items ``generator`` yields and hand them back as an iterator.

    Counting consumes the input, so the items are buffered and returned through
    a new iterator that the caller can use in place of the original.

    Args:
        generator: Any iterable; it is fully consumed by this call.

    Returns:
        An iterator yielding the same items in the same order.
    """
    # Buffer in a list. Wrapping one itertools.chain per element would build a
    # chain nested as deep as the input is long, which is slow to iterate and
    # can overflow the stack on large inputs.
    items = list(generator)
    print(len(items))

    return iter(items)

In [None]:
def count_unique_values_in_generator(generator):
    """Placeholder, not implemented yet: ignores ``generator`` and returns ``None``."""
    return None

### Files

In [None]:
# Root folder of the raw corpora read by this notebook, and two sub-folders of it.
CSV_PATH = "../Corpora/TO_CREATE_VALID_UTTERANCE"
EMOTIONS_PATH = CSV_PATH + "/emotions"
GOOGLE_NEWS_PATH = CSV_PATH + "/GOOGLE_1_billion_word/heldout"
# Lazily chained sequence of loaded DataFrames; the TXT loading cells append to it.
df_gen = iter([])
# Running file counter; the column of the i-th loaded frame is named str(i).
i = 0

#### TXT

In [None]:
# Emotions: read every file in EMOTIONS_PATH as a header-less table, name its
# column after the running counter `i`, and append the frame to df_gen.
# NOTE(review): re-running this cell appends the same files again and keeps
# advancing `i`; os.listdir also returns entries in arbitrary order.
for filename in os.listdir(EMOTIONS_PATH):
    df = read_table(os.path.join(EMOTIONS_PATH, filename), header=None)
    df.columns = [str(i)]
    df_gen = itertools.chain(df_gen, [df])
    i += 1


In [None]:
# Google News: same loading scheme as the emotions files, continuing the shared
# counter `i` so every frame gets a distinct column name. Prints progress per file.
# NOTE(review): re-running this cell appends the same files again.
for filename in os.listdir(GOOGLE_NEWS_PATH):
    print(i, filename)
    df = read_table(os.path.join(GOOGLE_NEWS_PATH, filename), header=None)
    df.columns = [str(i)]
    df_gen = itertools.chain(df_gen, [df])
    i += 1
# Disabled cap on the number of files loaded (handy for quick experiments):
#     if i > 40:
#         break

#### CSV

In [None]:
# Load the CSV corpora from CSV_PATH; the two movie files use "+" as the field delimiter.
utterance_flights = read_csv(os.path.join(CSV_PATH, "Utterance-Flights-f1197494.csv"))
sentence_pairs = read_csv(os.path.join(CSV_PATH, "1377882923_sentence_pairs.csv"))
headlines = read_csv(os.path.join(CSV_PATH, "examiner-date-text.csv"))
movie_lines = read_csv(os.path.join(CSV_PATH, "movie_lines.csv"), delimiter="+")
movie_quoats = read_csv(os.path.join(CSV_PATH, "moviequotes.scripts.csv"), delimiter="+")


In [None]:
# Keep only the listed text columns of each CSV corpus.
# NOTE(review): "They do not!" and "Ladies and gentlemen" look like first data
# rows that read_csv took as the header of header-less files — confirm, because
# that first row would then be missing from the data.
# NOTE(review): none of the chosen_* frames is referenced by a later cell shown here.
chosen_utt = DataFrame(utterance_flights, columns=["response_1", "response_2", "response_3", "scenario"])
chosen_sent = DataFrame(sentence_pairs, columns=["sentenceA", "sentenceB"])
chosen_headlines = DataFrame(headlines, columns=["headline_text"])
chosen_lines = DataFrame(movie_lines, columns=["They do not!"])
chosen_quoats = DataFrame(movie_quoats, columns=["Ladies and gentlemen"])


### Preprocessing

In [None]:
def preprocessing(df, column):
    """Clean a frame of raw text cells into sentence-like utterances.

    Steps: drop rows containing missing values, strip leading whitespace from
    every cell, keep rows whose ``column`` text is longer than 10 characters,
    then capitalise the first character of every cell and append a full stop
    when the cell does not already end in ``.``, ``?`` or ``!``.

    Args:
        df: DataFrame whose cells are strings (or missing values).
        column: Name of the column the length filter is applied to.

    Returns:
        A new, cleaned DataFrame; ``df`` itself is not modified.
    """
    def _as_sentence(text):
        # Cells outside `column` are not length-filtered and may be empty;
        # indexing text[0] on them would raise IndexError.
        if not text:
            return text
        text = text[0].title() + text[1:]
        return text if text[-1] in ".?!" else text + "."

    # DataFrame.map replaces the deprecated applymap in newer pandas; fall back
    # to applymap on versions that do not have it yet.
    elementwise = getattr(DataFrame, "map", None) or DataFrame.applymap

    df = df.dropna()
    # Strip before measuring, so leading whitespace does not count towards the
    # length and whitespace-only cells are filtered out instead of crashing.
    df = elementwise(df, lambda x: x.lstrip())
    df = df[df[column].str.len() > 10]
    return elementwise(df, _as_sentence)


In [None]:
def get_utteranes(df, utterances):
    """Extend an utterance stream with the distinct values of every column of ``df``.

    Args:
        df: DataFrame whose columns hold utterances.
        utterances: Iterable of utterances collected so far.

    Returns:
        An iterable yielding ``utterances`` first, then each column's unique
        values in column order.
    """
    merged = utterances
    for name in df.columns:
        distinct_values = list(df[name].unique())
        merged = itertools.chain(merged, distinct_values)
    return merged


In [None]:
# Start from an empty utterance stream, then clean every loaded frame in parallel.
# enumerate() regenerates the column name str(i) that the loading cells gave to
# the i-th frame; this consumes df_gen.
utterances = iter([])
data = Parallel(n_jobs=-1)(delayed(preprocessing)(df, str(i)) for i, df in enumerate(df_gen))

In [None]:
# Fold the unique values of every cleaned frame into the utterance stream.
# get_utteranes already returns the incoming stream followed by the new values,
# so its result is used directly; chaining `utterances` in front of it again
# only added a second, already-exhausted copy of the same iterator.
for df in data:
    utterances = get_utteranes(df, utterances)

In [None]:
# Print the number of utterances. Counting consumes the stream, so the name is
# rebound to the iterator that count_lenght_of_generator hands back.
utterances = count_lenght_of_generator(utterances)
# Disabled experiments: de-duplicate and shuffle the utterances.
# unique_utterance = list(set(utterances))
# len(unique_utterance)
# shuffle(utterances)

In [None]:
# Split the stream into two independent iterators over the same utterances.
utterances, utterances1 = itertools.tee(utterances)

In [None]:
# Materialise the stream before saving: a plain list pickles on every Python
# version (pickling itertools iterators is deprecated since Python 3.12), and
# rebinding `utterances` to the list keeps it iterable by later cells.
utterances = list(utterances)
with open("data.pickle", "bw") as f:
    pickle.dump(utterances, f)

In [None]:
# Reload the utterances saved by the previous cell.
# pickle.load can execute arbitrary code: only open a data.pickle that this
# notebook wrote itself.
with open("data.pickle", "br") as f:
    utterances = pickle.load(f)

### Glossary

#### Problem with proper names: they can be arbitrary, but I can use NER to replace them with xxxxx and then ignore them during correction

In [None]:
# Load the British and American word lists (header-less CSV files in the
# working directory) and name their column "word".
british_gloss = read_csv("british-english.csv", header=None)
british_gloss.columns = ["word"]
usa_gloss = read_csv("american-english.csv", header=None)
usa_gloss.columns = ["word"]

In [None]:
# Known-token list: both word lists, the individual punctuation characters of
# the literal below, and the corpus vocabulary, with duplicates removed.
# NOTE(review): `vocab` is first assigned in a later cell and has to be a list
# for the `+` to work, so this cell fails on a fresh top-to-bottom run.
words = list(british_gloss.word.unique()) + list(",./';[]<>?:{}!@#$%^&*()_+-=") + list(usa_gloss.word.unique()) + vocab
words = list(set(words))

In [None]:
# TODO: write a routine that drops words which are not repeated in the stream;
# it should be much more efficient than a Counter.
def get_vocabulary(utterances):
    """Tokenise every utterance and return one flat iterator over all tokens.

    Each utterance is lower-cased and tokenised with ``ToktokTokenizer`` in
    parallel worker processes. Two wall-clock durations (seconds) are printed:
    first the tokenisation, then the flattening step.

    Args:
        utterances: Iterable of strings.

    Returns:
        Iterator over the tokens of all utterances, duplicates included.
    """
    toktok = ToktokTokenizer()

    start = time.time()
    voc = Parallel(n_jobs=-1)(delayed(toktok.tokenize)(utt.lower()) for utt in utterances)
    stop = time.time()
    print((stop - start))

    start = time.time()
    # A single flat chain over the per-utterance token lists. Wrapping a new
    # itertools.chain around the previous one for every utterance nests the
    # iterators as deep as the corpus is long and can overflow the stack.
    vocab = itertools.chain.from_iterable(voc)
    stop = time.time()
    print((stop - start))

    return vocab

# Disabled alternative: split the tokens by frequency and return both groups.
#     counter = Counter(vocab)
#     vocab = (token for token in counter if counter[token] > 1)
#     rare_vocab = (token for token in counter if counter[token] == 1)

#     return vocab, rare_vocab
    

In [None]:
from functools import reduce  # no longer used by this cell; kept in case other cells rely on it
# Collect the distinct corpus tokens chunk by chunk (tqdm shows progress).
# A single growing set replaces reduce(): without an initial value reduce()
# returns a lone chunk untouched (duplicates kept), and rebuilding the whole
# set for every chunk was quadratic in the vocabulary size.
# NOTE(review): get_chunks is defined in a later cell of this notebook.
vocab = get_vocabulary(utterances)
chunks = get_chunks(vocab, 10000)
unique_tokens = set()
for chunk in tqdm(chunks):
    unique_tokens.update(chunk)
q = iter(unique_tokens)

In [None]:
# Time the vocabulary build and split the tokens by frequency.
# get_vocabulary returns one flat token iterator, so unpacking its result into
# two names raised ValueError; the split is done here with a Counter instead:
# `vocab` holds tokens seen more than once, `rare_words` those seen exactly once.
start = time.time()
token_counts = Counter(get_vocabulary(utterances))
vocab = [token for token, count in token_counts.items() if count > 1]
rare_words = [token for token, count in token_counts.items() if count == 1]
stop = time.time()
print((stop - start))

In [None]:
# Print the number of items in `vocab` and rebind it to the returned iterator; %time reports the cost.
%time vocab = count_lenght_of_generator(vocab)

In [None]:
# Materialise the token iterator `q` into a list.
vocab = list(q)

In [None]:
# Display the size of the combined word list.
len(words)

### Utterances Filter

In [None]:
def get_valid_utt(chunk, vocab, tokenizer=None):
    """Split a chunk of utterances into in-vocabulary ones and the rest.

    An utterance is valid when every one of its tokens, lower-cased, is found
    in ``vocab``. Utterances that cannot be tokenised (for example non-string
    values) are printed and left out of both results.

    Args:
        chunk: Iterable of utterances.
        vocab: Collection of known tokens. Anything that is not already a set
            is copied into one so each lookup is constant time; pass a set to
            avoid the copy when calling this repeatedly.
        tokenizer: Optional object with a ``tokenize(text)`` method. Defaults
            to a new ``ToktokTokenizer``.

    Returns:
        Tuple ``(valid_utt, un_valid)`` of iterators, each in input order.
    """
    known = vocab if isinstance(vocab, (set, frozenset)) else set(vocab)
    if tokenizer is None:
        tokenizer = ToktokTokenizer()

    valid_utt, un_valid = [], []
    for utt in chunk:
        try:
            # all() stops at the first out-of-vocabulary token.
            in_vocab = all(token.lower() in known for token in tokenizer.tokenize(utt))
        except Exception:
            # Best effort: report the offending utterance and carry on, without
            # also swallowing KeyboardInterrupt / SystemExit.
            print(utt)
            continue
        (valid_utt if in_vocab else un_valid).append(utt)
    return (iter(valid_utt), iter(un_valid))
            

### Results

In [None]:
def get_chunks(generator, chunk_size):
    """Yield the items of ``generator`` in consecutive chunks of ``chunk_size``.

    Every chunk is an iterator over at most ``chunk_size`` items; only the last
    one may be shorter. An empty input yields a single empty chunk.

    Args:
        generator: Any iterable.
        chunk_size: Maximum number of items per chunk; must be at least 1.

    Yields:
        Iterators over successive slices of the input.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 (raised when the first
            chunk is requested).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    source = iter(generator)
    # islice fills each chunk in one go; chaining the items one at a time would
    # nest chunk_size iterators inside each other.
    chunk = list(itertools.islice(source, chunk_size))
    while True:
        yield iter(chunk)
        chunk = list(itertools.islice(source, chunk_size))
        if not chunk:
            return
        

In [None]:
# Cut the utterance stream into chunks of 100 for parallel filtering.
chunks = get_chunks(utterances, 100)

In [None]:
# Filter every chunk against the word list in parallel and print the elapsed
# seconds. Each result is the (valid, invalid) pair returned by get_valid_utt.
start = time.time()
pair = Parallel(n_jobs=-1)(delayed(get_valid_utt)(chunk, words) for chunk in chunks)
stop = time.time()
print((stop - start))

In [None]:
# Duplicate the results so `pair1` can be consumed while `pair` stays available.
pair, pair1 = itertools.tee(pair)

In [None]:
# Transpose the (valid, invalid) pairs: keep the per-chunk valid iterators and
# discard the invalid ones. Raises ValueError when `pair1` is empty.
validd, _ = zip(*pair1)

In [None]:
# Flatten the per-chunk iterators of valid utterances into one stream.
# chain.from_iterable keeps a single level of chaining; wrapping a new chain
# around the previous one for every chunk nests them as deep as there are
# chunks, which can overflow the stack on a large corpus.
valid = itertools.chain.from_iterable(validd)

In [None]:
# Print the number of valid utterances and rebind `valid` to the returned iterator.
valid = count_lenght_of_generator(valid)

### Save

In [None]:
# Write the utterances to CSV, one per row.
# NOTE(review): this saves `utterances`, not the filtered `valid` stream, even
# though the file is named valid_utterances_all.csv — confirm which was intended.
DataFrame(list(utterances)).to_csv("valid_utterances_all.csv", index_label=False)

In [None]:
# Sort out the words from Google News: drop those that occur only once
# How can generators be loaded into a DataFrame efficiently?

In [None]:
import pandas as pd

In [None]:
# NOTE(review): utterances1 is the second tee of the utterance stream, while
# pd.concat expects Series/DataFrame objects — this looks like an unfinished
# experiment; confirm before relying on it.
pd.concat(utterances1)

In [None]:
## TODO
