# Text Preprocessing

In [42]:
#############
# Import libs
#############

import numpy as np
import re
from matplotlib import pyplot as plt
from collections import Counter
from nltk.tokenize import casual_tokenize

In [67]:
###########
# Settings & constants
###########

en_text_sample = "data/test.txt"
ru_text_sample = "data/rutest.txt"

In [74]:
"""Preprocess routine from text"""

# Preprocess:
# read -> decode - > decapitalize -> strip service symbols
# tokenize (word, char)
# create vocab


def read_text(filepath):
    
    with open(filepath, 'rb') as fp:
        lines = []
        for line in fp:
            line = line.strip().lower().decode("ascii", "ignore")
            if len(line) == 0:
                continue
            lines.append(line)
    text = " ".join(lines)
    return text


def get_tokens(text):    
    return set(text.split())


def get_dicts(words):
    word2index = {w: i for i, w in enumerate(words)}
    index2word = {i: w for i, w in enumerate(words)}    
    return word2index, index2word
    

def process_tokens(text):
    return Counter(text.split())
    

def preprocess(filepath):
    text = read_text(filepath)
    tokens = get_tokens(text)
    word2index, index2word = get_dicts(tokens)
    countered_tokens = process_tokens(text)
    return word2index, index2word, countered_tokens


def n_gram_tokenize(text):
    pass


def clear_punctuation(text):
    pass




In [75]:
text = read_text(en_text_sample) # Deploy text

In [76]:
word2index, index2word, bag_of_words = preprocess(en_text_sample)

In [78]:
{k:index2word[k] for k in np.arange(1,10) if k in index2word}

{1: 'waited',
 2: 'dinah.',
 3: 'hurting',
 4: 'ears',
 5: "cry!'",
 6: 'man',
 7: 'sadly,',
 8: 'understand',
 9: 'cake?'}

In [24]:
def one_hot_vectorizer(sequence):
    
    """One-hot vectorizer"""
    
    token_sequence = str.split(sequence)
    vocab = sorted(set(token_sequence))
    vocab_size = len(vocab)
    num_tokens = len(token_sequence)
    onehot_vectors = np.zeros((num_tokens, vocab_size))
    
    for i, word in enumerate(token_sequence):
        onehot_vectors[i, vocab.index(word)] = 1
    
    print(f"Perform one-hot vectorize...\n processed tokens: {num_tokens}\n vocabulary size: {vocab_size}")
    return onehot_vectors

In [33]:
wv = one_hot_vectorizer(text)
wv

Perform one-hot vectorize...
 processed tokens: 7690
 vocabulary size: 1454


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [89]:
simple_tokens = get_tokens(text)
nltk_tokens = casual_tokenize(text)

print(f"Simple tokens:\n {list(simple_tokens)[-10:]}\n\nnltk_tokens:\n {nltk_tokens[-10:]}")

Simple tokens:
 ["'they're", 'hands', 'finish', 'site', 'father', 'schoolbook,', 'than', "any,'", "understand,'", "know?'"]

nltk_tokens:
 ['alice', '.', 'and', 'she', 'got', 'up', 'and', 'ran', 'home', '.']


### Regex phrase processors :

In [None]:
def preprocess_string(string):
    res = string.lower().decode("ascii", "ignore") 
    return res


def re_greet_detector(string):
    r = r"[^a-z]*([y]o|[h']?ello|ok|hey|(good[ ])?(morn[gin']{0,3}}"\
             r"afternoon|even[gin']{0,3}))[\s,;:]{1,3}([a-z]{1,20})"
    res = re.compile(r, flags=re.IGNORECASE)
    res = res.match(string)
    return res