# Text Preprocessing

In [90]:
#############
# Import libs
#############

import numpy as np
import re
from matplotlib import pyplot as plt
from collections import Counter
import nltk
from nltk.tokenize import casual_tokenize, RegexpTokenizer, TreebankWordTokenizer
from nltk.util import ngrams
import pandas as pd
import math

In [18]:
###########
# Settings & constants
###########

en_text_sample = "data/test.txt"
ru_text_sample = "data/rutest.txt"

In [91]:
"""Preprocess routine from text"""

# Preprocessing steps:
# read -> decode -> lowercase -> strip service symbols
# tokenize (word, char)
# create vocab


def read_text(filepath, encoding="ascii", errors="ignore"):
    """Read a text file and return its content as one lowercased string.

    Every line is stripped of surrounding whitespace, decoded and lowercased.
    Lines that end up empty are skipped; the remaining lines are joined with
    single spaces.

    Args:
        filepath: Path of the file to read.
        encoding: Codec used to decode each line. With the default
            ``"ascii"`` (and ``errors="ignore"``) every non-ASCII character
            is silently dropped, which erases e.g. Cyrillic text entirely;
            pass ``"utf-8"`` for such files. The codec must be
            ASCII-compatible because lines are split and stripped as bytes.
        errors: Error handler forwarded to ``bytes.decode``.

    Returns:
        str: The preprocessed text on a single line.
    """
    lines = []
    with open(filepath, 'rb') as fp:
        for raw_line in fp:
            # Strip at byte level, then decode *before* lowercasing so that
            # lower() is Unicode-aware (bytes.lower() only touches ASCII).
            line = raw_line.strip().decode(encoding, errors).lower()
            if line:
                lines.append(line)
    return " ".join(lines)


def get_tokens(text):
    """Split text into raw tokens on runs of separator characters.

    A separator run is one or more hyphens, whitespace characters, or any of
    ``. , : ! ?``. Separators at the start or end of the text produce
    empty-string tokens, so the result usually needs filtering.
    """
    return re.split(r'[-\s.,:!?]+', text)


def filter_tokens(tokens):
    """Drop empty tokens and tokens made up only of punctuation/whitespace.

    A token is removed when every character in it belongs to the set
    ``- \\t \\n . , ; ! ? '``. Tokens that contain at least one other
    character are returned unchanged (they are not trimmed).

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    punctuation = '-\t\n.,;!?\''
    # str.strip(chars) leaves an empty string exactly when the token is empty
    # or consists solely of `chars`. The earlier `tok not in punctuation`
    # check was a substring test and let runs such as "!!" or "''" through.
    return [tok for tok in tokens if tok.strip(punctuation)]



def get_dicts(words):
    """Build vocabulary lookup tables from a sequence of words.

    Duplicates are removed (first occurrence wins), so indices are contiguous
    and the two dictionaries are exact inverses of each other.

    Args:
        words: Iterable of hashable tokens, possibly with repeats.

    Returns:
        tuple[dict, dict]: ``(word2index, index2word)``.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    vocab = dict.fromkeys(words)
    word2index = {word: idx for idx, word in enumerate(vocab)}
    index2word = {idx: word for word, idx in word2index.items()}
    return word2index, index2word
    

def process_tokens(text):
    """Count how often each whitespace-delimited word occurs in text.

    Returns a ``Counter`` keyed by word.
    """
    word_counts = Counter()
    for word in text.split():
        word_counts[word] += 1
    return word_counts
    

def preprocess(filepath):
    """Run the full preprocessing pipeline on a text file.

    Steps: read the file, tokenize, filter out empty/punctuation tokens,
    build the vocabulary lookups and count token occurrences.

    Args:
        filepath: Path of the text file to process.

    Returns:
        tuple: ``(word2index, index2word, countered_tokens)`` where
        ``countered_tokens`` is a ``Counter`` of the filtered tokens.
    """
    text = read_text(filepath)
    tokens = filter_tokens(get_tokens(text))
    word2index, index2word = get_dicts(tokens)
    # Count the same filtered tokens that feed the vocabulary. Counting the
    # raw whitespace split instead left punctuation attached ("alice."), so
    # count keys did not line up with word2index keys.
    countered_tokens = get_bow(tokens)
    return word2index, index2word, countered_tokens


def get_bow(tokens, most_common=0):
    """Build a bag of words from tokens.

    With a falsy ``most_common`` the full ``Counter`` is returned; otherwise
    the result is the list of ``(token, count)`` pairs produced by
    ``Counter.most_common(most_common)``.
    """
    counts = Counter(tokens)
    return counts.most_common(most_common) if most_common else counts


def calc_cos_sim(vec1, vec2):
    """Calculate the cosine similarity of two vectors.

    Args:
        vec1, vec2: Either key -> value containers exposing ``.items()``
            (``dict``, ``Counter``, ``pd.Series``) or plain sequences of
            numbers. Keyed inputs are aligned by key, and a key missing from
            one vector counts as 0 there; sequences are aligned by position.

    Returns:
        float: ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when
        either vector has zero magnitude (including empty input).
    """

    def _as_dict(vec):
        # Mappings and pd.Series provide .items(); anything else is treated
        # as a positional sequence.
        pairs = vec.items() if hasattr(vec, "items") else enumerate(vec)
        return dict(pairs)

    v1, v2 = _as_dict(vec1), _as_dict(vec2)

    # Only keys present in both vectors contribute to the dot product.
    dot_prod = sum(val * v2[key] for key, val in v1.items() if key in v2)

    mag_1 = math.sqrt(sum(x ** 2 for x in v1.values()))
    mag_2 = math.sqrt(sum(x ** 2 for x in v2.values()))

    # The cosine is undefined for a zero vector; report "no similarity"
    # rather than raising ZeroDivisionError.
    if mag_1 == 0 or mag_2 == 0:
        return 0.0

    return dot_prod / (mag_1 * mag_2)

In [39]:
# Load both sample files as single lowercased strings.
# NOTE: read_text decodes as ASCII and ignores undecodable bytes, so any
# non-ASCII characters (presumably most of the Russian sample) are dropped.
text = read_text(en_text_sample) # English sample
ru_text = read_text(ru_text_sample)

In [41]:
# Build the vocabulary lookups and the token counts for the English sample.
word2index, index2word, bag_of_words = preprocess(en_text_sample)

In [42]:
# Peek at vocabulary entries 1-9. Built-in range keeps the dict keys plain
# ints; np.arange yields NumPy scalars, which NumPy >= 2 displays as
# np.int64(1) instead of 1.
{k: index2word[k] for k in range(1, 10) if k in index2word}

{1: 'one',
 2: 'down',
 3: 'the',
 4: 'rabbit',
 5: 'hole',
 6: 'alice',
 7: 'and',
 8: 'her',
 9: 'big'}

In [43]:
def one_hot_vectorizer(sequence):
    """One-hot encode the whitespace-separated tokens of a string.

    Args:
        sequence: Text to encode; it is split on whitespace.

    Returns:
        np.ndarray: Float array of shape ``(num_tokens, vocab_size)``. Row
        ``i`` has a single 1 in the column of token ``i``; columns follow the
        sorted distinct tokens.

    Also prints the number of tokens processed and the vocabulary size.
    """
    token_sequence = sequence.split()
    vocab = sorted(set(token_sequence))
    vocab_size = len(vocab)
    num_tokens = len(token_sequence)
    onehot_vectors = np.zeros((num_tokens, vocab_size))

    # A dict lookup is O(1) per token; list.index() rescanned the whole
    # vocabulary for every token (O(num_tokens * vocab_size) overall).
    column_of = {word: col for col, word in enumerate(vocab)}
    for row, word in enumerate(token_sequence):
        onehot_vectors[row, column_of[word]] = 1

    print(f"Perform one-hot vectorize...\n processed tokens: {num_tokens}\n vocabulary size: {vocab_size}")
    return onehot_vectors

In [44]:
# One-hot encode the whitespace-split English text: one row per token,
# one column per distinct token.
wv = one_hot_vectorizer(text)
wv

Perform one-hot vectorize...
 processed tokens: 7690
 vocabulary size: 1454


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [67]:
# Tokenize the same text with the hand-written splitter and three NLTK tokenizers.
custom_tokens = get_tokens(text)
filtered_tokens = filter_tokens(custom_tokens)

nltk_tokens = casual_tokenize(text)
# NOTE(review): `$` is unescaped in this pattern, so it is the regex
# end-of-string anchor rather than a literal dollar sign; the second
# alternative probably never matches. Confirm whether `\$` was intended.
regexp_tokens = RegexpTokenizer(r'\w+|$[0-9.]+\S+').tokenize(text)
treebank_tokens = TreebankWordTokenizer().tokenize(text)

# Report each tokenizer's token count and its last 10 tokens.
print(f"Custom tokens {len(custom_tokens)} -> {len(filtered_tokens)}\n{list(filtered_tokens)[-10:]}\n")
print(f"NLTK casual tokens {len(nltk_tokens)}:\n{nltk_tokens[-10:]}\n")
print(f"NLTK regexp tokens {len(regexp_tokens)}:\n{regexp_tokens[-10:]}\n")
print(f"NLTK Treebank tokens {len(treebank_tokens)}:\n{treebank_tokens[-10:]}")

Custom tokens 8126 -> 7677
['tea', 'cried', 'alice', 'and', 'she', 'got', 'up', 'and', 'ran', 'home']

NLTK casual tokens 10237:
['alice', '.', 'and', 'she', 'got', 'up', 'and', 'ran', 'home', '.']

NLTK regexp tokens 8005:
['tea', 'cried', 'alice', 'and', 'she', 'got', 'up', 'and', 'ran', 'home']

NLTK Treebank tokens 9331:
['cried', 'alice.', 'and', 'she', 'got', 'up', 'and', 'ran', 'home', '.']


### Regex phrase processors

In [None]:
def preprocess_string(string):
    """Lowercase a string and drop every non-ASCII character.

    Args:
        string: ``str`` or ``bytes``/``bytearray`` input.

    Returns:
        str: The lowercased, ASCII-only text.
    """
    if isinstance(string, (bytes, bytearray)):
        # Raw bytes: lowercase the ASCII letters, then drop undecodable bytes.
        return bytes(string).lower().decode("ascii", "ignore")
    # Python 3 str has no .decode(); round-trip through ASCII to discard
    # non-ASCII characters, then lowercase what is left.
    return string.encode("ascii", "ignore").decode("ascii").lower()


def re_greet_detector(string):
    """Detect a greeting followed by a name at the start of a string.

    Recognized greetings (case-insensitive): "yo", "hello"/"ello"/"'ello",
    "ok", "hey", and "morning"/"afternoon"/"evening" variants with an
    optional leading "good ". The greeting must be followed by one to three
    separator characters (whitespace, ``,``, ``;``, ``:``) and a word of
    1-20 letters.

    Args:
        string: Text to inspect.

    Returns:
        re.Match | None: The match object (group 1 is the greeting, the last
        group is the word after it), or ``None`` when nothing matches.
    """
    # The morn/afternoon/even alternatives must be separated by `|`; a stray
    # `}` in that position previously made "good morning" and
    # "good afternoon" impossible to match.
    pattern = (
        r"[^a-z]*([y]o|[h']?ello|ok|hey|(good[ ])?(morn[gin']{0,3}|"
        r"afternoon|even[gin']{0,3}))[\s,;:]{1,3}([a-z]{1,20})"
    )
    return re.match(pattern, string, flags=re.IGNORECASE)

### Stop words

In [76]:
# Load two English stop-word lists for comparison: scikit-learn's built-in
# set and the list shipped with NLTK's `stopwords` corpus.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
nltk.download('stopwords')  # fetch the corpus read on the next line
nltk_stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nickel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [81]:
# Compare the sizes of the scikit-learn and NLTK stop-word lists.
len(sklearn_stop_words), len(nltk_stop_words)

(318, 179)

### Lemmatization

In [82]:
# Download the WordNet corpus, then import NLTK's WordNet-based lemmatizer.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nickel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [89]:
# Lemmatize "better" as an adjective; the recorded output is 'good'.
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('better', pos="a")  # a - adjective

'good'