# Text preprocessing toolkit

#### Scripts:
1. Twitter data preprocessing: hashtags, mentions gathering and removal
2. Get and remove URL from text
3. HTML decoding
4. UTF-8 BOM (Byte Order Mark)
5. Remove all special characters from string
6. Lowercase all string characters
7. Remove string inner spaces (any number of extra spaces)
8. Remove stop words (NLTK)
9. Tokenize words (NLTK)
10. Stem words (Porter, Lancaster) (NLTK)
11. Lemmatizing words (NLTK WordNetLemmatizer)
12. Vectorize text with Bag of Words (sklearn)

#### Added:
13. Spacy tokens - Words, lemmas, POS, TAG, DEP, heads, puncts, spaces, indexes, shapes (Spacy)
14. Dependency parsing & visualisation (Spacy)
15. NER Entities on display

#### Soon:
16. Smaller text parts: sentences, chunks, words and characters
17. Bigger text parts: context
18. Sentiment Analysis models
19. More on bag of words and n-grams
20. Text classifications scripts
21. Word distances
22. Chunkbuilding

#### Looking to add more on:
- Entity recognition
- Sentence recognition
- Word-to-vector transformations
- More methods for cleaning and normalising text
- More on classifying / categorizing / organizing records
- Clustering records
- Extracting topics
- Keyword / key phrase extraction
- Duplicate and near-duplicate detection
- Semantic search
- Extracting acronyms and their definitions
- Extracting key entities like people, company, product, location, dates, etc.
- The original web pages which provided the content
- Inner text entities IDs
- 

(To be re-organised, enriched with examples & developed further.)

In [1]:
# 1. Twitter data preprocessing: hashtags and mentions 
# --------------------------
# 1. get hashtags list
# 2. get mentions list
# 3. remove hashtags from text
# 4. remove mentions from text
# 5. remove hashtags and mentions from text
# 6. get hashtags and mentions list and remove them from text (Depending on previous funcs)
# 7. self-contained alternative to point 6 (single pass, does not call the functions above)

def get_mentions(text):
    """Collect the mentions found in *text*.

    A mention is any single-space-separated word that starts with '@'.
    The matches are returned as one ', '-separated string ('' when none).
    """
    mentions = filter(lambda word: word.startswith('@'), text.split(' '))
    return ', '.join(mentions)
def get_hashtags(text):
    """Collect the hashtags found in *text*.

    A hashtag is any single-space-separated word that starts with '#'.
    The matches are returned as one ', '-separated string ('' when none).
    """
    hashtags = []
    for word in text.split(' '):
        if word.startswith('#'):
            hashtags.append(word)
    return ', '.join(hashtags)
def remove_mentions(text):
    """Return *text* without the words that start with '@'.

    Words are delimited by single spaces and the survivors are re-joined
    with single spaces.
    """
    kept = []
    for word in text.split(' '):
        if word.startswith('@'):
            continue
        kept.append(word)
    return ' '.join(kept)
def remove_hashtags(text):
    """Return *text* without the words that start with '#'.

    Words are delimited by single spaces and the survivors are re-joined
    with single spaces.
    """
    words = text.split(' ')
    plain_words = (word for word in words if word[:1] != '#')
    return ' '.join(plain_words)
def remove_hashtags_and_mentions(text):
    """Return *text* without the words that start with '#' or '@'.

    Words are delimited by single spaces and the survivors are re-joined
    with single spaces.
    """
    kept = []
    for word in text.split(' '):
        # str.startswith accepts a tuple of alternative prefixes.
        if not word.startswith(('#', '@')):
            kept.append(word)
    return ' '.join(kept)

def get_and_remove_hashtags_and_mentions_from_text_DEP(text):
    """Return (hashtags, mentions, cleaned text) for *text*.

    Delegates to get_hashtags, get_mentions and remove_hashtags_and_mentions,
    so the text is scanned three times.
    """
    hashtags = get_hashtags(text)
    mentions = get_mentions(text)
    cleaned = remove_hashtags_and_mentions(text)
    return hashtags, mentions, cleaned

def get_and_remove_hashtags_and_mentions_from_text_AUTO(text):
    """Return (hashtags, mentions, cleaned text) for *text* in a single pass.

    Words are delimited by single spaces.  Hashtags and mentions are returned
    as ', '-separated strings; the remaining words are re-joined with spaces.
    """
    # Route every word by its first character; anything else is plain text.
    tagged = {'#': [], '@': []}
    plain = []
    for word in text.split(' '):
        tagged.get(word[:1], plain).append(word)
    return ', '.join(tagged['#']), ', '.join(tagged['@']), ' '.join(plain)

In [2]:
# 2. Get and remove URL from text
# ------------------------------
# a. get url as a string
# b. get text without url
# c. get url and text without url

import re

def get_url(text):
    """Return the first http(s) URL found in *text*, or '' when there is none.

    The URL extends from 'http://' or 'https://' up to the next whitespace
    character.  Input that is not a string (e.g. None) also yields ''.
    """
    try:
        # Raw string: '\S' in a plain literal is an invalid escape sequence.
        match = re.search(r"(?P<url>https?://\S+)", text)
    except TypeError:
        # Keep the original best-effort contract for non-string input.
        return ''
    return match.group("url") if match else ''
def remove_url(text):
    """Return *text* with every http(s) URL removed.

    Uses the same URL definition as get_url: from 'http://' or 'https://' up
    to the next whitespace character.  The previous character class
    ([A-Za-z0-9./]) stopped at the first '-', '_', '?', '=', '&', '%' ... and
    left the tail of such URLs in the text.  The whitespace around a removed
    URL is left untouched.
    """
    return re.sub(r'https?://\S+', '', text)

def get_and_remove_url_from_text(text):
    """Return (url, text_without_url) for *text*.

    The first element is whatever get_url returns for *text* and the second
    is the result of remove_url.
    """
    url = get_url(text)
    remaining = remove_url(text)
    return url, remaining

In [None]:
# 3. HTML decoding
# ----------------

from bs4 import BeautifulSoup

# HTML decoding (both functions work the same way; they differ only in the parser passed to BeautifulSoup)
def html_strip_lxml(text):
    """Parse *text* with BeautifulSoup using the 'lxml' parser and return its text content."""
    soup = BeautifulSoup(text, 'lxml')
    return soup.get_text()
def html_strip_praser(text):
    """Parse *text* with BeautifulSoup using "html.parser" and return its text content."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [None]:
# 4. UTF-8 BOM (Byte Order Mark)
# ------------------------------

def get_BOM_in_order(text):
    """Strip a UTF-8 byte order mark from *text* and neutralise U+FFFD.

    * bytes / bytearray: decoded as 'utf-8-sig' (which drops a leading BOM).
      If the data is not valid UTF-8 it is returned unchanged, as before.
    * str: a leading U+FEFF is removed.  (On Python 3 the original called
      str.decode, which does not exist, so strings were silently returned
      untouched.)
    * anything else is returned unchanged.

    In the resulting string every U+FFFD replacement character becomes '?'.
    """
    if isinstance(text, (bytes, bytearray)):
        try:
            decoded = text.decode("utf-8-sig")
        except UnicodeDecodeError:
            # Best effort: hand undecodable input back to the caller as-is.
            return text
    elif isinstance(text, str):
        decoded = text[1:] if text.startswith("\ufeff") else text
    else:
        return text
    return decoded.replace("\ufffd", "?")

In [15]:
# 5. Remove all special characters from string
# --------------------------------------------

import re

def remove_special_characters(text):
    """Replace every character that is not an ASCII letter with a space.

    Digits, punctuation, whitespace and non-ASCII letters are all replaced
    one-for-one, so the result has the same length as *text*.
    """
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", text)

In [16]:
# 6. Lowercase all string characters
# ----------------------------------

def lowercase_text(text):
    """Return *text* converted to lowercase via its own ``lower()`` method."""
    lowered = text.lower()
    return lowered

In [17]:
# 7. Remove string inner spaces (any number of extra spaces)
# ----------------------------------------------------------

def strip_inner_spaces(text):
    """Collapse every run of whitespace in *text* to one space.

    Leading and trailing whitespace is dropped as well, because
    ``str.split()`` without arguments discards it.
    """
    words = text.split()
    return ' '.join(words)

In [19]:
# 8. Remove stop words (NLTK)
# ---------------------------

import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords') 

def remove_stop_words(text, language='english'):
    """Return *text* without the words found in NLTK's stop-word list.

    text:      string; it is split on whitespace and the surviving words are
               re-joined with single spaces.
    language:  name passed to ``stopwords.words`` (default 'english').

    Matching is an exact, case-sensitive comparison against the list entries.
    """
    # Load the list and build the lookup set once per call; the original
    # re-read the corpus and rebuilt the set for every single word.
    stop_words = frozenset(stopwords.words(language))
    return ' '.join(w for w in text.split() if w not in stop_words)

In [18]:
# 9. Tokenize words (NLTK)
# ------------------------

from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()

def tokenize_words(text):
    """Tokenize *text* with the module-level ``tok`` and return the tokens
    joined by single spaces, with surrounding whitespace stripped."""
    joined = " ".join(tok.tokenize(text))
    return joined.strip()

In [None]:
# 10. Stem words (Porter, Lancaster) (NLTK)
# -----------------------------------------

import nltk


# Porter
from nltk.stem.porter import PorterStemmer 
PS = PorterStemmer()

def stem_words_porter(text, PS):
    """Stem every whitespace-separated word of *text* with ``PS.stem``.

    text:  string to process.
    PS:    stemmer object exposing ``stem(word)`` (the PorterStemmer
           instance created above is the intended argument).

    Returns the stemmed words joined by single spaces.
    """
    stemmed = []
    for word in text.split():
        stemmed.append(PS.stem(word))
    return ' '.join(stemmed)


# Lancaster
from nltk.stem import LancasterStemmer
LS = LancasterStemmer()

def stem_words_lancaster(text, LS):
    """Stem every whitespace-separated word of *text* with ``LS.stem``.

    text:  string to process.
    LS:    stemmer object exposing ``stem(word)`` (the LancasterStemmer
           instance created above is the intended argument).

    Returns the stemmed words joined by single spaces.
    """
    words = text.split()
    return ' '.join(LS.stem(word) for word in words)

In [None]:
# 11. Lemmatizing words (NLTK WordNetLemmatizer)
# -------------------------------------

import nltk
from nltk.stem import WordNetLemmatizer
WNL = WordNetLemmatizer()

def lemmatize_words(text, WNL):
    """Lemmatize every whitespace-separated word of *text* as a verb.

    text:  string to process.
    WNL:   lemmatizer object exposing ``lemmatize(word, pos=...)`` (the
           WordNetLemmatizer instance created above is the intended argument).

    Each word is lemmatized with ``pos='v'``; the results are joined by
    single spaces.
    """
    lemmas = []
    for word in text.split():
        lemmas.append(WNL.lemmatize(word, pos='v'))
    return ' '.join(lemmas)

In [None]:
# 12. Vectorize text with Bag of Words (sklearn)
# ----------------------------------------------

from sklearn.feature_extraction.text import CountVectorizer 
  
def vectorize_texts(all_texts_list, max_features=1000):
    """Turn a list of texts into a bag-of-words count matrix.

    all_texts_list:  the documents handed to ``CountVectorizer.fit_transform``.
    max_features:    forwarded to ``CountVectorizer`` (tune freely).

    Prints a one-line summary of the matrix shape and returns the result of
    ``.toarray()`` (rows = texts, columns = features).
    """
    vectorizer = CountVectorizer(max_features=max_features)
    counts = vectorizer.fit_transform(raw_documents=all_texts_list)
    vec_matrix = counts.toarray()
    n_texts, n_features = vec_matrix.shape
    print('Vectorised {} texts into {} features.'.format(n_texts, n_features))
    return vec_matrix

In [309]:
# 13. Spacy tokens  [Words, lemmas, POS, TAG, DEP, heads, puncts, spaces, indexes, shapes]
# ----------------------------------------------------------------------------------------

import pandas as pd

import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

def get_text_tokens(text, tokens_to_get, string=False):
    '''
    Run *text* through the module-level spaCy pipeline ``nlp`` and collect
    the requested per-token attributes.

    text:           tweet, sentence, paragraph.
    tokens_to_get:  pick symbol letters 'hidlwsit.p' (plus ' '):
                      'w' words   (token.text)     'l' lemmas (token.lemma_)
                      'p' poses   (token.pos_)     't' tags   (token.tag_)
                      'd' deps    (token.dep_)     'h' heads  (token.head.text)
                      '.' puncts  (token.is_punct) ' ' spaces (token.is_space)
                      'i' idxes   (token.idx)      's' shapes (token.shape_)
    string:         False -> each attribute is a list with one item per token
                    True  -> each attribute is one space-joined string,
                             e.g. 'ADJ NOUN PRON VERB VERB ADP PROPN PUNCT'

    Returns a dict whose keys are, in this order: words, lemmas, poses, tags,
    deps, heads, puncts, spaces, idxes, shapes, raw_text, w_idx.
    Attributes that were not requested hold '' placeholders (one per token),
    'raw_text' is *text* itself and 'w_idx' is the running token number.

    An empty document no longer raises IndexError when ``string=True``; the
    original probed ``v[0]`` with ``is str``, a comparison against the type
    object that was always true, so every value was stringified anyway.
    '''

    # Load text to nlp
    doc = nlp(text)
    n_tokens = len(doc)

    # (output key, selector symbol, attribute getter).  The order of this
    # table is the order of the dict keys, i.e. the DataFrame column order.
    fields = (
        ('words',  'w', lambda t: t.text),
        ('lemmas', 'l', lambda t: t.lemma_),
        ('poses',  'p', lambda t: t.pos_),
        ('tags',   't', lambda t: t.tag_),
        ('deps',   'd', lambda t: t.dep_),
        ('heads',  'h', lambda t: t.head.text),
        ('puncts', '.', lambda t: t.is_punct),
        ('spaces', ' ', lambda t: t.is_space),
        ('idxes',  'i', lambda t: t.idx),
        ('shapes', 's', lambda t: t.shape_),
    )

    tokens = {}
    for key, symbol, getter in fields:
        if symbol in tokens_to_get:
            values = [getter(token) for token in doc]
        else:
            # Placeholder column so every attribute has one entry per token.
            values = [''] * n_tokens
        # str() handles the non-string attributes (bool flags, int offsets).
        tokens[key] = ' '.join(map(str, values)) if string else values

    # Add raw text
    tokens['raw_text'] = text

    # Word indexes: [0, 1, 2, ...] or the string '0 1 2 3 4 ...'
    word_numbers = range(n_tokens)
    tokens['w_idx'] = ' '.join(map(str, word_numbers)) if string else list(word_numbers)

    return tokens


# Sample sentence used by the demos in this cell.
text = "How to store text features in pandas or CSV, Mr. Mighty Michaltronix?"

# One token attribute as a plain list: the dependency label of every token.
deps = [w.dep_ for w in nlp(text)]
print(deps)

# Quick DF: apply the pipeline row by row to a DataFrame text column.
# df['new_col'] = df['text'].apply(lambda x: nlp(x))
# df['new_col'] = df['col'].apply(lambda x: list(nlp(x).ents))
# LMG COMMENT: passing the whole column at once, e.g.
#   df['new_col'] = [token for token in parser(df['col'])]
# fails with "expecting string got series"; calling nlp on each row with
#   df['new_col'] = df['col'].apply(lambda x: nlp(x))
# fixes it.

# Dict of strings / DF row: every attribute is one space-joined string,
# so the whole text fits into a single DataFrame row.
tokens = get_text_tokens(text, 'wlpdhtsi .', string=True) # 'wlpdhtsi .'
display( pd.DataFrame(tokens, index=[0]) )

# Dict of lists / DF matrix: one row per token, indexed by the token text;
# .iloc[:,1:] drops the first column ('words'), which duplicates the index.
tokens = get_text_tokens(text, 'wlpdhtsi .', string=False) # 'wlpdhtsi .'
display (  pd.DataFrame(tokens, index=tokens['words']).iloc[:,1:] )





['advmod', 'aux', 'advcl', 'compound', 'dobj', 'prep', 'pobj', 'cc', 'conj', 'punct', 'compound', 'compound', 'ROOT', 'punct']


Unnamed: 0,words,lemmas,poses,tags,deps,heads,puncts,spaces,idxes,shapes,raw_text,w_idx
0,"How to store text features in pandas or CSV , ...","how to store text feature in panda or csv , mr...",ADV PART VERB NOUN NOUN ADP NOUN CCONJ PROPN P...,"WRB TO VB NN NNS IN NNS CC NNP , NNP NNP NNP .",advmod aux advcl compound dobj prep pobj cc co...,store store Michaltronix features store featur...,False False False False False False False Fals...,False False False False False False False Fals...,0 4 7 13 18 27 30 37 40 43 45 49 56 68,"Xxx xx xxxx xxxx xxxx xx xxxx xx XXX , Xx. Xxx...","How to store text features in pandas or CSV, M...",0 1 2 3 4 5 6 7 8 9 10 11 12 13


Unnamed: 0,lemmas,poses,tags,deps,heads,puncts,spaces,idxes,shapes,raw_text,w_idx
How,how,ADV,WRB,advmod,store,False,False,0,Xxx,"How to store text features in pandas or CSV, M...",0
to,to,PART,TO,aux,store,False,False,4,xx,"How to store text features in pandas or CSV, M...",1
store,store,VERB,VB,advcl,Michaltronix,False,False,7,xxxx,"How to store text features in pandas or CSV, M...",2
text,text,NOUN,NN,compound,features,False,False,13,xxxx,"How to store text features in pandas or CSV, M...",3
features,feature,NOUN,NNS,dobj,store,False,False,18,xxxx,"How to store text features in pandas or CSV, M...",4
in,in,ADP,IN,prep,features,False,False,27,xx,"How to store text features in pandas or CSV, M...",5
pandas,panda,NOUN,NNS,pobj,in,False,False,30,xxxx,"How to store text features in pandas or CSV, M...",6
or,or,CCONJ,CC,cc,pandas,False,False,37,xx,"How to store text features in pandas or CSV, M...",7
CSV,csv,PROPN,NNP,conj,pandas,False,False,40,XXX,"How to store text features in pandas or CSV, M...",8
",",",",PUNCT,",",punct,Michaltronix,True,False,43,",","How to store text features in pandas or CSV, M...",9


In [295]:
# 14. Dependency parsing & visualisation
# --------------------------------------

import spacy
from spacy import displacy


# Sample sentence for the dependency demo; rebinds the notebook-level name `text`.
text = 'Content understanding can never be complete without some human intervention.'

def get_dependencies(text):
    """Return the dependency label (``dep_``) of every token that the
    module-level ``nlp`` pipeline produces for *text*, in token order."""
    labels = []
    for token in nlp(text):
        labels.append(token.dep_)
    return labels
# Print the dependency label of every token in the sample sentence.
print('Dependencies as a list:', get_dependencies(text))


# Display(cy): render the dependency parse of the sentence (style='dep') with
# jupyter=True. NOTE(review): 'distance' presumably sets the spacing between
# words in the drawing - confirm against the displaCy options documentation.
displacy.render(nlp(text), style='dep', jupyter=True, options={'distance': 100})

Dependencies: ['compound', 'nsubj', 'aux', 'neg', 'ROOT', 'acomp', 'prep', 'det', 'amod', 'pobj', 'punct']


In [8]:
# 15. NER Entities on display
# ---------------------------

import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
 
# Run the pipeline on a sample sentence and render the result with displaCy's
# entity style (style='ent', jupyter=True).
doc = nlp('It started 24th of June, he said, 30 minutes before midnight, quite surprising just as two days ago.')
displacy.render(doc, style='ent', jupyter=True)
