# Text preprocessing toolkit

#### Scripts:
1. Twitter data preprocessing: hashtags, mentions gathering and removal
2. Get and remove URL from text
3. HTML decoding
4. UTF-8 BOM (Byte Order Mark)
5. Remove all special characters from string
6. Lowercase all string characters
7. Remove string inner spaces (any number of extra spaces)
8. Remove stop words (NLTK)
9. Tokenize words (NLTK)
10. Stem words (Porter, Lancaster) (NLTK)
11. Lemmatizing words (NLTK WordNetLemmatizer)
12. Vectorize text with Bag of Words (sklearn)


(To be re-organised, enriched with examples & developed further.)

In [1]:
# 1. Twitter data preprocessing: hashtags and mentions 
# --------------------------
# 1. get hashtags list
# 2. get mentions list
# 3. remove hashtags from text
# 4. remove mentions from text
# 5. remove hashtags and mentions from text
# 6. get hashtags and mentions list and remove them from text (Depending on previous funcs)
# 7. self-contained alternative to point 6 (single pass, no dependency on the other funcs)

def get_mentions(text):
    """Return the '@'-prefixed words of *text* joined with ', '.

    Words are obtained by splitting on single space characters only; an
    empty string is returned when no word starts with '@'.
    """
    mentions = []
    for token in text.split(' '):
        if token.startswith('@'):
            mentions.append(token)
    return ', '.join(mentions)
def get_hashtags(text):
    """Return the '#'-prefixed words of *text* joined with ', '.

    Words are obtained by splitting on single space characters only; an
    empty string is returned when no word starts with '#'.
    """
    tags = filter(lambda token: token.startswith('#'), text.split(' '))
    return ', '.join(tags)
def remove_mentions(text):
    """Return *text* without the words that start with '@'.

    The text is split on single spaces and the surviving words are joined
    back with single spaces.
    """
    kept = []
    for token in text.split(' '):
        if token.startswith('@'):
            continue  # drop the mention
        kept.append(token)
    return ' '.join(kept)
def remove_hashtags(text):
    """Return *text* without the words that start with '#'.

    The text is split on single spaces and the surviving words are joined
    back with single spaces.
    """
    kept = filter(lambda token: not token.startswith('#'), text.split(' '))
    return ' '.join(kept)
def remove_hashtags_and_mentions(text):
    """Return *text* without the words that start with '#' or '@'.

    The text is split on single spaces and the surviving words are joined
    back with single spaces.
    """
    prefixes = ('#', '@')
    kept = [token for token in text.split(' ') if not token.startswith(prefixes)]
    return ' '.join(kept)

def get_and_remove_hashtags_and_mentions_from_text_DEP(text):
    """Return (hashtags, mentions, cleaned text) for *text*.

    Delegates to get_hashtags, get_mentions and
    remove_hashtags_and_mentions, in that order.
    """
    hashtags = get_hashtags(text)
    mentions = get_mentions(text)
    cleaned = remove_hashtags_and_mentions(text)
    return hashtags, mentions, cleaned

def get_and_remove_hashtags_and_mentions_from_text_AUTO(text):
    """Return (hashtags, mentions, cleaned text) for *text* in a single pass.

    The text is split on single spaces. Words starting with '#' go to the
    hashtags, words starting with '@' to the mentions, everything else to
    the cleaned text. Hashtags and mentions are joined with ', ', the
    cleaned words with ' '.
    """
    by_prefix = {'#': [], '@': []}
    remainder = []
    for token in text.split(' '):
        # Dispatch on the first character; anything else (including an
        # empty token) belongs to the cleaned text.
        by_prefix.get(token[:1], remainder).append(token)
    return ', '.join(by_prefix['#']), ', '.join(by_prefix['@']), ' '.join(remainder)

In [2]:
# 2. Get and remove URL from text
# ------------------------------
# a. get url as a string
# b. get text without url
# c. get url and text without url

import re

def get_url(text):
    """Return the first http(s) URL found in *text*, or '' if there is none.

    A URL is 'http://' or 'https://' followed by a run of non-whitespace
    characters. Input that is not a string (or bytes-like) also yields '',
    keeping the function best-effort for missing values.
    """
    try:
        match = re.search(r"(?P<url>https?://\S+)", text)
    except TypeError:
        # Non-string input (e.g. None): nothing to extract.
        return ''
    return match.group("url") if match else ''
def remove_url(text):
    """Return *text* with every http(s) URL removed.

    Uses the same URL definition as get_url ('http://' or 'https://'
    followed by non-whitespace characters), so URLs containing characters
    such as '-', '_', '?', '=' or '%' are removed in full instead of leaving
    a trailing fragment behind. Surrounding whitespace is left untouched.
    """
    return re.sub(r'https?://\S+', '', text)

def get_and_remove_url_from_text(text):
    """Return a pair: (result of get_url(text), result of remove_url(text))."""
    url = get_url(text)
    without_url = remove_url(text)
    return url, without_url

In [None]:
# 3. HTML decoding
# ----------------

from bs4 import BeautifulSoup

# HTML decoding: two variants differing only in the parser (noted to give the same result)
def html_strip_lxml(text):
    """Parse *text* with BeautifulSoup's 'lxml' parser and return its text content."""
    soup = BeautifulSoup(text, 'lxml')
    return soup.get_text()
def html_strip_praser(text):
    """Parse *text* with BeautifulSoup's 'html.parser' and return its text content."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [None]:
# 4. UTF-8 BOM (Byte Order Mark)
# ------------------------------

def get_BOM_in_order(text):
    """Strip a leading UTF-8 BOM and replace U+FFFD characters with '?'.

    * bytes / bytearray: decoded with the 'utf-8-sig' codec, which drops a
      leading BOM. If the data is not valid UTF-8 it is returned unchanged.
    * str: a leading U+FEFF character is removed (str has no ``decode`` on
      Python 3, so this case needs its own handling).
    * anything else is returned unchanged.

    In the decoded/str cases every U+FFFD replacement character in the
    result is replaced with '?'.
    """
    if isinstance(text, (bytes, bytearray)):
        try:
            decoded = text.decode("utf-8-sig")
        except UnicodeDecodeError:
            # Best effort: leave undecodable input exactly as it came in.
            return text
    elif isinstance(text, str):
        decoded = text[1:] if text.startswith("\ufeff") else text
    else:
        return text
    return decoded.replace("\ufffd", "?")

In [15]:
# 5. Remove all special characters from string
# --------------------------------------------

import re

def remove_special_characters(text):
    """Replace every character that is not an ASCII letter with a space.

    Digits, punctuation, whitespace and non-ASCII characters are each
    replaced one-for-one, so the length of the string is preserved.
    """
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", text)

In [16]:
# 6. Lowercase all string characters
# ----------------------------------

def lowercase_text(text):
    """Return the result of ``text.lower()``."""
    lowered = text.lower()
    return lowered

In [17]:
# 7. Remove string inner spaces (any number of extra spaces)
# ----------------------------------------------------------

def strip_inner_spaces(text):
    """Collapse every run of whitespace in *text* to a single space.

    Leading and trailing whitespace is removed as well.
    """
    # split() with no argument already discards all whitespace around and
    # between words, so the pieces can be joined directly.
    words = text.split()
    return ' '.join(words)

In [19]:
# 8. Remove stop words (NLTK)
# ---------------------------

import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords') 

def remove_stop_words(text, language='english'):
    """Return *text* without the words found in NLTK's stop-word list.

    The text is split on whitespace and the remaining words are joined with
    single spaces. The comparison is exact, so *text* is expected to be in
    the same case as the stop-word list.

    language -- name passed to ``stopwords.words`` (default 'english').
    """
    # Build the lookup set once, outside the per-word loop.
    stop_words = set(stopwords.words(language))
    return ' '.join([w for w in text.split() if w not in stop_words])

In [18]:
# 9. Tokenize words (NLTK)
# ------------------------

from nltk.tokenize import WordPunctTokenizer
# Module-level tokenizer instance used by tokenize_words.
tok = WordPunctTokenizer()

def tokenize_words(text):
    """Tokenize *text* with the module-level ``tok`` and return the tokens
    joined by single spaces, with surrounding whitespace stripped."""
    tokens = tok.tokenize(text)
    joined = " ".join(tokens)
    return joined.strip()

In [None]:
# 10. Stem words (Porter, Lancaster) (NLTK)
# -----------------------------------------

import nltk


# Porter
from nltk.stem.porter import PorterStemmer 
# Module-level Porter stemmer instance; stem_words_porter takes a stemmer as its PS argument.
PS = PorterStemmer()

def stem_words_porter(text, PS):
    """Stem every whitespace-separated word of *text* with ``PS.stem``.

    text -- string to process.
    PS   -- object exposing ``stem(word)`` (e.g. a PorterStemmer instance).

    Returns the stemmed words joined by single spaces.
    """
    stems = map(PS.stem, text.split())
    return ' '.join(stems)


# Lancaster
from nltk.stem import LancasterStemmer
# Module-level Lancaster stemmer instance; stem_words_lancaster takes a stemmer as its LS argument.
LS = LancasterStemmer()

def stem_words_lancaster(text, LS):
    """Stem every whitespace-separated word of *text* with ``LS.stem``.

    text -- string to process.
    LS   -- object exposing ``stem(word)`` (e.g. a LancasterStemmer instance).

    Returns the stemmed words joined by single spaces.
    """
    stemmed = []
    for token in text.split():
        stemmed.append(LS.stem(token))
    return ' '.join(stemmed)

In [None]:
# 11. Lemmatizing words (NLTK WordNetLemmatizer)
# -------------------------------------

import nltk
from nltk.stem import WordNetLemmatizer
# Module-level WordNet lemmatizer instance; lemmatize_words takes a lemmatizer as its WNL argument.
WNL = WordNetLemmatizer()

def lemmatize_words(text, WNL):
    """Lemmatize every whitespace-separated word of *text* as a verb.

    text -- string to process.
    WNL  -- object exposing ``lemmatize(word, pos=...)`` (e.g. a
            WordNetLemmatizer instance); it is always called with pos='v'.

    Returns the lemmatized words joined by single spaces.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(WNL.lemmatize(token, pos='v'))
    return ' '.join(lemmas)

In [None]:
# 12. Vectorize text with Bag of Words (sklearn)
# ----------------------------------------------

from sklearn.feature_extraction.text import CountVectorizer 
  
def vectorize_texts(all_texts_list, max_features=1000):
    """Fit a CountVectorizer on the given texts and return the count matrix.

    all_texts_list -- the documents, passed to ``fit_transform`` as
                      ``raw_documents``.
    max_features   -- forwarded to ``CountVectorizer(max_features=...)``.

    The fitted result is converted with ``toarray()`` before being returned,
    and a one-line summary of its shape is printed.
    """
    vectorizer = CountVectorizer(max_features=max_features)
    counts = vectorizer.fit_transform(raw_documents=all_texts_list)
    vec_matrix = counts.toarray()
    n_texts, n_features = vec_matrix.shape[0], vec_matrix.shape[1]
    print('Vectorised {} texts into {} features.'.format(n_texts, n_features))
    return vec_matrix