# TAHLR Week 7: Preparing Textual Data

Code notebook for TAHLR course at ISAW (Fall 2023) based on Albrecht et al. 2022 (Blueprints) Ch. 4: Preparing Textual Data for Statistics and Machine Learning

In [None]:
# # Installs
# ! pip install -U textacy

In [None]:
# # # Get data from remote location

# !mkdir -p ../data/blueprints
# !curl -LJO https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/data/reddit-selfposts/rspct_autos.tsv.gz --output-dir ../data/blueprints

# !curl -LJO https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/data/reddit-selfposts/subreddit_info.csv.gz --output-dir ../data/blueprints


In [None]:
# Imports / setup

import nltk

import pandas as pd
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

import spacy
# spacy.cli.download('en_core_web_sm') # Download if necessary

In [None]:
# Load data

# Posts: tab-separated file, read with pandas' default compression inference
posts_file = "../data/blueprints/rspct_autos.tsv.gz"
posts_df = pd.read_csv(
    posts_file,
    sep='\t',
)

# Subreddit metadata: comma-separated, re-indexed by the 'subreddit' column
subred_file = "../data/blueprints/subreddit_info.csv.gz"
subred_df = pd.read_csv(subred_file)
subred_df = subred_df.set_index(['subreddit'])

# Attach the metadata to every post via its 'subreddit' value,
# then show two randomly chosen rows as the cell output
df = posts_df.join(other=subred_df, on='subreddit')
df.sample(2)

## Blueprint: Standardizing attribute names

In [None]:
print(df.columns)

In [None]:
# Maps each source column to its standardized name.
# A value of None marks a column that is dropped.
column_mapping = {
    'id': 'id',
    'subreddit': 'subreddit',
    'title': 'title',
    'selftext': 'text',
    'category_1': 'category',
    'category_2': 'subcategory',
    'category_3': None, # no data
    'in_data': None, # not needed
    'reason_for_exclusion': None # not needed
}

# define remaining columns (identity check against None, per PEP 8)
columns = [c for c, new_name in column_mapping.items() if new_name is not None]

# select and rename those columns; only the kept columns are passed to rename
# so that no None targets end up in the rename map
df = df[columns].rename(columns={c: column_mapping[c] for c in columns})

In [None]:
df = df[df['category'] == 'autos']

In [None]:
df.sample(1).T

In [None]:
df.to_pickle("../data/blueprints/reddit_dataframe.pkl")

## Blueprint: Identify noise with regular expressions

In [None]:
text = """
After viewing the [PINKIEPOOL Trailer](https://www.youtu.be/watch?v=ieHRoHUg)
it got me thinking about the best match ups.
<lb>Here's my take:<lb><lb>[](/sp)[](/ppseesyou) Deadpool<lb>[](/sp)[](/ajsly)
Captain America<lb>"""

In [None]:
import re

# Characters that rarely occur in plain prose: & # < > { } [ ] \
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')

def impurity(text, min_len=10):
    """Return the share of suspicious characters in a text.

    Args:
        text: The string to inspect. ``None``, NaN and any other
            non-string value are treated as "no text".
        min_len: Texts shorter than this many characters are not scored.

    Returns:
        The number of characters matched by ``RE_SUSPICIOUS`` divided by
        the length of ``text``; ``0.0`` for missing, empty or too-short
        input.
    """
    # Missing cells from pandas arrive as None or float NaN; len() would fail on NaN.
    if not isinstance(text, str):
        return 0.0
    # The emptiness check prevents a division by zero when min_len <= 0.
    if not text or len(text) < min_len:
        return 0.0
    return len(RE_SUSPICIOUS.findall(text)) / len(text)

print(impurity(text))

In [None]:
# add new column to data frame
df['impurity'] = df['text'].apply(impurity, min_len=10)

# get the top 3 records
df[['text', 'impurity']].sort_values(by='impurity', ascending=False).head(3)

In [None]:
# From last week

from collections import Counter

def count_words(df, column='tokens', preprocess=None, min_freq=2):
    """Count how often each token occurs in one column of a data frame.

    Args:
        df: Data frame holding the documents.
        column: Name of the column to read. Each cell is passed to
            ``Counter.update`` (after ``preprocess``, if given), so it must
            be an iterable of tokens.
        preprocess: Optional callable applied to every cell value before
            counting; it must return an iterable of tokens.
        min_freq: Tokens occurring fewer times than this are dropped.

    Returns:
        A data frame indexed by token (index name ``'token'``) with a single
        ``'freq'`` column, sorted by descending frequency.
    """
    # create counter
    counter = Counter()

    # process tokens and update counter
    def update(doc):
        tokens = doc if preprocess is None else preprocess(doc)
        counter.update(tokens)

    # run through all data; progress_map only exists once tqdm.pandas() has
    # been called, so fall back to the plain map when it is not registered
    series = df[column]
    run_over_series = getattr(series, 'progress_map', series.map)
    run_over_series(update)

    # transform counter into data frame
    freq_df = pd.DataFrame.from_dict(counter, orient='index', columns=['freq'])
    freq_df = freq_df[freq_df['freq'] >= min_freq]
    freq_df.index.name = 'token'
    return freq_df.sort_values('freq', ascending=False)

In [None]:
# from blueprints.exploration import count_words
count_words(df, column='text', preprocess=lambda t: re.findall(r'<[\w/]*>', t))

## Blueprint: Removing noise with regular expressions

In [None]:
import html

def clean(text):
    """Strip markup noise from a text and collapse its whitespace.

    The steps run in a fixed order: HTML entities are unescaped, ``<...>``
    tags are removed, markdown links are reduced to their link text, other
    bracketed snippets are removed, standalone runs of special characters
    are removed, and whitespace is collapsed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string without leading or trailing whitespace.
    """
    # convert html escapes like &amp; to characters.
    text = html.unescape(text)
    # tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs like [Some text](https://....)
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials, matches &# but not #cool
    # The trailing boundary is a lookahead so the whitespace is not consumed:
    # otherwise the second of two adjacent standalone runs ("& #") has no
    # leading whitespace left to match against and survives.
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]+(?=\s|$)', ' ', text)
    # standalone sequences of hyphens like --- or == (same lookahead reasoning)
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?=\s|$)', ' ', text)
    # sequences of white spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
# Test preprocessing

clean_text = clean(text)
print(clean_text)
print("Impurity:", impurity(clean_text))

In [None]:
df['clean_text'] = df['text'].map(clean)
df['impurity']   = df['clean_text'].apply(impurity, min_len=20)

## Blueprint: Character normalization with textacy

In [None]:
text = "The café “Saint-Raphaël” is loca-\nted on Côte dʼAzur."

In [None]:
import textacy.preprocessing.normalize as tnorm
import textacy.preprocessing.remove as tremove

# textacy.preprocessing.normalize.hyphenated_words

def normalize(text):
    """Run ``text`` through a fixed sequence of textacy preprocessing steps.

    The steps are applied in this order: ``tnorm.hyphenated_words``,
    ``tnorm.quotation_marks``, ``tnorm.unicode``, ``tremove.accents``.
    Each step receives the output of the previous one.
    """
    steps = (
        tnorm.hyphenated_words,
        tnorm.quotation_marks,
        tnorm.unicode,
        tremove.accents,
    )
    for step in steps:
        text = step(text)
    return text

In [None]:
print(normalize(text))

## Blueprint: Pattern-Based Data Masking with textacy

In [None]:
from textacy.preprocessing.resources import RE_URL

count_words(df, column='clean_text', preprocess=RE_URL.findall).head(3)

In [None]:
from textacy.preprocessing.replace import urls as replace_urls

text = "Check out https://spacy.io/usage/spacy-101"

# using default substitution _URL_
print(replace_urls(text))

In [None]:
df['clean_text'] = df['clean_text'].map(replace_urls)
df['clean_text'] = df['clean_text'].map(normalize)

In [None]:
# Keep the original text as 'raw_text' and promote the cleaned text to 'text'.
# Guarded on 'clean_text' so that re-running this cell does not rename the
# already-clean 'text' column to a second 'raw_text'.
if 'clean_text' in df.columns:
    df = df.rename(columns={'text': 'raw_text', 'clean_text': 'text'})

# errors='ignore' keeps a re-run from failing once 'impurity' is gone
df = df.drop(columns=['impurity'], errors='ignore')

# show a preview instead of dumping the whole frame
df.head()

In [None]:
## Blueprint: Tokenization with regular expressions

In [None]:
text = """
2019-08-10 23:32: @pete/@louis - I don't have a well-designed
solution for today's problem. The code of module AC68 should be -1.
Have to think a bit... #goodnight ;-) 😩😬"""

tokens = re.findall(r'\w\w+', text)
print(*tokens, sep='|')

In [None]:
# Verbose-mode pattern with three alternatives inside a single capturing group
RE_TOKEN = re.compile(
    r"""
               ( [#]?[@\w'’\.\-\:]*\w     # words, hashtags and email addresses
               | [:;<]\-?[\)\(3]          # coarse pattern for basic text emojis
               | [\U0001F100-\U0001FFFF]  # coarse code range for unicode emojis
               )
               """,
    flags=re.VERBOSE,
)

def tokenize(text):
    """Return all non-overlapping ``RE_TOKEN`` matches in ``text``, in order."""
    # The pattern has exactly one group spanning the whole match, so group(1)
    # is the complete token.
    return [match.group(1) for match in RE_TOKEN.finditer(text)]

tokens = tokenize(text)
print(*tokens, sep='|')

In [None]:
import nltk

tokens = nltk.tokenize.word_tokenize(text)
print(*tokens, sep='|')

## Working with spaCy

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
nlp.pipeline

In [None]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

In [None]:
for token in doc:
    print(token, end="|")

In [None]:
def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens.

    Args:
        doc: Iterable of tokens. Each token must expose ``is_punct`` and the
            attributes listed in the returned columns.
        include_punct: If False (default), tokens whose ``is_punct`` is true
            are left out.

    Returns:
        A data frame with one row per kept token, indexed by the token's
        position in ``doc`` (unnamed index), with the columns ``text``,
        ``lemma_``, ``is_stop``, ``is_alpha``, ``pos_``, ``dep_``,
        ``ent_type_`` and ``ent_iob_``. When no token is kept, the frame is
        empty but still has these columns.
    """
    attributes = ['text', 'lemma_', 'is_stop', 'is_alpha',
                  'pos_', 'dep_', 'ent_type_', 'ent_iob_']

    positions = []
    rows = []
    for i, t in enumerate(doc):
        if include_punct or not t.is_punct:
            positions.append(i)
            rows.append([getattr(t, name) for name in attributes])

    # Passing index and columns explicitly keeps this working when no rows
    # were collected; building from dicts and calling set_index('token')
    # would raise a KeyError on an empty frame.
    return pd.DataFrame(rows, index=positions, columns=attributes)

In [None]:
display_nlp(doc)

## Blueprint: Working with stop words

In [None]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)

non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)

In [None]:
nlp.vocab['down'].is_stop = False
nlp.vocab['Dear'].is_stop = True
nlp.vocab['Regards'].is_stop = True

In [None]:
non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)

## Blueprint: Extracting Lemmas Based on Part of Speech

In [None]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

print(*[t.lemma_ for t in doc], sep='|')

In [None]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]
print(nouns)

In [None]:
import textacy

tokens = textacy.extract.words(doc,
            filter_stops = True,           # default True, no stopwords
            filter_punct = True,           # default True, no punctuation
            filter_nums = True,            # default False, no numbers
            include_pos = ['ADJ', 'NOUN'], # default None = include all
            exclude_pos = None,            # default None = exclude none
            min_freq = 1)                  # minimum frequency of words

print(*[t for t in tokens], sep='|')

In [None]:
def extract_lemmas(doc, **kwargs):
    """Return the lemmas of the tokens selected by ``textacy.extract.words``.

    All keyword arguments are forwarded unchanged to
    ``textacy.extract.words``.
    """
    lemmas = []
    for token in textacy.extract.words(doc, **kwargs):
        lemmas.append(token.lemma_)
    return lemmas

lemmas = extract_lemmas(doc, include_pos=['ADJ', 'NOUN'])
print(*lemmas, sep='|')

## Blueprint: Extracting noun phrases

In [None]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

patterns = ["POS:ADJ POS:NOUN:+"]
spans = textacy.extract.token_matches(doc, patterns=patterns)
print(*[s.lemma_ for s in spans], sep='|')

In [None]:
print(*doc.noun_chunks, sep='|')

In [None]:
def extract_noun_phrases(doc, preceding_pos=['NOUN'], sep='_'):
    """Return matched noun phrases as strings of lemmas joined by ``sep``.

    One pattern of the form ``"POS:<pos> POS:NOUN:+"`` is built for every
    entry of ``preceding_pos`` and handed to
    ``textacy.extract.token_matches``. Each matched span becomes one string
    made of its tokens' lemmas.
    """
    patterns = [f"POS:{pos} POS:NOUN:+" for pos in preceding_pos]

    phrases = []
    for span in textacy.extract.token_matches(doc, patterns=patterns):
        span_lemmas = [token.lemma_ for token in span]
        phrases.append(sep.join(span_lemmas))
    return phrases

print(*extract_noun_phrases(doc, ['ADJ', 'NOUN']), sep='|')

In [None]:
## Blueprint: Extracting named entities

text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco."
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

In [None]:
from spacy import displacy

displacy.render(doc, style='ent')

In [None]:
def extract_entities(doc, include_types=None, sep='_'):
    """Return entities as ``"<lemmas joined by sep>/<label>"`` strings.

    Entities come from ``textacy.extract.entities``, called with
    ``include_types`` as given, ``exclude_types=None``,
    ``drop_determiners=True`` and ``min_freq=1``.
    """
    entity_options = dict(
        include_types=include_types,
        exclude_types=None,
        drop_determiners=True,
        min_freq=1,
    )

    labelled = []
    for ent in textacy.extract.entities(doc, **entity_options):
        joined_lemmas = sep.join([token.lemma_ for token in ent])
        labelled.append(joined_lemmas + '/' + ent.label_)
    return labelled

In [None]:
print(extract_entities(doc, ['PERSON', 'GPE']))

## Feature extraction

## Blueprint: Creating one function to get it all

In [None]:
def extract_nlp(doc):
    """Collect all linguistic features of ``doc`` in one dictionary.

    The keys, in insertion order, are ``lemmas``, ``adjs_verbs``, ``nouns``,
    ``noun_phrases``, ``adj_noun_phrases`` and ``entities``. Each value is
    the list returned by the corresponding extraction helper.
    """
    features = {}

    # lemmas of everything except the listed parts of speech, stop words kept
    features['lemmas'] = extract_lemmas(
        doc,
        exclude_pos=['PART', 'PUNCT', 'DET', 'PRON', 'SYM', 'SPACE'],
        filter_stops=False,
    )
    features['adjs_verbs'] = extract_lemmas(doc, include_pos=['ADJ', 'VERB'])
    features['nouns'] = extract_lemmas(doc, include_pos=['NOUN', 'PROPN'])

    # phrases: noun + noun(s), and adjective + noun(s)
    features['noun_phrases'] = extract_noun_phrases(doc, ['NOUN'])
    features['adj_noun_phrases'] = extract_noun_phrases(doc, ['ADJ'])

    features['entities'] = extract_entities(doc, ['PERSON', 'ORG', 'GPE', 'LOC'])
    return features

In [None]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for col, values in extract_nlp(doc).items():
    print(f"{col}: {values}")

In [None]:
nlp_columns = list(extract_nlp(nlp.make_doc('')).keys())
print(nlp_columns)

## Blueprint: Using spaCy on a large dataset

In [None]:
df['text'] = df['title'] + ': ' + df['text']

In [None]:
for col in nlp_columns:
    df[col] = None

In [None]:
df

In [None]:
batch_size = 50

# Collect the extracted features per column and assign whole columns at the
# end. Writing cell by cell through `df[col].iloc[k] = ...` is chained
# assignment: it raises SettingWithCopyWarning and, with pandas
# Copy-on-Write enabled, does not modify `df` at all.
nlp_results = {}

for i in tqdm(range(0, len(df), batch_size)):
    # positional slice of the next batch of texts
    docs = nlp.pipe(df['text'].iloc[i:i+batch_size])

    for doc in docs:
        for col, values in extract_nlp(doc).items():
            nlp_results.setdefault(col, []).append(values)

for col, values in nlp_results.items():
    # dtype=object keeps every cell as the Python list returned by extract_nlp;
    # index=df.index aligns the results with the rows they were computed from
    df[col] = pd.Series(values, index=df.index, dtype=object)

In [None]:
count_words(df, 'noun_phrases').head(10).plot(kind='barh').invert_yaxis()