===========================================

Title: 12.2 Exercises

Author: Chad Wood

Date: 4 Mar 2022

Modified By: Chad Wood

Description: This program demonstrates building a spacy matcher to identify 'Social Cause' tweets among a large dataset of tweets.

===========================================

In [6]:
import nltk
import spacy

# Loads spacy and customized stop_words 
nlp = spacy.load('en_core_web_sm')
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('no')
stop_words.remove('but')
stop_words.remove('not')
stop_words.remove('against')

In [203]:
import pandas as pd
import datetime

# Loads data and formats date column
data = pd.read_csv('data/tweets.csv').drop(['country'], axis=1)
data['date_time'] = pd.to_datetime(data.date_time)

In [204]:
import re

def normalize(corpus):   
    # General Cleansing
    corpus = corpus.apply(lambda x: re.sub(r'\S*https?:\S*', '', x)) # Removes links
    corpus = corpus.apply(lambda x: re.sub("@[A-Za-z0-9_]+", '', x)) # Removes mentions
    corpus = corpus.apply(lambda x: re.sub('#([a-zA-Z0-9_]{1,50})', '', x)) # Removes hashtags
    corpus = corpus.apply(lambda x: re.sub(r'[^a-zA-z\s]', '', str(x))) # Removes special characters
    corpus = corpus.apply(lambda x: re.sub(' +', ' ', x)) # Removes double+ spaces
    corpus = corpus.apply(lambda x: x.strip()) # Removes extra whitspaces

    # Runs text through pipeline
    clean_list = []
    tok_list = []
    for doc in nlp.pipe(corpus):
        tokens = doc
        clean_text = (' '.join(word.lemma_ # Returns roots...
                               if word.lemma_ != '-PRON-' # ...Unless POS is pronoun...
                               else word.text for word in doc # ...Then returns text for pronouns
                               if word.lemma_ not in stop_words)) # Filters stopwords
        
        tok_list.append(tokens) # Returns tokens
        clean_list.append(clean_text) # Returns clean text
        
    # Clean text to lowercase as Series
    clean_Series = pd.Series(clean_list).apply(lambda x: str(x).lower()) 
    
    return clean_Series, pd.Series(tok_list)

In [242]:
# Adds columns for normalized content and tokens
data['clean_content'], data['tokens'] = normalize(data.content)

# Removes empty rows
data = data.replace('', float('NaN')).dropna()

#### Matcher

In [251]:
def match_only(matcher, df, col_name):  
    # Filters dataframe for rows where matches are found
    return df[df[col_name].map(lambda x: len(matcher(x))) > 0]

In [295]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

pattern = [
    {'LOWER': {'IN': ['justice', 'power', 'poverty', 'prayers', 'the cause', 'movement', 'freedom', 'change']}},
    {'LEMMA': {'IN': ['people', 'answer', 'today', 'work', 'deny', 'support', 'enough', 'system']}},
    {'POS': {'IN': ['NOUN', 'ADJ', 'ADV']}}
]

matcher.add('general_cause_words', [pattern])

In [297]:
match_only(matcher, data, 'tokens')

Unnamed: 0,author,content,date_time,clean_content,tokens
15911,ladygaga,🙏 for no violence during these protests. Be th...,2016-12-11 22:57:00,caitlyn thanku part life amp use platform chan...,"(Caitlyn, thanku, for, being, a, part, of, all..."


In [276]:
from spacy.matcher import PhraseMatcher

def on_match(matcher, doc, id, matches):
      print('Matched:', matches)

matcher = PhraseMatcher(nlp.vocab)

patterns = [nlp('power to the people'), 
            nlp('roll back poverty'), 
            nlp('in your prayers')]

matcher.add('general_case_phrases', patterns, on_match=on_match)
matches = match_only(matcher, data, 'tokens')

Matched: [(11239059035322959770, 4, 7)]
Matched: [(11239059035322959770, 4, 7)]
Matched: [(11239059035322959770, 6, 9)]
Matched: [(11239059035322959770, 8, 11)]
Matched: [(11239059035322959770, 10, 13)]


In [277]:
matches

Unnamed: 0,author,content,date_time,clean_content,tokens
8495,BarackObama,Tune in at 2:30 p.m. ET to watch the President...,2015-01-07 17:56:00,effort roll back poverty roadblock opportunity...,"(With, effort, we, can, roll, back, poverty, a..."
9889,rihanna,#Vogue thank you! https://t.co/Aa6uKKtaqc,2016-05-24 01:39:00,please keep joan rivers prayer,"(Please, keep, Joan, Rivers, in, your, prayers)"
10275,rihanna,friend$hip. in. 2014. http://t.co/GNW8qtuVCF,2014-11-24 22:02:00,please keep people venezuela prayer devastatin...,"(Please, keep, the, people, of, Venezuela, in,..."
10376,rihanna,"Burning candles inside of the studio, rappin n...",2014-08-10 06:58:00,navy please keep people philippines prayer i t...,"(Navy, please, keep, the, people, of, The, Phi..."
44865,ArianaGrande,love u back https://t.co/PCSKZVgkjz,2015-07-13 21:04:00,like ask accompany i prayer important day life...,"(I, d, like, to, ask, you, all, to, accompany,..."
