### Notebook for building the text pre-processing pipeline

Rough outline of the pipeline:
1. remove punctuation, numbers, capitalization from all text. Return full string rather than individual tokens
2. replace alternatives/synonyms with a single token for that word
3. isolate the key word of interest in case it appears in any hashtags
4. label that token based on party affiliation
5. remove stopwords, tokenize, lematize text

In [1]:
import pandas as pd
import pickle
# import demoji
from datetime import datetime
import spacy
import string
import os
import re

## Recode PrepDocs

In [117]:
class TextPrep:
    """
    Class for either tokenizing or lemmatizing a corpus of documents
    
    key_words is a list of words you're interested in locating in the text
    
    key_synonyms is meant to be a dictionary of synonyms and their associated key tokens.
    This is helpful if used in conjunction with the replace_keyword method so that all
    synonyms can be replaced by a single token.
    
    twitter_preprocess does basic cleaning for twitter text
    
    replace_keyword is used to replace synonyms with a single keyword
    
    tag_tokens will tag keywords found within a text with the appropriate label
    
    spacy_tokenizer and spacy_lemmatizer remove digits, punctuation, stopwords, and returns
    either the token or lemma
    """
    def __init__(self, model = 'en', stopwords = None, key_words = None, key_synonyms = None):
        self.model = model
        self.stopwords = stopwords
        self.key_words = key_words
        self.key_synonyms = key_synonyms
    
    def twitter_preprocess(self, text):
        """
        This functions does some preliminary cleaning for twitter text. it will:
        1. Remove all pound symbols from text
        2. If key_words is defined, isolate key words from hashtags so they can be treated as individual tokens
        3. Remove emoji
        4. Remove websites from the text
        5. Convert to lower case
        """
        # remove pound signs
        #text = re.sub('#', '', text)
        # remnove @ symbol
        #text = re.sub('@', '', text)
        # remove websites
        text = re.sub('https:\S*', '', text)
        # remove all punctuation and digits
        text = re.sub('[^\w\s_]|_|\d','',text)
        # remove excess white space between words
        text = re.sub('\s+', ' ', text)
        # remove leading and trailing white space
        text = text.strip()
        # convert to lower case
        text = text.lower()
        # remove emoji, commented out because regular expressions remove them currently
        #text = demoji.replace(text, '')
        return text
        
    def replace_synonyms(self, keyword, text):
        """
        Called by the tag_keywords method.
        
        Uses the key_synonyms dictionary to locate synonyms and
        replace them with their associated key word. Will not work
        if the key word is not found in the key_synonyms dictionary
        """
        # subset key words dictionary to only those with the selected key word
        dictionary = {key:value for key, value in self.key_synonyms.items() if value == keyword}
        # Create a regular expression  from the dictionary keys
        regex = re.compile("(%s)" % "|".join(map(re.escape, dictionary.keys())))
        # For each match, look-up corresponding value in dictionary
        return regex.sub(lambda mo: dictionary[mo.string[mo.start():mo.end()]], text)
    
    def tag_keywords(self, keyword, text, tag):
        """
        Finds all instances of a user defined token within a specific corpus and tags it to
        a specific group. Pass a list of text documents, and a list of associated tags for those documents
        such as two columns from a data frame. 
        """
        # replace synonyms of the key words with the key word
        text = self.replace_synonyms(keyword, text)
        # isolate the key word with space so it can be treated as an individual token
        text = re.sub(keyword, ' ' + keyword + ' ', text)
        # Add the group label
        text = re.sub(keyword, keyword + "_" + tag, text)
        # eliminate excess whitespace created
        text = re.sub('\s+', ' ', text).strip()
        
        return text
    
    def spacy_tokenizer(self, text):
        """
        Called by the clean_file method
        Tokenizes, returns lower case, removes stop words and punctuation,
        """
        nlp = spacy.load(self.model, disabled = ['tagger', 'parser', 'ner', 'textcat'])
        # tokenize
        text = self.nlp(text)
        # take the token unless it is a stopword
        text = [token.text.lower().strip() for token in text if not token.is_stop]
        return text

    def spacy_lemmatizer(self, text):
        """
        Called by the lemmatize_file method
        Tokenizes, lemmatizes, returns lower case, removes stop words and punctuation
        """
        # reload spacy with pat of speech tagger
        nlp = spacy.load(self.model, disable = ['parser', 'ner', 'textcat'])
        # tokenize
        text = nlp(text)
        # take the lemma unless it is a stopword
        text = [token.lemma_.lower().strip() for token in text if not token.is_stop]
        return text
    
    def multi_lemmatizer(self, text_list, threads):
        """
        Called by the lemmatize_file method
        Tokenizes, lemmatizes, returns lower case, removes stop words and punctuation
        """
        # reload spacy with pat of speech tagger
        nlp = spacy.load(self.model, disable = ['parser', 'ner', 'textcat'])
        
        text_lemmas = []
        for text in nlp.pipe(text_list, n_threads = threads):
            tokens = [token.lemma_.lower().strip() for token in text if not token.is_stop]
            text_lemmas.append(tokens)
        return text_lemmas

### Remove hashtags

In [177]:
def remove_hashtags(self, text):
    text = re.sub('#', '', text)
    return text

### Insert spaces between key words

In [175]:
def isolate_keyword(self, word, text):
    text = re.sub(word, ' ' + word + ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

### Key word dicitonary

In [252]:
# a list of all key tokens
key_tokens = ['trump',
'president',
'healthcare',
'border',
'wall',
'democrat',
'republican',
'liberal',
'conservative',
'abortion',
'clinton',
'sanders',
'socialist',
'economy',
'jobs',
'impeach',
'obama',
'russia',
'mueller',
'collusion',
'military',
'budget',
'market',
'trade',
'vote',
'democracy',
'gun',
'agriculture',
'women',
'business',
'tax',
'medicare',
'police',
'immigration',
'insurance',
'climatechange',
'corrupt',
'electoralcollege',
'judge',
'court',
'gerrymander',
'pelosi',
'mcconnell',
'mikepence',
'citizensunited',
'daca',
'dreamers',
'lgbtq',
'ACA',
'scotus',
'partisan',
'patriot',
'welfare',
'privilege',
'minority',
'islam',
'muslim',
'christian',
'god',
'religion',
'administration',
'politics',
'fair',
'witchhunt',
'warren',
'biden',
'security',
'terrorism',
'defense',
'pentagon',
'homelandsecurity',
'senate',
'wealth',
'american',
'church',
'science',
'stockmarket',
'congress',
'whitehouse',
'constitution',
'federal',
'syria',
'northkorea',
'saudiarabia',
'mexico',
'debt',
'fiscal',
'oil',
'media',
'cnn',
'fox',
'news',
'racist',
'refugee',
'education',
'maga',
'campaign',
'party',
'poll']

f = open("Meta Data/key_tokens.pkl","wb")
pickle.dump(key_tokens,f)
f.close()

In [223]:
# returns the synonyms for each key token
for key, token in key_synonyms.items():
    if token == key_tokens[0]:
        print(key)

donald trump
president trump


In [55]:
# this dicitonary can be used to replace key word synonyms with a single token
# get a list of the keys relevant, pass to the replace_keyword function

key_synonyms = {
    'realdonaldtrump': 'trump',
    'realdonald trump': 'trump',
    'donaldtrump': 'trump',
    'donald trump': 'trump',
    'presidenttrump': 'trump',
    'president trump': 'trump',
    'potus': 'trump',
    'health care': 'healthcare',
    'dems': 'democrat',
    'gop': 'republican',
    'left wing': 'liberal',
    'leftist': 'liberal',
    'progressive': 'liberal',
    'right wing': 'conservative',
    'abort': 'abortion',
    'bernie sanders': 'sanders',
    'bernie': 'sanders',
    ' econ ': ' economy ',
    'economics': 'economy',
    'impeachment': 'impeach',
    'barack obama': 'obama',
    'robert mueller': 'mueller',
    'armed forces': 'military',
    'armed services': 'military',
    'firearm': 'gun',
    'assault rifle': 'gun',
    'climate change': 'climatechange',
    'global warming': 'climatechange',
    'corruption': 'corrupt',
    'corrupted': 'corrupt',
    'electoral college': 'electoralcollege',
    'court justice': 'judge',
    'nancy pelosi': 'pelosi',
    'mitch mcconnell': 'mcconnell',
    'vice president mike pence': 'mikepence',
    'vice president pence' : 'mikepence',
    'mike pence': 'mikepence',
    ' pence ': ' mikepence ',
    'citizens united': 'citizensunited',
    'deferred action for childhood arrivals': 'daca',
    'lgbt': 'lgbtq',
    'affordable care act': 'aca',
    'obamacare': 'aca',
    'obama care': 'aca',
    'supreme court of the united states': 'scotus',
    'us supreme court': 'scotus',
    'united states supreme court': 'scotus',
    'supreme court': 'scotus',
    'islamic': 'islam',
    'christianity': ' christian',
    'political': 'politics',
    'witch hunt': 'witchhunt',
    'elizabeth warren': 'warren',
    'counterterrorism': 'terrorism',
    'counter-terrorism': 'terrorism',
    'homeland security': 'homelandsecurity',
    'us senate': 'senate',
    'united states senate': 'senate',
    'senatorial': 'senate',
    'rich': 'wealthy',
    'wealthy': ' wealth',
    'billionaire': 'wealthy',
    'temple': 'church',
    'stock market': 'stockmarket',
    'stocks': 'stockmarket',
    'congressional': 'congress',
    'white house': 'whitehouse',
    'north korean': 'northkorea',
    'north korea': 'northkorea',
    'saudi arabian': 'saudiarabia',
    'saudi arabia': 'saudiarabia',
    ' saudi ': ' saudiarabia ',
    'mexican': 'mexico',
    'fox news': 'fox',
    'cable news network': 'cnn',
    'make america great again': 'maga'
}

f = open("Meta Data/synonym_dictionary.pkl","wb")
pickle.dump(key_synonyms,f)
f.close()

In [219]:
def replace_keyword(token, replacement, text):
    text = re.sub(token, replacement, text)
    text = re.sub('\s+', ' ', text)
    return text

In [216]:
text = 'president donald trump is an american hero'
for key in list(key_words)[0:2]:
    text = replace_keywords(key, key_words[key], text)
    print(text)

### Refine stopwords list

short list of stopwords, does not deal with contractions

In [253]:
stopwords = ['i','a', 'about', 'am', 'an', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or', 'that', 
             'the', 'this', 'to', 'was', 'what', 'when', 'where', 'who', 'will', 'with', 'the']

f = open("Meta Data/stopwords.pkl","wb")
pickle.dump(stopwords,f)
f.close()

In [225]:
prep.stopwords

['‘re',
 'formerly',
 'everything',
 'while',
 'eleven',
 'enough',
 "'ll",
 'am',
 'seeming',
 "n't",
 'hence',
 'will',
 'did',
 'ever',
 'make',
 'with',
 'i',
 'whoever',
 'what',
 'whenever',
 'noone',
 'where',
 'anyway',
 'after',
 'unless',
 'n‘t',
 'everyone',
 'done',
 'then',
 'each',
 'whereafter',
 'show',
 'any',
 'so',
 'she',
 'that',
 'out',
 'always',
 'meanwhile',
 'next',
 'an',
 'elsewhere',
 'nobody',
 'few',
 'could',
 'therein',
 'does',
 'front',
 'himself',
 'were',
 'thereby',
 'such',
 'seems',
 'or',
 'yours',
 'beside',
 'another',
 'quite',
 'and',
 'name',
 'those',
 'do',
 'however',
 'except',
 'until',
 '’m',
 'sixty',
 'ca',
 'of',
 '‘s',
 'indeed',
 'together',
 'many',
 'moreover',
 'they',
 'around',
 "'re",
 '’s',
 'too',
 'under',
 'via',
 'toward',
 'anyhow',
 'therefore',
 'hereupon',
 'been',
 '‘m',
 'amongst',
 'should',
 'side',
 'hers',
 'hereby',
 'several',
 'when',
 'anyone',
 'namely',
 'one',
 'your',
 'please',
 'nothing',
 'otherwis

Longer list. spacy's default list of stop words minus a few key terms that may be relevant

In [370]:
from __future__ import unicode_literals

# Stop words
STOP_WORDS = set(
    """
a about after afterwards again all almost
already also although always am among amongst amount an and another any anyhow
anyone anything anyway anywhere are around as at

be became because become becomes becoming been before beforehand
being beside besides beyond both bottom but by

can cannot ca could

did do does doing done down during

each eight either eleven else elsewhere empty enough even ever every
everyone everything everywhere except

few fifteen fifty first five for former formerly forty four from front full
further

get go

had has have he hence her here hereafter hereby herein hereupon hers herself
him himself his how however hundred

i if in indeed into is it its itself

last latter latterly least less

many may me meanwhile might mine more moreover most mostly move much
must my myself

name namely neither nevertheless next nine no nobody none noone nor not
nothing now nowhere

of off often on once one only onto or other others otherwise our ours ourselves
out over own

part per perhaps please put

quite

rather re really regarding

same say see seem seemed seeming seems serious several she should show side
since six sixty so some somehow someone something sometime sometimes somewhere
still such

take ten than that the their them themselves then thence there thereafter
thereby therefore therein thereupon these they third this those though three
through throughout thru thus to together too top toward towards twelve twenty
two

under until up unless upon used using

various very very via was we well were what whatever when whence whenever where

whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would

yet you your yours yourself yourselves
""".split()
)

contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
STOP_WORDS.update(contractions)

for apostrophe in ["‘", "’"]:
    for stopword in contractions:
        STOP_WORDS.add(stopword.replace("'", apostrophe))
        
f = open("Meta Data/stopwords.pkl","wb")
pickle.dump(STOP_WORDS,f)
f.close()

### Examine Lemmas of key tokens

In [71]:
terms = pd.read_csv('Political terms.csv', header = None)
terms.tail(20)

Unnamed: 0,0
80,constitution
81,federal
82,syria
83,north korea
84,saudi arabia
85,gop
86,mexico
87,debt
88,fiscal
89,oil


In [59]:
for i in terms[0]:
    print(prep.spacy_lemmatizer(text = i))

trump
president
healthcare
border
wall
democrat
republican
liberal
conservative
abortion abort
clinton
sander
socialist
economy
job
impeachment
potus
obama
russia
mueller
collusion
military
budget
market
trade
vote
democracy
gun
firearm
assualt rifle
agriculture
woman
business
tax
medicare
police
immigration
insurance
climate
corrupt
progressive
electoral college
judge
court
gerrymander
pelosi
mcconnell
penny
citizen unite
dreamer
lgbtq
obamacare
scotus
partisan
patriot
welfare
privilege
minority
muslim
god
religion
administration
politic

fair
witch hunt
warren
biden
security
terrorism
pentagon
senate
rich
american
church
science
supreme court
stock
congress
white house
constitution
federal
syria
north korea
saudi arabia
gop
mexico
debt
fiscal
oil
medium
news
racist
refugee
education
maga
mitch
campaign
party
poll


## Key token tagger

In [265]:
text = ["find out about how #donald trump lied at https://t.co/asdfk!lasd08", 'this is second test sentance trump', 
        'did you watch the trump impeachment hearings today?']
tag_list = ['D', "R", "D"]

def tag_tokens(text_list, token, tag_list):
    for i in range(len(text_list)):
        text_list[i] = re.sub(token, token + "_" + tag_list[i], text_list[i])

In [266]:
tag_tokens(text, 'trump', tag_list)
text

['find out about how #donald trump_D lied at https://t.co/asdfk!lasd08',
 'this is second test sentance trump_R',
 'did you watch the trump_D impeachment hearings today?']

## Remove Emoji

In [268]:
aggregated_tweets = pd.read_csv('Data/aggregated_tweets.csv')
text = aggregated_tweets['text']
text[4]

In [326]:
test = 'At 123145 what point did due process become "cosmetic" in this country? I don\'t remember learning that in law school. 🤔 https://t.co/ew1hbvbWBj'

In [295]:
import demoji

In [307]:
demoji.replace(test, '')

'At what point did due process become "cosmetic" in this country? I don\'t remember learning that in law school.  https://t.co/ew1hbvbWBj'

# Testing

In [443]:
test_tokens = ['trump', 'token']
text = "find out about how @realdonaldtrump lied about obamacare at https://t.co/asdfk!lasd08"
tester = 1
def twitter_preprocess(text):
    """
    This functions does some preliminary cleaning for twitter text. it will:
    1. Remove all pound symbols from text
    2. Isolate key words located within hashtags so they can be treated as individual tokens
    3. Remove websites from the text
    """
    # remove pound signs
    text = re.sub('#', '', text)
    # isolate key words located in hashtags
    if tester is not None:
        for token in key_tokens:
            text = re.sub(token, ' ' + token + ' ', text)
            text = re.sub('\s+', ' ', text)
    # remove websites
    text = re.sub('https:\S*', '', text)
    print(text)

twitter_preprocess(text)

find out about how @realdonald trump lied about obama care at 


In [319]:
tokenizer = spacy.load('en', disable = ['tagger', 'parser', 'ner', 'textcat'])

In [349]:
test_strip = tokenizer(test)

test_strip = [token for token in test_strip if not token.is_digit | token.is_punct | token.is_stop]
test_strip

[point,
 process,
 cosmetic,
 country,
 remember,
 learning,
 law,
 school,
 🤔,
 https://t.co/ew1hbvbWBj]

## Full test

In [112]:
# load in data
file = open('Meta Data/key_tokens.pkl', 'rb')
key_tokens = pickle.load(file)

file = open('Meta Data/synonym_dictionary.pkl', 'rb')
synonyms = pickle.load(file)

file = open('Meta Data/stopwords.pkl', 'rb')
stopwords = pickle.load(file)

meta_data = pd.read_csv('Meta Data/meta_data.csv')

tweet_df = pd.read_csv('Data/aggregated_tweets.csv')


# merge data with meta data
tweet_df = pd.merge(tweet_df, meta_data, how = 'inner', on = 'user_id')

tweets = tweet_df['text']
labels = tweet_df['party']

# initialize parser
prep = PrepDocs(stopwords = stopwords, key_tokens = key_tokens, key_synonyms = synonyms)

In [113]:
%%time
tweets = [prep.twitter_preprocess(tweet) for tweet in tweets]
#tweets = [prep.replace_keyword(prep.key_tokens[0], tweet) for tweet in tweets[0:20]]
#tweets = [prep.isolate_key_tokens(tweet) for tweet in tweets]
tweets = [prep.tag_keywords('impeach', tweet, 'R') for tweet in tweets]

#tweets = [prep.spacy_lemmatizer(tweet) for tweet in tweets[0:100]]
tweets = prep.multi_lemmatizer(tweets[0:100], threads = 6)
tweets

CPU times: user 4.25 s, sys: 31.1 ms, total: 4.28 s
Wall time: 4.26 s


[['not',
  'long',
  'vote',
  'impeach_r',
  'inquiry',
  'open',
  'transparent',
  'today',
  'h',
  're',
  'learn',
  'numerous',
  'closed',
  'deposition',
  'take',
  'place',
  'week',
  's',
  'surprising',
  'stoptheschiffshow'],
 ['look',
  'forward',
  'show',
  'repmarkgreen',
  'fl',
  'week',
  'amp',
  'introduce',
  'outstanding',
  'veteran',
  'community',
  'florida'],
 ['yes',
  'vote',
  'resolution',
  'give',
  'stamp',
  'approval',
  'process',
  'damage',
  'repair',
  'watch',
  'floor',
  'speech',
  'morning',
  'florida',
  'fl',
  'impeach_r',
  'inquiry'],
 ['house',
  'vote',
  'resolution',
  'formalize',
  'impeach_r',
  'inquiry',
  'realdonaldtrump',
  'damage',
  'clear',
  'democrat',
  'interested',
  'precedent',
  'process',
  'goal',
  'remove',
  'potus',
  'office',
  'period'],
 ['dc',
  'team',
  'happy',
  'help',
  'theuso',
  'afternoon',
  'total',
  'care',
  'package',
  'assemble',
  'deployed',
  'service',
  'member',
  'thank',

In [116]:
type(tweets[0][0])

str

In [93]:
# individual test case for pipeline allowing me to test various manipulations
test_text = '  At 123145 what point @ # () *||+=did due process become potus "cosmetic" in this country? #presidenttrump I dont remember that in law school. 🤔 https://t.co/ew1hbvbWBj Donald Trump  '
test_text = prep.twitter_preprocess(test_text)
#test_text = multiple_replace('trump', test_text)
test_text = prep.tag_keywords('trump', test_text, 'R')
test_text

'at what point did due process become trump_R cosmetic in this country trump_R i dont remember that in law school trump_R'

In [400]:
test_text

In [None]:
for i in range(len(text_list)):
    text_list[i] = re.sub(token, token + "_" + tag_list[i], text_list[i])

In [439]:
prep.key_tokens

['trump',
 'president',
 'healthcare',
 'border',
 'wall',
 'democrat',
 'republican',
 'liberal',
 'conservative',
 'abortion',
 'clinton',
 'sanders',
 'socialist',
 'economy',
 'jobs',
 'impeach',
 'obama',
 'russia',
 'mueller',
 'collusion',
 'military',
 'budget',
 'market',
 'trade',
 'vote',
 'democracy',
 'gun',
 'agriculture',
 'women',
 'business',
 'tax',
 'medicare',
 'police',
 'immigration',
 'insurance',
 'climatechange',
 'corrupt',
 'electoralcollege',
 'judge',
 'court',
 'gerrymander',
 'pelosi',
 'mcconnell',
 'mikepence',
 'citizensunited',
 'daca',
 'dreamers',
 'lgbtq',
 'ACA',
 'scotus',
 'partisan',
 'patriot',
 'welfare',
 'privilege',
 'minority',
 'islam',
 'muslim',
 'christian',
 'god',
 'religion',
 'administration',
 'politics',
 'fair',
 'witchhunt',
 'warren',
 'biden',
 'security',
 'terrorism',
 'defense',
 'pentagon',
 'homelandsecurity',
 'senate',
 'wealth',
 'american',
 'church',
 'science',
 'stockmarket',
 'congress',
 'whitehouse',
 'constit