### Notebook for building the text pre-processing pipeline

Rough outline of the pipeline:
1. remove punctuation, numbers, capitalization from all text. Return full string rather than individual tokens
2. isolate the key word of interest in case it appears in any hashtags
3. replace alternatives/synonyms with a single token for that word
4. label that token based on party affiliation
5. remove stopwords, tokenize, lematize text
6. train word2vec model



In [5]:
import pandas as pd
import pickle
import demoji
from datetime import datetime

## Recode PrepDocs

In [53]:
spacy.load('en', disable = ['tagger', 'parser', 'ner', 'textcat'])

<spacy.lang.en.English at 0x7fa407fa7438>

In [35]:
import spacy
import string
import os
import re
class PrepDocs:
    """
    Class for either tokenizing or lemmatizing a corpus of documents
    
    key_tokens is a list of tokens your interested in locating in the text
    
    key_synonyms is meant to be a dictionary of synonyms and their associated key tokens.
    This is helpful if used in conjunction with the replace_keyword method so that all
    synonyms can be replaced by a single token.

    self.clean_text will do a basic cleaning that returns lower case, removes
    stop words, and removes punctuation

    slef.lemmatize_text will do the basic cleaning plus lemmatization

    """
    def __init__(self, model = 'en', disabled = ['tagger', 'parser', 'ner', 'textcat'], key_tokens = None, key_synonyms = None):
        self.nlp = spacy.load(model, disable = disabled)
        self.stopwords = list(spacy.lang.en.stop_words.STOP_WORDS)
        self.stopwords.remove()
        self.not_allowed = string.punctuation + string.digits
        self.key_tokens = key_tokens
        self.key_synonyms = key_words

    def spacy_tokenizer(self, text):
        """
        Called by the clean_file method
        Tokenizes, returns lower case, removes stop words and punctuation,
        """
        # drop subsection numbers
        text = re.sub(r"\d\.", "", text)
        # tokenize
        text = self.nlp(text)
        # take the lemma unless it is a pronoun
        text = [tok.text.lower().strip() for tok in text]
        # drop digits
        text = [token for token in text if not token.isdigit()]
        # drop stopwords and punctuation
        text = [tok for tok in text if (tok not in self.stopwords and tok not in self.not_allowed)]
        # rejoin the tokens to a single string
        text = " ".join(text)
        return text

    def spacy_lemmatizer(self, text):
        """
        Called by the lemmatize_file method
        Tokenizes, lemmatizes, returns lower case, removes stop words and punctuation
        """
        # reload spacy with pat of speech tagger
        nlp = spacy.load('en', disable = ['parser', 'ner', 'textcat'])
        # tokenize
        text = nlp(text)
        # drop stopwords
        # take the lemma unless it is a stopword
        text = [tok.lemma_.lower().strip() for tok in text if (tok.text not in self.stopwords and tok.text not in self.not_allowed)]
        # drop digits
        text = [token for token in text if not token.isdigit()]
        # rejoins the tokens to a single string
        text = " ".join(text)
        return text
    
    def twitter_preprocess(self, text):
        """
        This functions does some preliminary cleaning for twitter text. it will:
        1. Remove all pound symbols from text
        2. If key_tokens is defined, isolate key words from hashtags so they can be treated as individual tokens
        3. Remove emoji
        4. Remove websites from the text
        """
        # remove pound signs
        text = re.sub('#', '', text)
        # isolate key words located in hashtags
        if self.key_tokens is not None:
            for token in self.key_tokens
                text = re.sub(token, ' ' + token + ' ', text)
                text = re.sub('\s+', ' ', text)
        # remove emoji
        text = demoji.replace(text, '')
        # remove websites
        text = re.sub('https:\S*', '', text)
        # remove excess white space
        text = re.sub('\s+', ' ', text)
        return text
    
    def replace_keyword(token, replacement, text):
        """
        Finds a specific token and replaces it within a given text.
        Generally this is used to replace synonyms for your key words of interest.
        Use in conjunction with the key words dicitonary
        """
        text = re.sub(token, replacement, text)
        text = re.sub('\s+', ' ', text)
        return text
    
    def tag_tokens(self, text_list, tag_list, token):
        """
        Finds all instances of a user defined token within a specific corpus and tags it to
        a specific group. Pass a list of text documents, and a list of associated tags for those documents
        such as two columns from a data frame. 
        """
        for i in range(len(text_list)):
            text_list[i] = re.sub(token, token + "_" + tag_list[i], text_list[i])

### Remove hashtags

In [177]:
def remove_hashtags(self, text):
    text = re.sub('#', '', text)
    return text

### Insert spaces between key words

In [175]:
def isolate_keyword(self, word, text):
    text = re.sub(word, ' ' + word + ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

### Key word dicitonary

In [252]:
# a list of all key tokens
key_tokens = ['trump',
'president',
'healthcare',
'border',
'wall',
'democrat',
'republican',
'liberal',
'conservative',
'abortion',
'clinton',
'sanders',
'socialist',
'economy',
'jobs',
'impeach',
'obama',
'russia',
'mueller',
'collusion',
'military',
'budget',
'market',
'trade',
'vote',
'democracy',
'gun',
'agriculture',
'women',
'business',
'tax',
'medicare',
'police',
'immigration',
'insurance',
'climatechange',
'corrupt',
'electoralcollege',
'judge',
'court',
'gerrymander',
'pelosi',
'mcconnell',
'mikepence',
'citizensunited',
'daca',
'dreamers',
'lgbtq',
'ACA',
'scotus',
'partisan',
'patriot',
'welfare',
'privilege',
'minority',
'islam',
'muslim',
'christian',
'god',
'religion',
'administration',
'politics',
'fair',
'witchhunt',
'warren',
'biden',
'security',
'terrorism',
'defense',
'pentagon',
'homelandsecurity',
'senate',
'wealth',
'american',
'church',
'science',
'stockmarket',
'congress',
'whitehouse',
'constitution',
'federal',
'syria',
'northkorea',
'saudiarabia',
'mexico',
'debt',
'fiscal',
'oil',
'media',
'cnn',
'fox',
'news',
'racist',
'refugee',
'education',
'maga',
'campaign',
'party',
'poll']

f = open("Meta Data/key_tokens.pkl","wb")
pickle.dump(key_tokens,f)
f.close()

In [223]:
# returns the synonyms for each key token
for key, token in key_synonyms.items():
    if token == key_tokens[0]:
        print(key)

donald trump
president trump


In [221]:
# this dicitonary can be used to replace key word synonyms with a single token
# get a list of the keys relevant, pass to the replace_keyword function

key_synonyms = {
    'donald trump': 'trump',
    'president trump': 'trump',
    'health care': 'healthcare',
    'dems': 'democrat',
    'gop': 'republican',
    'left wing': 'liberal',
    'leftist': 'liberal',
    'progressive': 'liberal',
    'right wing': 'conservative',
    'abort': 'abortion',
    'bernie sanders': 'sanders',
    'bernie': 'sanders',
    ' econ ': ' economy ',
    'economics': 'economy',
    'impeachment': 'impeach',
    'barack obama': 'obama',
    'robert mueller': 'mueller',
    'armed forces': 'military',
    'armed services': 'military',
    'firearm': 'gun',
    'assault rifle': 'gun',
    'climate change': 'climatechange',
    'global warming': 'climatechange',
    'corruption': 'corrupt',
    'corrupted': 'corrupt',
    'electoral college': 'electoralcollege',
    'court justice': 'judge',
    'nancy pelosi': 'pelosi',
    'mitch mcconnell': 'mcconnell',
    'vice president mike pence': 'mikepence',
    'vice president pence' : 'mikepence',
    'mike pence': 'mikepence',
    ' pence ': ' mikepence ',
    'citizens united': 'citizensunited',
    'deferred action for childhood arrivals': 'daca',
    'lgbt': 'lgbtq',
    'affordable care act': 'aca',
    'supreme court of the united states': 'scotus',
    'us supreme court': 'scotus',
    'united states supreme court': 'scotus',
    'supreme court': 'scotus',
    'islamic': 'islam',
    'christianity': ' christian',
    'political': 'politics',
    'witch hunt': 'witchhunt',
    'elizabeth warren': 'warren',
    'counterterrorism': 'terrorism',
    'counter-terrorism': 'terrorism',
    'homeland security': 'homelandsecurity',
    'us senate': 'senate',
    'united states senate': 'senate',
    'senatorial': 'senate',
    'rich': 'wealthy',
    'wealthy': ' wealth',
    'billionaire': 'wealthy',
    'temple': 'church',
    'stock market': 'stockmarket',
    'stocks': 'stockmarket',
    'congressional': 'congress',
    'white house': 'whitehouse',
    'north korean': 'northkorea',
    'north korea': 'northkorea',
    'saudi arabian': 'saudiarabia',
    'saudi arabia': 'saudiarabia',
    ' saudi ': ' saudiarabia ',
    'mexican': 'mexico',
    'fox news': 'fox',
    'cable news network': 'cnn',
    'make america great again': 'maga'
}

f = open("Meta Data/synonym_dictionary.pkl","wb")
pickle.dump(key_words,f)
f.close()

In [219]:
def replace_keyword(token, replacement, text):
    text = re.sub(token, replacement, text)
    text = re.sub('\s+', ' ', text)
    return text

In [216]:
text = 'president donald trump is an american hero'
for key in list(key_words)[0:2]:
    text = replace_keywords(key, key_words[key], text)
    print(text)

### Refine stopwords list

In [253]:
stopwords = ['i','a', 'about', 'am', 'an', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or', 'that', 
             'the', 'this', 'to', 'was', 'what', 'when', 'where', 'who', 'will', 'with', 'the']

f = open("Meta Data/stopwords.pkl","wb")
pickle.dump(stopwords,f)
f.close()

In [225]:
prep.stopwords

['‘re',
 'formerly',
 'everything',
 'while',
 'eleven',
 'enough',
 "'ll",
 'am',
 'seeming',
 "n't",
 'hence',
 'will',
 'did',
 'ever',
 'make',
 'with',
 'i',
 'whoever',
 'what',
 'whenever',
 'noone',
 'where',
 'anyway',
 'after',
 'unless',
 'n‘t',
 'everyone',
 'done',
 'then',
 'each',
 'whereafter',
 'show',
 'any',
 'so',
 'she',
 'that',
 'out',
 'always',
 'meanwhile',
 'next',
 'an',
 'elsewhere',
 'nobody',
 'few',
 'could',
 'therein',
 'does',
 'front',
 'himself',
 'were',
 'thereby',
 'such',
 'seems',
 'or',
 'yours',
 'beside',
 'another',
 'quite',
 'and',
 'name',
 'those',
 'do',
 'however',
 'except',
 'until',
 '’m',
 'sixty',
 'ca',
 'of',
 '‘s',
 'indeed',
 'together',
 'many',
 'moreover',
 'they',
 'around',
 "'re",
 '’s',
 'too',
 'under',
 'via',
 'toward',
 'anyhow',
 'therefore',
 'hereupon',
 'been',
 '‘m',
 'amongst',
 'should',
 'side',
 'hers',
 'hereby',
 'several',
 'when',
 'anyone',
 'namely',
 'one',
 'your',
 'please',
 'nothing',
 'otherwis

## Check Lemmas

In [239]:
prep = PrepDocs('none', 'none')
prep.stopwords = stopwords
prep.not_allowed = string.punctuation + string.digits + 'https:\S*'
prep.spacy_lemmatizer(text = 'this is my website: https://t.co/21alosdjf90')

'-pron- website https://t.co/21alosdjf90'

In [237]:
prep.stopwords

['i',
 'a',
 'about',
 'am',
 'an',
 'are',
 'as',
 'at',
 'be',
 'by',
 'for',
 'from',
 'how',
 'in',
 'is',
 'it',
 'of',
 'on',
 'or',
 'that',
 'the',
 'this',
 'to',
 'was',
 'what',
 'when',
 'where',
 'who',
 'will',
 'with',
 'the',
 'https:\\S*']

### The following terms need to be converted via regular expressions:
1. synonyms should be converted to a single term, words that won't be located in both the congressional and troll data set should be eliminated In order to catch compound words using my key tokens such as "moscowmitch" replace each "token" with " token " via regular expression

2. Remove hastags


impeachment > impeach

bernie sanders > sanders

bernie > sanders

abort > abortion

global warming > climate change

climate change > climatechange

electoral college > electoralcollege

pence > mikepence

mike pence > mikepence

citizens united > citizensunited

lgbt > lgbtq

gay > lgbtq

lesbian > lgbtq

homosexual > lgbtq

supreme court > scotus

witch hunt > witchhunt

north korea > northkorea

saudi arabia > saudi

media > newsmedia

corruption > corrupt

corrupted > corrupt

obamacare > ACA

affordable care act > ACA

donald trump > trump

assault rifle > gun

firearm > gun

potus > president

islamic > islam

racism > racist

mitch mcconnell > mitchmcconnell

mcconnell > mitchmcconnell

nancy pelosi > nancypelosi

pelosi > nancypelosi

taxing > tax

In [71]:
terms = pd.read_csv('Political terms.csv', header = None)
terms.tail(20)

Unnamed: 0,0
80,constitution
81,federal
82,syria
83,north korea
84,saudi arabia
85,gop
86,mexico
87,debt
88,fiscal
89,oil


In [59]:
for i in terms[0]:
    print(prep.spacy_lemmatizer(text = i))

trump
president
healthcare
border
wall
democrat
republican
liberal
conservative
abortion abort
clinton
sander
socialist
economy
job
impeachment
potus
obama
russia
mueller
collusion
military
budget
market
trade
vote
democracy
gun
firearm
assualt rifle
agriculture
woman
business
tax
medicare
police
immigration
insurance
climate
corrupt
progressive
electoral college
judge
court
gerrymander
pelosi
mcconnell
penny
citizen unite
dreamer
lgbtq
obamacare
scotus
partisan
patriot
welfare
privilege
minority
muslim
god
religion
administration
politic

fair
witch hunt
warren
biden
security
terrorism
pentagon
senate
rich
american
church
science
supreme court
stock
congress
white house
constitution
federal
syria
north korea
saudi arabia
gop
mexico
debt
fiscal
oil
medium
news
racist
refugee
education
maga
mitch
campaign
party
poll


## Key token tagger

In [265]:
text = ["find out about how #donald trump lied at https://t.co/asdfk!lasd08", 'this is second test sentance trump', 
        'did you watch the trump impeachment hearings today?']
tag_list = ['D', "R", "D"]

def tag_tokens(text_list, token, tag_list):
    for i in range(len(text_list)):
        text_list[i] = re.sub(token, token + "_" + tag_list[i], text_list[i])

In [266]:
tag_tokens(text, 'trump', tag_list)
text

['find out about how #donald trump_D lied at https://t.co/asdfk!lasd08',
 'this is second test sentance trump_R',
 'did you watch the trump_D impeachment hearings today?']

## Remove Emoji

In [268]:
aggregated_tweets = pd.read_csv('Data/aggregated_tweets.csv')
text = aggregated_tweets['text']
text[4]

In [279]:
test = 'At what point did due process become "cosmetic" in this country? I don\'t remember learning that in law school. 🤔 https://t.co/ew1hbvbWBj'

In [295]:
import demoji

In [307]:
demoji.replace(test, '')

'At what point did due process become "cosmetic" in this country? I don\'t remember learning that in law school.  https://t.co/ew1hbvbWBj'

# Testing

In [250]:
test_tokens = ['trump', 'token']
text = "find out about how #donaldtrumplied at https://t.co/asdfk!lasd08"
tester = 1
def twitter_preprocess(text):
    """
    This functions does some preliminary cleaning for twitter text. it will:
    1. Remove all pound symbols from text
    2. Isolate key words located within hashtags so they can be treated as individual tokens
    3. Remove websites from the text
    """
    # remove pound signs
    text = re.sub('#', '', text)
    # isolate key words located in hashtags
    if tester is not None:
        for token in key_tokens:
            text = re.sub(token, ' ' + token + ' ', text)
            text = re.sub('\s+', ' ', text)
    # remove websites
    text = re.sub('https:\S*', '', text)
    print(text)

twitter_preprocess(text)

find out about how donald trump lied at 
