### Notebook for building the text pre-processing pipeline

In [5]:
import pandas as pd
from datetime import datetime

## Recode PrepDocs

In [53]:
# spacy is only imported further down the notebook, so import it here to keep
# this cell runnable on a fresh kernel (Restart & Run All)
import spacy

# quick check that the tokenizer-only pipeline loads
spacy.load('en', disable = ['tagger', 'parser', 'ner', 'textcat'])

<spacy.lang.en.English at 0x7fa407fa7438>

In [35]:
import spacy
import string
import os
import re
class PrepDocs:
    """
    Class for either tokenizing or lemmatizing a corpus of documents

    spacy_tokenizer / clean_file do a basic cleaning that returns lower case,
    removes stop words, and removes punctuation and numbers

    spacy_lemmatizer / lemmatize_file do the basic cleaning plus lemmatization

    model: name of the spaCy model handed to spacy.load
        NOTE(review): the 'en' shortcut is a spaCy 2.x link; spaCy 3 needs a
        full package name such as 'en_core_web_sm' -- confirm installed version
    disabled: pipeline components switched off for plain tokenizing
    input_dir: directory where text documents to be cleaned are stored; when
        None the `doc` argument of the *_file methods is used as the path
    output_dir: directory where cleaned documents are to be saved (required by
        the *_file methods)
    keep_words: word or iterable of words to take out of the stop word list so
        that they survive cleaning
    """
    def __init__(self, model = 'en', disabled = ['tagger', 'parser', 'ner', 'textcat'],
                 input_dir = None, output_dir = None, keep_words = ()):
        self.model = model
        # copy so the shared default list is never handed out by reference
        self.nlp = spacy.load(model, disable = list(disabled))
        # pipeline with the tagger enabled; loaded on first use by
        # _lemma_pipeline and then reused instead of reloading on every call
        self._lemma_nlp = None
        self.stopwords = list(spacy.lang.en.stop_words.STOP_WORDS)
        # the original bare `self.stopwords.remove()` raised TypeError on every
        # instantiation; the words to keep are now passed in explicitly
        if isinstance(keep_words, str):
            keep_words = [keep_words]
        for word in keep_words:
            if word in self.stopwords:
                self.stopwords.remove(word)
        self.not_allowed = string.punctuation + string.digits
        self.input_dir = input_dir
        self.output_dir = output_dir

    def _lemma_pipeline(self):
        """
        Returns the pipeline with the part of speech tagger enabled,
        loading it the first time it is needed
        """
        if self._lemma_nlp is None:
            self._lemma_nlp = spacy.load(self.model, disable = ['parser', 'ner', 'textcat'])
        return self._lemma_nlp

    def _keep_token(self, token):
        """
        True when a lower-cased, stripped token should survive cleaning
        """
        if token in self.stopwords or token.isdigit():
            return False
        # not_allowed is a string used as a character set: test every character
        # rather than `token in not_allowed`, which is a substring test and lets
        # tokens such as "..." or "--" through. Empty tokens are dropped too.
        return not all(char in self.not_allowed for char in token)

    def _read_doc(self, doc):
        """
        Reads a document, looking inside input_dir when one was given
        """
        path = doc if self.input_dir is None else os.path.join(self.input_dir, doc)
        with open(path, errors = "ignore") as text_file:
            return text_file.read()

    def _write_clean(self, doc, text):
        """
        Writes text to <output_dir>/<name of doc without extension>_clean.txt
        """
        if self.output_dir is None:
            raise ValueError("output_dir must be set before cleaned files can be written")
        # splitext instead of doc[0:-4] so extensions of any length are handled;
        # basename so the result always lands directly inside output_dir
        stem = os.path.splitext(os.path.basename(doc))[0]
        out_path = os.path.join(self.output_dir, stem + '_clean' '.txt')
        with open(out_path, 'w', errors = 'replace') as text_file:
            text_file.write(text)

    def spacy_tokenizer(self, text):
        """
        Called by the clean_file method
        Tokenizes, returns lower case, removes stop words, punctuation and
        numbers. Returns the surviving tokens joined by single spaces.
        """
        # drop subsection numbers
        text = re.sub(r"\d\.", "", text)
        # tokenize, then lower-case and strip every token
        tokens = (tok.text.lower().strip() for tok in self.nlp(text))
        # drop stopwords, punctuation and digits, rejoin to a single string
        return " ".join(tok for tok in tokens if self._keep_token(tok))

    def clean_file(self, doc):
        """
        Used for cleaning a .txt file
        Reads doc (relative to input_dir when set) and writes the tokenized
        text to output_dir as <name>_clean.txt
        """
        text = self._read_doc(doc)
        text = self.spacy_tokenizer(text)
        self._write_clean(doc, text)

    def spacy_lemmatizer(self, text):
        """
        Called by the lemmatize_file method
        Tokenizes, lemmatizes, returns lower case, removes stop words,
        punctuation and numbers. Returns the lemmas joined by single spaces.
        """
        nlp = self._lemma_pipeline()
        # drop subsection numbers
        text = re.sub(r"\d\.", "", text)
        lemmas = []
        for tok in nlp(text):
            # filter on the lower-cased surface form so that capitalised stop
            # words ("The", "We") are dropped as well
            if not self._keep_token(tok.text.lower().strip()):
                continue
            lemma = tok.lemma_.lower().strip()
            # drop digits and anything that is empty once stripped
            if lemma and not lemma.isdigit():
                lemmas.append(lemma)
        # rejoins the tokens to a single string
        return " ".join(lemmas)

    def lemmatize_file(self, doc):
        """
        Used for lemmatizing a .txt file
        Reads doc (relative to input_dir when set) and writes the lemmatized
        text to output_dir as <name>_clean.txt
        """
        text = self._read_doc(doc)
        text = self.spacy_lemmatizer(text)
        self._write_clean(doc, text)

## Check Lemmas

In [146]:
# PrepDocs takes the spaCy model name and the disabled components as its
# positional arguments, so 'none' placeholders would be handed to spacy.load;
# rely on the defaults instead
prep = PrepDocs()
prep.spacy_lemmatizer(text = 'we need more scotus\'s')

'need scotus'

### The following terms need to be converted via regular expressions:
Synonyms should be converted to a single term, and words that will not appear in both the congressional and troll data sets should be eliminated.
To catch compound words built from my key tokens, such as "moscowmitch", replace each "token" with " token " via regular expression.


impeachment > impeach

bernie sanders > sanders

bernie > sanders

abort > abortion

global warming > climate change

climate change > climatechange

electoral college > electoralcollege

pence > mikepence

mike pence > mikepence

citizens united > citizensunited

lgbt > lgbtq

gay > lgbtq

lesbian > lgbtq

homosexual > lgbtq

supreme court > scotus

witch hunt > witchhunt

north korea > northkorea

saudi arabia > saudi

media > newsmedia

corruption > corrupt

obamacare > ACA

affordable care act > ACA

donald trump > trump

In [71]:
# one political term per row, no header line in the file
terms = pd.read_csv(filepath_or_buffer='Political terms.csv', header=None)
# peek at the last 20 terms
terms.tail(n=20)

Unnamed: 0,0
80,constitution
81,federal
82,syria
83,north korea
84,saudi arabia
85,gop
86,mexico
87,debt
88,fiscal
89,oil


In [59]:
# show the lemmatized form of every term in the first column
for term in terms[0]:
    lemmatized = prep.spacy_lemmatizer(term)
    print(lemmatized)

trump
president
healthcare
border
wall
democrat
republican
liberal
conservative
abortion abort
clinton
sander
socialist
economy
job
impeachment
potus
obama
russia
mueller
collusion
military
budget
market
trade
vote
democracy
gun
firearm
assualt rifle
agriculture
woman
business
tax
medicare
police
immigration
insurance
climate
corrupt
progressive
electoral college
judge
court
gerrymander
pelosi
mcconnell
penny
citizen unite
dreamer
lgbtq
obamacare
scotus
partisan
patriot
welfare
privilege
minority
muslim
god
religion
administration
politic

fair
witch hunt
warren
biden
security
terrorism
pentagon
senate
rich
american
church
science
supreme court
stock
congress
white house
constitution
federal
syria
north korea
saudi arabia
gop
mexico
debt
fiscal
oil
medium
news
racist
refugee
education
maga
mitch
campaign
party
poll
