In [15]:
from nltk.stem.porter import PorterStemmer
import nltk
import re
from num2words import num2words
import plotly.graph_objects as go

In [24]:
def preprocess(text):
    """Run the full cleaning pipeline on a raw string and return a token list.

    The input is lower-cased, pushed through the string-level cleaners in a
    fixed order, tokenized, and then filtered and lemmatized token by token.
    """
    # String -> string stages. The order is significant and must not change.
    string_stages = (
        remove_apostrophes,
        remove_punctuation,
        remove_single_chars,
        remove_links,
        remove_aux_verbs,
        remove_search_terms,
        numbers_to_words,
    )

    cleaned = text.lower()
    for stage in string_stages:
        cleaned = stage(cleaned)

    # Token-list stages.
    tokens = tokenize(cleaned)
    tokens = remove_stop_words(tokens)
    return stemmer(tokens)

['achowardwriter', 'tappytapin', 'mainerbee', 'naomiaklein', 'whats', 'like', 'youre', 'expert', 'politician', 'done', 'favored', 'politician', 'done', 'make', 'better', 'country', 'world']


In [33]:
from nltk.corpus import stopwords
# Stop words to drop. Personal pronouns such as 'i', 'we', 'you', 'they',
# 'me', 'my' and 'us' are not in this list and are therefore kept.
_STOP_WORDS_BASE = (
    'myself', 'our', 'ours', 'ourselves', 'your', 'yours',
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
    "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
    'them', 'their', 'theirs', 'themselves', 'what', 'which',
    'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am',
    'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
    'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the',
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to',
    'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    'again', 'further', 'then', 'once', 'here', 'there', 'when',
    'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
    'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just',
    'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
    've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
)

# preprocess() removes apostrophes before tokenizing, so a contraction reaches
# this filter as e.g. "dont" rather than "don't". Include the apostrophe-free
# spelling of every entry so those tokens are filtered as well.
_STOP_WORDS = frozenset(_STOP_WORDS_BASE) | frozenset(
    word.replace("'", "") for word in _STOP_WORDS_BASE
)


def remove_stop_words(text):
    """Return the tokens of ``text`` that are not stop words.

    Args:
        text: iterable of string tokens (despite the name, not a raw string).

    Returns:
        A new list with the surviving tokens in their original order.
        Matching is exact and case-sensitive; every stop word is lower-case.
    """
    return [word for word in text if word not in _STOP_WORDS]

['I', 'They', 'We', 'My', 'Us', 'You', 'Me']


In [4]:
def numbers_to_words(text):
    """Replace every run of digits in ``text`` with its spelled-out form.

    Each maximal digit run is passed to ``num2words`` as a string, and any
    character other than letters, digits and spaces (hyphens, commas, ...)
    is removed from the result before it is substituted back.

    Every match is substituted in place during a single pass. Replacing
    numbers one after another with ``str.replace`` would rewrite digits
    inside longer numbers: with "1 and 12", replacing "1" first leaves
    "one2" behind.

    Args:
        text: input string.

    Returns:
        The string with all digit runs spelled out. No spaces are added
        around a replacement, so "covid19" becomes "covidnineteen".
    """
    def _spell_out(match):
        spelled = num2words(match.group(0))
        return re.sub('[^A-Za-z0-9 ]+', '', spelled)

    return re.sub(r'\d+', _spell_out, text)

In [5]:
def remove_single_chars(text):
    """Drop one-character words from ``text``, keeping the pronoun "i"/"I".

    Words are obtained by splitting on whitespace. Every kept word is emitted
    with one leading space, so a non-empty result always starts with a space.
    """
    kept = [
        token for token in text.split()
        if len(token) > 1 or token.lower() == "i"
    ]
    return "".join(" " + token for token in kept)

In [6]:
# Remove Concordance
def remove_concordance(filename, pronoun):
    """Extract the text of a concordance file, preprocess it and save it.

    For every line of ``filename`` that contains the marker ``line=``, the
    text after the first marker is kept, reduced to letters, digits,
    apostrophes and spaces, and collected. The collected fragments are joined
    with single spaces, run through ``preprocess`` and handed to
    ``writeToFile`` (defined outside this section).

    Args:
        filename: path of the UTF-8 concordance file to read.
        pronoun: label inserted into the output file name.
    """
    marker = "line="
    fragments = []
    with open(filename, 'r', encoding='utf8') as f:
        for line in f:
            start = line.find(marker)
            if start == -1:
                # No marker on this line, so there is nothing to extract.
                continue
            # Slice the marker off as a prefix. str.strip("line=") would
            # instead strip any of the characters l/i/n/e/= from both ends
            # and eat into the actual content.
            content = line[start + len(marker):]
            content = re.sub("[^A-Za-z0-9' ]+", '', content).strip()
            if content:
                fragments.append(content)

    # Join with a space so the last word of one line does not fuse with the
    # first word of the next (the newline is removed by the regex above).
    lines = " ".join(fragments)

    # Preprocess the combined text and write it to the output file.
    writeToFile('../textFiles/Pronoun/clean/stripped-concordance-' + pronoun + '.txt', preprocess(lines))

In [7]:
# Remove apostrophes
def remove_apostrophes(text):
    """Return ``text`` with every ASCII apostrophe (') deleted."""
    # Splitting on the apostrophe and re-joining drops each occurrence.
    return "".join(text.split("'"))

In [8]:
# Remove Auxiliary Verbs
def remove_aux_verbs(stripped_string):
    """Remove the words "have", "be", "been" and "being" from a string.

    The string is split on whitespace, words are compared case-insensitively,
    and the remaining words are joined back with single spaces.
    """
    aux_verbs = ("have", "be", "been", "being")
    kept = []
    for token in stripped_string.split():
        if token.lower() in aux_verbs:
            continue
        kept.append(token)
    return ' '.join(kept)

In [9]:
# Remove Links
def remove_links(text):
    """Delete every "http" followed by one or more non-whitespace characters."""
    link_pattern = re.compile(r'http\S+')
    return link_pattern.sub('', text)

In [10]:
# Tokenizes words in document text
def tokenize(text):
    """Split ``text`` into tokens with ``nltk.word_tokenize`` and return them."""
    return nltk.word_tokenize(text)

In [21]:
# Lemmatizes tokenized words (despite the function name `stemmer`)
from nltk.stem import WordNetLemmatizer
 

def stemmer(text):
    """Lemmatize each token of ``text`` and return the results as a list.

    Despite its name, this function uses ``WordNetLemmatizer`` and does not
    apply a stemmer.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in text]

In [12]:
def remove_punctuation(text):
    """Strip punctuation and underscores from ``text``.

    ``text`` is iterated item by item (character by character for a string).
    Every character that is neither a word character nor whitespace is
    removed from each item, as are underscores. The cleaned items are then
    concatenated with no separator.
    """
    unwanted = re.compile(r'([^\w\s\d]|_)')
    return "".join(unwanted.sub('', piece) for piece in text)

In [13]:
# Remove Search Terms
def remove_search_terms(stripped_string):
    """Remove the query terms used to collect the data from a string.

    Terms are removed one after another in the listed order, and the string
    is stripped of leading and trailing whitespace after each removal.
    Matching ignores case: ``preprocess`` lower-cases the text before calling
    this function, so case-sensitive matching could never remove mixed-case
    entries such as "Trump", "realDonald" or "fact Checking".

    NOTE(review): terms are matched as substrings, not whole words, so
    "newspaper" becomes "paper". Whitespace inside the string is left as is,
    so removing a term from the middle can leave a double space.

    NOTE(review): ``preprocess`` removes punctuation before this step, so
    "made-up" arrives as "madeup" and is not matched there. Confirm whether
    "madeup" should be added.

    Args:
        stripped_string: text to clean.

    Returns:
        The text with every search term removed.
    """
    search_terms = (
        "conspiracy", "propaganda", "Trump", "fake", "news", "realDonald",
        "misinformation", "fake news", "disinformation", "active measures",
        "subversion", "interference", "influence", "deep state",
        "fabrication", "manipulate", "deceive", "useful idiots",
        "mainstream media", "populism", "untrustworthy", "hoax", "made-up",
        "bogus", "inaccurate", "doctored", "fact Checking",
        "eu false", "eu fraud", "eu hoax", "eu lies", "eu rumours", "eu troll",
        "europe false", "europe fraud", "europe hoax", "europe lies",
        "europe rumours", "europe troll",
        "european false", "european fraud", "european hoax", "european lies",
        "european rumours", "european troll",
    )

    for term in search_terms:
        # re.escape keeps each term literal (e.g. the hyphen in "made-up").
        stripped_string = re.sub(
            re.escape(term), "", stripped_string, flags=re.IGNORECASE
        ).strip()
    return stripped_string