# 0. Cleaning the Data

- Remove all punctuation.

- Remove all non-alphanumeric characters.

- Convert all symbols to lowercase format. 

In [12]:
import numpy as np
from math import log
import re

class Token:

    def __init__(self, word, f0):
        self.token = word
        self.a_count = f0
        self.b_count = f0
    
def data_cleaner(path, stop_word_path=None): # author Walter and Miguel
    stop_words = None
    
    if stop_word_path:
        print(F"Recurring on {stop_word_path}")
        stop_words = data_cleaner(stop_word_path)
    
#     print(stop_words)
    print("No Stop Words Provided")
    text = "" # define empty text variable
    with open(path, newline='\n') as file: # open text file and read by line
        temp_text = file.readlines() # store read lines in temporary string

    linewordlist = [] # empty list for lines, each line in this list is also a list of words
    for lineno, linedat in enumerate(temp_text): 
        linedat = linedat.strip('''!()-[]{};:'"\, <>./?@#$%^&*_~''') # take out all special trailing characters
        linedat = linedat.lower() # convert all to lowercase
        linewordlist.extend(linedat.split())
    
    linewordlist = np.array(linewordlist)
    
    if stop_words is not None:
        words_removed = 0
        print(F"Attempting to remove {len(stop_words)} Stop Words")
        for stop_word in stop_words:
            linewordlist = linewordlist[linewordlist != stop_word]
        
    for word in linewordlist: # going through each line in the file
        word = re.sub(r'\W+', '', word)  # Apply regular expression to retain all alphanumeric chars.
        text += word + "\n" # append word to the cleaned text string, with a newline
    # write text data to file
   
    file = open("cleaned.txt", "w")
    file.write(text)
    file.close()
    return linewordlist

def wordcount_dict(filepath, fileclass, tokendict, f0=1): # author Walter, final modifications Miguel
#     tokendict = {} # create new empty dictionary
    with open(filepath) as file: # open file
        for line in file: # iterate through all lines (words) in the file
            line = line.strip('\n') # remove the trailing newlines
            if line not in tokendict: # check to create a new dictionary entry if token not already in dict
                token = Token(line, f0)  # Since no token exists, create a new token as entry
                tokendict.update({line: token}) 
                                  
            # elif line in tokendict: # otherwise, if it does exist
            #     count = tokendict.get(line) # take the existing count
            #     count += 1 # iterate by 1
            #     tokendict.update({line : count}) # update in the dictionary
           
            if fileclass == 'a':  # If the file belongs to class `a`
                token = tokendict[line]
                token.a_count += 1  # Update count for `a`, `a_count`
                tokendict[line] = token  # Update tokendict.
            
            elif fileclass == 'b':  # If file belongs to class `b`
                token = tokendict[line]
                token.b_count += 1  # Update count for `b`, `b_count`
                tokendict[line] = token  # Update tokendict.
    # print(tokendict) # print out the dictionary
    return tokendict

In [13]:
class Classifier:  # author Miguel
    def __init__(self):
        self.A = 0
        self.B = 0
        
    def classify(self, apath, bpath, f0, dpath, tokens=dict(), path_stops=None):  # authors Walter and Miguel
        if f0 < 1:
            return -228  # just chose a fun number, if f0 does not match requirements, return error
        
        tokens = wordcount_dict(apath, 'a', tokens, f0)  # populate dictionary with a document data
        tokens = wordcount_dict(bpath, 'b', tokens, f0)  # populate dictionary with a document data
        data_words = data_cleaner(dpath, stop_word_path="data/stop_words.txt")  # clean the test document
        for word in data_words:  # check all words in test document
            if word in tokens.keys():  # compare to dictionary, increment as appropriate
                self.A += log(tokens[word].a_count)  # increment A if A found
                self.B += log(tokens[word].b_count)  # increment B if B found
        
        if self.A > self.B:
            return F"a, {self.A}, {self.B}"
        elif self.A < self.B:
            return F"b, {self.A}, {self.B}"
        elif self.A == self.B:
            return F"ab, {self.A}, {self.B}"

In [14]:
phase = input("Phase: ")
if phase == '1':
    path = input("Path to data file: ")
    data_cleaner(path, stop_word_path="data/stop_words.txt")
    pass

elif phase == '2':
    path = input("Path to cleaned text file: ")
    tokendict = wordcount_dict(path, 'a')
    pass

elif phase == '3':
    classifier = Classifier()
    path_stops = input("Path to text file of Stop Words: ") or "data/stop_words.txt"
    path_a = input("Path to text file of Class A Docs: ") or "data/A/Combined.txt"
    path_b = input("Path to text file of Class B Docs: ") or "data/B/Combined.txt"
    f_0 = int(input("Please enter a positive non-zero number for f_0: ") or '1')
    path_d = input("Path to text file of document to classify: ") or "data/test_speeches/obama/a223.txt"
    dictionary = {}
    print("\nRunning Classifier. 'a' - Republican Speaker. 'b' - Democrat Speaker")
    print(F"A_PATH: {path_a}\nB_PATH: {path_b}\nD_PATH: {path_d}")
    document_class = classifier.classify(path_a, path_b, f_0, path_d, dictionary, path_stops=path_stops) 
    print(document_class)
    pass

else:
    print(F"Sorry, phase {phase} has not yet been implemented")



Phase: 3
Path to text file of Stop Words: 
Path to text file of Class A Docs: 
Path to text file of Class B Docs: 
Please enter a positive non-zero number for f_0: 
Path to text file of document to classify: 

Running Classifier. 'a' - Republican Speaker. 'b' - Democrat Speaker
A_PATH: data/A/Combined.txt
B_PATH: data/B/Combined.txt
D_PATH: data/test_speeches/obama/a223.txt
Recurring on data/stop_words.txt
No Stop Words Provided
No Stop Words Provided
Attempting to remove 851 Stop Words
a, 8828.558262166202, 8448.489085748668
