# Pre-process Text

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
from sklearn.model_selection import train_test_split
import random

## Functions

### Data Cleaning

In [2]:
def remove_urls (text, replace_token='<URL>', debug=False):
    """ Replace URLs in text with a replacement token
    Args:
        text (str): Text to clean
        replace_token (str): String to replace URL
        debug (bool): Whether to print information
    Returns:
        text (str): Cleaned string or original if it fails
    """
    try:
        return re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
                      replace_token, text, flags=re.MULTILINE)
    except TypeError:
        if debug: print("URL REMOVAL FAILED for text:", text)
        return text

### File Read/Write

In [3]:
def txt_file_to_sents(path, debug=True):
    """ Convert txt file into a list of sentences
    Args:
        path (str): Path to txt file
        debug (bool): Whether to print information
    Returns:
        sents (list): List of sentences in corpus
    """
    f = open(path, "r")
    text = f.read()
    f.close()
    #nltk.download('punkt')
    sents = nltk.sent_tokenize(text)
    if debug: print("Converted", path.split('/')[-1],"| Sentences: ", len(sents))
    return sents

In [14]:
def _sents_to_txt(out_path, sents, debug=True, new_line=True):
    """ Write an array of sentences to a text file
    Args:
        out_path (str): Path to write txt file
        sents (list): List of sentences in corpus
        debug (bool): Whether to print information
        new_line (bool): Whether to add a newline char to sentences
    """
    fail_sents = []
    with open(out_path,'w') as out_file:
        for sentence in sents:
            sentence = str(sentence)
            try:
                sentence = sentence+'\n' if new_line else sentence
                out_file.write(sentence)
            except UnicodeEncodeError:
                fail_sents.append(sentence)
    if debug:
        print("Num Sentences Failed to Write: ", len(fail_sents))
        #for i in fail_sents: print(i)

In [5]:
def _sents_to_train_dev_txt(sents, file_name, debug=True, new_line=True, random_state=42,
                           returns=False, test_size=0.12):
    """ Convert a list of sentences to a train and dev set
    Args:
        sents (list): List of sentences in corpus
        file_name (str): "positive" or "negative"
        debug (bool): Whether to print information
        new_line (bool): Whether to add a newline char to sentences
        random_state (int): random state for data split
        returns (bool): Whether to return train and dev sets
        test_size (float): Percentage in dev set
    Returns:
        train (list): Train set
        dev (list): Dev set
    """
    train, dev, _, _ = train_test_split(sents, np.zeros(len(sents)),
                                            test_size=test_size, 
                                        random_state=random_state)
    if debug: print("Num Train:", len(train), "| Num Dev:", len(dev))
    _sents_to_txt("train/" + file_name + ".txt", train, debug=debug, new_line=new_line)
    _sents_to_txt("dev/" + file_name + ".txt", dev, debug=debug, new_line=new_line)
    if returns: return train, dev

### Generate Dataset Sentences

In [6]:
from nltk.corpus import gutenberg
from nltk.corpus import brown
def gen_en_text_orig():
    """ Generate the orig en_text.txt file in the orig folder
    """
    # Gutenberg
    for file_id in gutenberg.fileids():
        en_text = open('orig/en_text.txt', 'a')
        file_sents = gutenberg.sents(file_id)
        for sent in file_sents:
            new_sent = ' '.join(sent)
            en_text.write(new_sent + '\n')
        en_text.close()
    # Brown
    for file_id in brown.fileids():
        en_text = open('orig/en_text.txt', 'a')
        file_sents = brown.sents(file_id)
        for sent in file_sents:
            new_sent = ' '.join(sent)
            en_text.write(new_sent + '\n')
        en_text.close()

In [17]:
def _gen_trump_sents(debug=True):
    rally_speeches = txt_file_to_sents("orig/trump_10_2016_rally_speeches_orig.txt", debug=debug)
    tweets_df = pd.read_csv("orig/trump_tweets_orig.csv")
    tweets_df['cleaned_text'] = tweets_df.apply(lambda row : remove_urls(row['text']), axis = 1)
    sents = rally_speeches + list(tweets_df['cleaned_text'])
    if debug: print("Num Sentences:", len(sents))
    return sents

def gen_trump_train_dev(debug=True, new_line=False, random_state=42, returns=True, test_size=0.12):
    """ Main function to generate train and dev set for Trump Data
    Args:
        debug (bool): Whether to print information
        new_line (bool): Whether to add a newline char to sentences
        random_state (int): random state for data split
        returns (bool): Whether to return train and dev sets
        test_size (float): Percentage in dev set
    Returns:
        train (list): Train set
        dev (list): Dev set
    """
    sents = _gen_trump_sents(debug=debug)
    train, dev = _sents_to_train_dev_txt(sents, file_name="negative", debug=debug, new_line=new_line,
                                        random_state=random_state, returns=returns, test_size=test_size)
    if returns: return train, dev

In [26]:
def _gen_en_sents(k = 56857):
    f = open("orig/en_text.txt", "r")
    sents = f.readlines()
    f.close()
    return random.sample(sents, k)

def gen_en_train_dev(k = 56857, returns=True):
    sents = _gen_en_sents(k = k)
    print("Num Sentences:", len(sents))
    train, dev = _sents_to_train_dev_txt(sents, file_name="positive", test_size=0.12, random_state=42,
                                         new_line=False, returns=True)
    if returns: return train, dev

## Workflow

In [19]:
trump_train, trump_dev = gen_trump_train_dev(debug=True)

Converted trump_10_2016_rally_speeches_orig.txt | Sentences:  16493
Num Sentences: 56857
Num Train: 50034 | Num Dev: 6823
Num Sentences Failed to Write:  625
Num Sentences Failed to Write:  87


In [27]:
en_train, en_dev = gen_en_train_dev()

Num Sentences: 56857
Num Train: 50034 | Num Dev: 6823
Num Sentences Failed to Write:  0
Num Sentences Failed to Write:  0
