# Pre-process Trump Tweets

## Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
from sklearn.model_selection import train_test_split

## Functions

### Data Cleaning

In [3]:
# Replace URLs in text with a replacement token
def remove_urls (text, replace_token='<URL>'):
    try:
        return re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', replace_token, text, flags=re.MULTILINE)
    except:
        print("URL REMOVAL FAILED for text:", text)
        return text

### File Read/Write

In [4]:
# Convert txt file into an array of sentences
def txt_file_to_sents(path):
    f = open(path, "r")
    text = f.read()
    f.close()
    #nltk.download('punkt')
    sents = nltk.sent_tokenize(text)
    print("Converted", path.split('/')[-1],"| Sentences: ", len(sents))
    return sents

In [5]:
# Write an array of sentences to a text file
def sents_to_txt(out_path, sents, print_failed=False):
    fail_sents = []
    with open(out_path,'w') as output_file:
        for sentence in sents:
            try:
                output_file.write(sentence+'\n')
            except:
                fail_sents.append(sentence)
    if print_failed:
        print("Num Sentences Failed to Write: ", len(fail_sents))
        for i in fail_sents: print(i)

In [16]:
def sents_to_train_dev_txt(sents, prefix, test_size=0.12, random_state=42, ret=False):
    train, dev, _, _ = train_test_split(sents, np.zeros(len(sents)),
                                            test_size=test_size, random_state=random_state)
    print("Num Train:", len(train), "| Num Dev:", len(dev))
    sents_to_txt("train/" + prefix + ".txt", train)
    sents_to_txt("dev/" +prefix + ".txt", dev)
    if ret: return train, dev 

### Generate Dataset Sentences

In [14]:
from nltk.corpus import gutenberg
from nltk.corpus import brown
def gen_en_text_orig():
    # Gutenberg
    for file_id in gutenberg.fileids():
        en_text = open('orig/en_text.txt', 'w')
        file_sents = gutenberg.sents(file_id)
        for sent in file_sents:
            new_sent = ' '.join(sent)
            en_text.write(new_sent + '\n')
        en_text.close()
    # Brown
    for file_id in brown.fileids():
        en_text = open('orig/en_text.txt', 'a')
        file_sents = brown.sents(file_id)
        for sent in file_sents:
            new_sent = ' '.join(sent)
            en_text.write(new_sent + '\n')
        en_text.close()

In [20]:
def gen_trump_sents(prefix="negative", ret=True):
    rally_speeches = txt_file_to_sents("orig/trump_10_2016_rally_speeches_orig.txt")
    tweets_df = pd.read_csv("orig/trump_tweets_orig.csv")
    tweets_df['cleaned_text'] = tweets_df.apply(lambda row : remove_urls(row['text']), axis = 1)
    sents = rally_speeches + list(tweets_df['cleaned_text'])
    print("Num Sentences:", len(sents))
    train, dev = sents_to_train_dev_txt(sents, prefix, test_size=0.12, random_state=42, ret=ret)
    if ret: return train, dev

## Workflow

In [21]:
trump_train, trump_dev = gen_trump_sents()

Converted trump_10_2016_rally_speeches_orig.txt | Sentences:  16493
URL REMOVAL FAILED for text: nan
Num Sentences: 56857
Num Train: 50034 | Num Dev: 6823


In [15]:
gen_en_text_orig()