# Pre-process Text

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
from sklearn.model_selection import train_test_split
import random

## Functions

### Helper Functions - Data Cleaning

In [2]:
def _remove_urls(tweets, replace_token='<URL>'):
    """ Replace URLs in text with a replacement token
    Args:
        tweets (list): Tweets to clean
        replace_token (str): String to replace URL
    Returns:
        clean_tweets (list): Cleaned tweet list
        removed (list): List of Removed Tweets
    """
    removed = []
    clean_tweets = []
    for tweet in tweets:
        try:
            clean_tweet = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
                          replace_token, tweet, flags=re.MULTILINE)
            clean_tweets.append(clean_tweet)
        except TypeError:
            removed.append(tweet)
    return clean_tweets, removed

In [3]:
def _remove_RT(tweets):
    """ Remove Retweets
    Args:
        tweets (list): Tweets to clean
    Returns:
        clean_tweets (list): Cleaned tweet list
        removed (list): List of Removed Tweets
    """
    removed = []
    clean_tweets = []
    for tweet in tweets:
        try:
            if 'RT' ==  tweet.split()[0]:
                removed.append(tweet)
            else: clean_tweets.append(tweet)
        except (TypeError, AttributeError):
            pass
    return clean_tweets, removed

### Helper Functions - File Read/Write

In [3]:
def _txt_file_to_sents(path, debug=True):
    """ (Helper) Convert txt file into a list of sentences
    Args:
        path (str): Path to txt file
        debug (bool): Whether to print information
    Returns:
        sents (list): List of sentences in corpus
    """
    f = open(path, "r", errors="ignore")
    text = f.read()
    f.close()
    #nltk.download('punkt')
    sents = nltk.sent_tokenize(text)
    if debug: print("Converted", path.split('/')[-1],"| Sentences: ", len(sents))
    return sents

In [5]:
def _sents_to_txt(out_path, sents, debug=True, new_line=True):
    """ (Helper) Write an array of sentences to a text file
    Args:
        out_path (str): Path to write txt file
        sents (list): List of sentences in corpus
        debug (bool): Whether to print information
        new_line (bool): Whether to add a newline char to sentences
    """
    fail_sents = []
    with open(out_path,'w') as out_file:
        for sentence in sents:
            sentence = str(sentence)
            try:
                sentence = sentence+'\n' if new_line else sentence
                out_file.write(sentence)
            except UnicodeEncodeError:
                fail_sents.append(sentence)
    if debug:
        print("Num Sentences Failed to Write: ", len(fail_sents))
        #for i in fail_sents: print(i)

In [6]:
def _sents_to_train_dev_txt(sents, file_name, debug=True, new_line=True, random_state=42,
                           returns=False, test_size=0.12):
    """ (Helper) Convert a list of sentences to a train and dev set
    Args:
        sents (list): List of sentences in corpus
        file_name (str): "positive" or "negative"
        debug (bool): Whether to print information
        new_line (bool): Whether to add a newline char to sentences
        random_state (int): random state for data split
        returns (bool): Whether to return train and dev sets
        test_size (float): Percentage in dev set
    Returns:
        train (list): Train set
        dev (list): Dev set
    """
    train, dev, _, _ = train_test_split(sents, np.zeros(len(sents)),
                                            test_size=test_size, 
                                        random_state=random_state)
    if debug: print("Num Train:", len(train), "| Num Dev:", len(dev))
    _sents_to_txt("train/" + file_name + ".txt", train, debug=debug, new_line=new_line)
    _sents_to_txt("dev/" + file_name + ".txt", dev, debug=debug, new_line=new_line)
    if returns: return train, dev

In [7]:
def _gen_full_text_txt():
    base_path = '../yelp/'
    paths = ['train/positive.txt', 'train/negative.txt', 'dev/positive.txt', 'dev/negative.txt']
    sents = []
    for path in paths:
        f = open(base_path + path, "r", errors="ignore")
        text = f.readlines()
        sents.append(text)
        f.close()
    _sents_to_txt("../yelp/full_text.txt", sents)
    

### Generate Original Dataset Sentences

#### English Data

In [8]:
from nltk.corpus import brown, gutenberg, inaugural
def gen_en_text_orig():
    """ Generate the orig en_text.txt file in the orig folder
    """
    en_text = open('orig/en_text_NEW.txt', 'w')
    # Gutenberg
    for file_id in gutenberg.fileids():
        if file_id == 'bible-kjv.txt' or 'shakespeare' in file_id:
            continue
        file_sents = gutenberg.sents(file_id)
        for sent in file_sents:
            new_sent = ' '.join(sent)
            en_text.write(new_sent + '\n')
    # Brown
    for file_id in brown.fileids():
        file_sents = brown.sents(file_id)
        for sent in file_sents:
            new_sent = ' '.join(sent)
            en_text.write(new_sent + '\n')
    # Inaugural
    for file_id in brown.fileids():
        if file_id == '2017-Trump.txt':
            continue
        file_sents = brown.sents(file_id)
        for sent in file_sents:
            new_sent = ' '.join(sent)
            en_text.write(new_sent + '\n')
    en_text.close()

### Clean Original Dataset Sentences

#### Trump Data

In [20]:
def _clean_tweets(tweets, debug=True):
    """ Main Helper function to clean Tweets
    Args:
        tweets (list): Tweets to clean
        debug (bool): Whether to print information
    Returns:
        tweets (list): Cleaned tweet list
    """
    tweets, del_url = _remove_urls(tweets)
    tweets, del_RT = _remove_RT(tweets)
    if debug:
        print("Del Tweets (URL processing):", len(del_url))
        print("Del Tweets (RT processing):", len(del_RT))
    return tweets
    
def _gen_trump_sents(debug=True):
    """ Helper function to generate Trump data sentences
    Args:
        debug (bool): Whether to print information
    """
    rally_speeches = _txt_file_to_sents("orig/trump_10_2016_rally_speeches_orig.txt", debug=debug)
    other_speeches = _txt_file_to_sents("orig/trump_speeches.txt", debug=debug)
    raw_tweets = pd.read_csv("orig/trump_tweets_orig.csv")['text']
    raw_tweets = list(raw_tweets)
    clean_tweets = _clean_tweets(raw_tweets, debug=debug)
    sents = rally_speeches + other_speeches + clean_tweets
    print(clean_tweets[:10])
    if debug: print("Num Sentences:", len(sents))
    return sents

def gen_trump_train_dev(debug=True, new_line=False, random_state=42, returns=True, test_size=0.12):
    """ Main function to generate train and dev set for Trump Data
    Args:
        debug (bool): Whether to print information
        new_line (bool): Whether to add a newline char to sentences
        random_state (int): random state for data split
        returns (bool): Whether to return train and dev sets
        test_size (float): Percentage in dev set
    Returns:
        train (list): Train set
        dev (list): Dev set
    """
    sents = _gen_trump_sents(debug=debug)
    _sents_to_txt("trump.txt", sents)
    train, dev = _sents_to_train_dev_txt(sents, file_name="negative", debug=debug, new_line=new_line,
                                        random_state=random_state, returns=returns, test_size=test_size)
    if returns: return train, dev

In [2]:
debug=False
rally_speeches = _txt_file_to_sents("orig/trump_10_2016_rally_speeches_orig.txt", debug=debug)
other_speeches = _txt_file_to_sents("orig/trump_speeches.txt", debug=debug)
raw_tweets = pd.read_csv("orig/trump_tweets_orig.csv")['text']
raw_tweets = list(raw_tweets)
clean_tweets = _clean_tweets(raw_tweets, debug=debug)
sents = rally_speeches + other_speeches + clean_tweets

NameError: name '_txt_file_to_sents' is not defined

In [74]:
android = []
iphone = []
web = []
raw_tweets = pd.read_csv("orig/trump_tweets_orig.csv")['text']
for i  in raw_tweets:
    i = str(i)
    if "Twitter for Android" in i:
        android.append(i)
    elif "Twitter for iPhone" in i:
        iphone.append(i)
    elif "Twitter Web Client" in i:
        web.append(i)
    else:
        pass
print(len(android), len(iphone), len(web), "|total:", len(android) + len(iphone) + len(web))

1287 6 98 |total: 1391


In [71]:
android = []
iphone = []
web = []
for i  in clean_tweets:
    if "Twitter for Android" in i:
        android.append(i)
    elif "Twitter for iPhone" in i:
        iphone.append(i)
    elif "Twitter Web Client" in i:
        web.append(i)
    else:
        pass
print(len(android), len(iphone), len(web), "|total:", len(android) + len(iphone) + len(web))

1287 6 98 |total: 1391


In [1]:
android[0]

NameError: name 'android' is not defined

In [64]:
"Twitter Web Client" in "Twitter Web Client,Boston's Mayor "

True

In [27]:
raw_tweets = pd.read_csv("orig/trump_tweets_orig.csv")['text']
raw_tweets = list(raw_tweets)
for i in raw_tweets: 
    if "Twitter for iPhone" in i: 
        print(i)

The real big story that affects everybody in America is the success of @POTUS's TAX CUT package and what it's done for our economy...” @Varneyco https://t.co/2bUbA7zSFM,06-21-2018 20:25:24,11608,47038,false,1009895079631351808
Media Studio,My Administration is acting swiftly to address the illegal immigration crisis on the Southern Border. Loopholes in our immigration laws all supported by extremist open border Democrats...and that's what they are - they're extremist open border Democrats.... https://t.co/F73I5gu0Q5,06-21-2018 17:02:40,19376,70977,false,1009844059211366401
Twitter for iPhone,Democrats want open Borders where anyone can come into our Country and stay. This is Nancy Pelosi’s dream. It won’t happen!,06-21-2018 14:38:15,25463,104682,false,1009807715798044672
Twitter for iPhone,Henry McMaster has done a great job as Governor of South Carolina. The state is BOOMING with jobs and new industry setting records. He is tough on Crime and Strong on Borders Healthcare the Milita

TypeError: argument of type 'float' is not iterable

In [41]:
raw = list(pd.read_csv("orig/trump_tweets_orig.csv")['text'])


40364

#### English Data

In [10]:
def _gen_en_sents(k = 62399):
    """ Helper function to generate subsample of en sentences
    Args:
        k (int): Subsample size
    """
    gen_en_text_orig()
    f = open("orig/en_text_NEW.txt", "r")
    sents = f.readlines()
    f.close()
    return random.sample(sents, k)

def gen_en_train_dev(k, debug=True, new_line=False, random_state=42, returns=True, test_size=0.12):
    """ Main function to generate train and dev set for En Data
    Args:
        k (int): Subsample size
        debug (bool): Whether to print information
        new_line (bool): Whether to add a newline char to sentences
        random_state (int): random state for data split
        returns (bool): Whether to return train and dev sets
        test_size (float): Percentage in dev set
    Returns:
        train (list): Train set
        dev (list): Dev set
    """
    sents = _gen_en_sents(k = k)
    if debug: print("Num Sentences:", len(sents))
    train, dev = _sents_to_train_dev_txt(sents, file_name="positive", debug=debug, test_size=test_size,
                                         random_state=random_state, new_line=new_line, returns=returns)
    if returns: return train, dev

## Workflow

In [21]:
trump_train, trump_dev = gen_trump_train_dev(debug=True)

Converted trump_10_2016_rally_speeches_orig.txt | Sentences:  16493
Converted trump_speeches.txt | Sentences:  7695
Del Tweets (URL processing): 1
Del Tweets (RT processing): 2152
['<URL>', '<URL>', 'Price transparency is so important for the people of our Country. In many ways it will prove to be as important as healthcare itself. A great issue for both Republicans and Democrats. Hopefully it will be approved!', 'Will be doing a Town Hall on @FoxNews at 12:05 P.M. (Now). I will go on around 1:00 P.M. Enjoy!', 'Our people want to return to work. They will practice Social Distancing and all else and Seniors will be watched over protectively &amp; lovingly. We can do two things together. THE CURE CANNOT BE WORSE (by far) THAN THE PROBLEM! Congress MUST ACT NOW. We will come back strong!', 'Congress must approve the deal without all of the nonsense today. The longer it takes the harder it will be to start up our economy. Our workers will be hurt!', 'This is not about the ridiculous Green 

In [12]:
en_train, en_dev = gen_en_train_dev(k = 62399, debug=True)

Num Sentences: 62399
Num Train: 54911 | Num Dev: 7488
Num Sentences Failed to Write:  0
Num Sentences Failed to Write:  0
