# Pre-process Text

## Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
from sklearn.model_selection import train_test_split
import random

## Functions

### Helper Functions - Data Cleaning

In [3]:
def _remove_write_fail(tweets):
    """ Remove Sentences that have an issue writing to a file
    Every tweet is converted with str() and written to the scratch file
    "temp.txt" (opened with the platform default encoding). Tweets whose
    write fails are reported separately instead of being kept.
    Args:
        tweets (list): Tweets to clean
    Returns:
        clean_tweets (list): Tweets (as str) that could be written
        removed (list): Tweets (as str) whose write raised an error
    """
    removed = []
    clean_tweets = []
    with open("temp.txt", 'w') as out_file:
        for tweet in tweets:
            tweet = str(tweet)
            try:
                out_file.write(tweet)
            except (UnicodeError, OSError):
                # Only encoding / I/O failures mean "cannot be written".
                # Anything else (e.g. KeyboardInterrupt) must propagate
                # instead of silently discarding the tweet.
                removed.append(tweet)
            else:
                clean_tweets.append(tweet)
    return clean_tweets, removed

In [4]:
def _remove_urls(tweets, replace_token='<URL>'):
    """ Swap every URL found in a tweet for a placeholder token
    Args:
        tweets (list): Tweets to clean
        replace_token (str): Text substituted for each matched URL
    Returns:
        clean_tweets (list): Tweets with their URLs replaced
        removed (list): Tweets dropped because the substitution raised
            a TypeError (e.g. the tweet is not a string)
    """
    url_pattern = re.compile(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
                             flags=re.MULTILINE)
    kept, dropped = [], []
    for raw in tweets:
        try:
            substituted = url_pattern.sub(replace_token, raw)
        except TypeError:
            dropped.append(raw)
        else:
            kept.append(substituted)
    return kept, dropped

In [5]:
def _remove_RT(tweets):
    """ Remove Retweets
    A tweet counts as a retweet when its first whitespace-separated word
    is exactly 'RT'.
    Args:
        tweets (list): Tweets to clean
    Returns:
        clean_tweets (list): Tweets that are not retweets (empty and
            whitespace-only tweets are kept, they have no first word)
        removed (list): Retweets, plus any entries that could not be
            split into words (e.g. None or NaN)
    """
    removed = []
    clean_tweets = []
    for tweet in tweets:
        try:
            words = tweet.split()
        except (TypeError, AttributeError):
            # Not text at all: report it as removed rather than letting it
            # disappear from both result lists.
            removed.append(tweet)
            continue
        # `words` is empty for '' or whitespace-only tweets; guard the index
        # so those inputs no longer raise IndexError.
        if words and words[0] == 'RT':
            removed.append(tweet)
        else:
            clean_tweets.append(tweet)
    return clean_tweets, removed

In [6]:
def _remove_at_Trump(tweets, debug=True):
    """ Filter out tweets whose text contains a Twitter client label
    The labels checked, in order, are "Twitter for Android",
    "Twitter for iPhone", "Twitter Web Client" and "Twitter for BlackBerry";
    a tweet is assigned to the first label it contains.
    Args:
        tweets (list): Tweets to clean
        debug (bool): Whether to print the per-label counts
    Returns:
        clean_tweets (list): Tweets containing none of the labels
        removed (list): Filtered tweets, grouped Android, iPhone,
            BlackBerry, Web
    """
    labels = ("Twitter for Android", "Twitter for iPhone",
              "Twitter Web Client", "Twitter for BlackBerry")
    by_label = {label: [] for label in labels}
    kept = []
    for tweet in tweets:
        matched = next((label for label in labels if label in tweet), None)
        if matched is None:
            kept.append(tweet)
        else:
            by_label[matched].append(tweet)
    from_android = by_label["Twitter for Android"]
    from_iphone = by_label["Twitter for iPhone"]
    from_web = by_label["Twitter Web Client"]
    from_blackberry = by_label["Twitter for BlackBerry"]
    dropped = from_android + from_iphone + from_blackberry + from_web
    if debug:
        print("Android:",len(from_android),"iPhone:",len(from_iphone),"Web:",len(from_web),"Blackberry:",len(from_blackberry))
    return kept, dropped

### Helper Functions - File Read/Write

In [7]:
def _txt_file_to_sents(path, debug=True):
    """ (Helper) Convert txt file into a list of sentences
    The whole file is read at once (undecodable bytes are ignored) and
    split with nltk.sent_tokenize.
    Args:
        path (str): Path to txt file
        debug (bool): Whether to print information
    Returns:
        sents (list): List of sentences in corpus
    """
    # Context manager guarantees the handle is closed even if read() raises.
    with open(path, "r", errors="ignore") as f:
        text = f.read()
    # sent_tokenize relies on NLTK's Punkt models; if they are missing,
    # run the download below once.
    #nltk.download('punkt')
    sents = nltk.sent_tokenize(text)
    if debug: print("Converted", path.split('/')[-1],"| Sentences: ", len(sents))
    return sents

In [64]:
def _sents_to_txt(out_path, sents, debug=True, new_line=True):
    """ (Helper) Dump a sequence of sentences into a text file
    The target file is overwritten. Sentences are written back to back,
    each followed by '\\n' when new_line is set.
    Args:
        out_path (str): Path to write txt file
        sents (list): List of sentences in corpus
        debug (bool): Accepted for a uniform signature; not used here
        new_line (bool): Whether to add a newline char to sentences
    """
    with open(out_path, 'w') as handle:
        if new_line:
            handle.writelines(sent + '\n' for sent in sents)
        else:
            handle.writelines(sents)

In [50]:
def _sents_to_train_dev_txt(sents, file_name, debug=True, new_line=True, random_state=42,
                           returns=False, test_size=0.12, holdout_test_size=0.0833,
                           out_dir="../data/text_data"):
    """ (Helper) Convert a list of sentences to train, dev and test sets
    The sentences are split twice: first `test_size` of them are held out,
    then `holdout_test_size` of that held-out part becomes the test set and
    the remainder the dev set. Each split is written to
    <out_dir>/<split>/<file_name>.txt (the directories must already exist).
    Args:
        sents (list): List of sentences in corpus
        file_name (str): "positive" or "negative"
        debug (bool): Whether to print information
        new_line (bool): Whether to add a newline char to sentences
        random_state (int): random state for both data splits
        returns (bool): Whether to return the train, dev and test sets
        test_size (float): Fraction of sentences held out for dev + test
        holdout_test_size (float): Fraction of the held-out sentences that
            goes to the test set
        out_dir (str): Root folder holding the train/, dev/ and test/ folders
    Returns:
        train (list): Train set
        dev (list): Dev set
        test (list): Test set
        (only when `returns` is True, otherwise None)
    """
    # train_test_split accepts a single sequence, so no dummy label array is
    # needed; the shuffle depends only on the length and random_state.
    train, held_out = train_test_split(sents, test_size=test_size,
                                       random_state=random_state)
    dev, test = train_test_split(held_out, test_size=holdout_test_size,
                                 random_state=random_state)
    if debug: print("Num Train:", len(train), "| Num Dev:", len(dev), "| Num Test:", len(test))
    for split_name, split in (("train", train), ("dev", dev), ("test", test)):
        _sents_to_txt(out_dir + "/" + split_name + "/" + file_name + ".txt", split,
                      debug=debug, new_line=new_line)
    if returns: return train, dev, test
    return None

In [41]:
def _gen_full_text_txt():
    """ (Helper) Concatenate the yelp train/dev files into full_text.txt
    Reads the positive and negative train and dev files under
    ../data/yelp/ line by line and writes all lines, one per line, to
    ../data/yelp/full_text.txt.
    """
    base_path = '../data/yelp/'
    paths = ['train/positive.txt', 'train/negative.txt', 'dev/positive.txt', 'dev/negative.txt']
    sents = []
    for path in paths:
        with open(base_path + path, "r", errors="ignore") as f:
            # Drop the line terminator here: _sents_to_txt appends its own
            # '\n', so keeping it would put a blank line after every line.
            sents.extend(line.rstrip('\n') for line in f)
    _sents_to_txt("../data/yelp/full_text.txt", sents)

### Generate Original Dataset Sentences

#### English Data

In [16]:
from nltk.corpus import brown, gutenberg, inaugural, reuters
def gen_en_text_orig():
    """ Generate the en_all_data.txt file in the orig folder
    Writes every sentence of the selected NLTK corpora to
    ../data/text_data/orig/en_all_data.txt, one sentence per line, with the
    corpus tokens joined by single spaces.
    Corpora, in order:
        Gutenberg (skipping 'bible-kjv.txt' and the shakespeare files),
        Brown,
        Inaugural (skipping '2017-Trump.txt').
    Reuters is imported alongside the others but is currently left out.
    """
    # (corpus reader, predicate telling which file ids to skip)
    sources = (
        (gutenberg, lambda file_id: file_id == 'bible-kjv.txt' or 'shakespeare' in file_id),
        (brown, lambda file_id: False),
        # The inaugural reader must be used here: iterating brown again would
        # duplicate Brown and the '2017-Trump.txt' exclusion could never match.
        (inaugural, lambda file_id: file_id == '2017-Trump.txt'),
    )
    # Context manager closes the output even if a corpus read fails midway.
    with open('../data/text_data/orig/en_all_data.txt', 'w') as en_text:
        for corpus, skip in sources:
            for file_id in corpus.fileids():
                if skip(file_id):
                    continue
                for sent in corpus.sents(file_id):
                    en_text.write(' '.join(sent) + '\n')

### Clean Original Dataset Sentences

#### Trump Data

In [12]:
def _clean_tweets(tweets, debug=True):
    """ Run the full tweet-cleaning pipeline
    Applies, in order: the write test, URL replacement, retweet removal and
    the Twitter-client-label filter.
    Args:
        tweets (list): Tweets to clean
        debug (bool): Whether to print how many tweets each step dropped
    Returns:
        tweets (list): Cleaned tweet list
    """
    writable, dropped_write = _remove_write_fail(tweets)
    without_urls, dropped_url = _remove_urls(writable)
    without_rt, dropped_rt = _remove_RT(without_urls)
    cleaned, dropped_at_trump = _remove_at_Trump(without_rt, debug=debug)
    if not debug:
        return cleaned
    print("Del Tweets (Write-Test):", len(dropped_write))
    print("Del Tweets (URL processing):", len(dropped_url))
    print("Del Tweets (RT processing):", len(dropped_rt))
    print("Del Tweets (Tweets @ Trump):", len(dropped_at_trump))
    return cleaned
    
def _gen_trump_sents(debug=True):
    """ Collect every Trump sentence: rally speeches, other speeches, tweets
    Args:
        debug (bool): Whether to print information
    Returns:
        sents (list): Speech sentences followed by the cleaned tweets
    """
    speech_sents = []
    for speech_path in ("orig/trump_10_2016_rally_speeches_orig.txt",
                        "orig/trump_speeches.txt"):
        speech_sents += _txt_file_to_sents(speech_path, debug=debug)
    tweet_frame = pd.read_csv("orig/trump_tweets_orig.csv")
    cleaned = _clean_tweets(list(tweet_frame.text), debug=debug)
    all_sents = speech_sents + cleaned
    if debug:
        print("Num Final Sentences:", len(all_sents))
    return all_sents

def gen_trump_train_dev(debug=True, new_line=False, random_state=42, returns=True, test_size=0.12):
    """ Main function to generate train and dev set for Trump Data
    Also writes all sentences to orig/trump_all_data.txt and, through
    _sents_to_train_dev_txt, the train/dev/test files named "negative".
    Args:
        debug (bool): Whether to print information
        new_line (bool): Whether to add a newline char to sentences
        random_state (int): random state for data split
        returns (bool): Whether to return train and dev sets
        test_size (float): Fraction of sentences held out of the train set
    Returns:
        train (list): Train set
        dev (list): Dev set
        (only when `returns` is True, otherwise None)
    """
    sents = _gen_trump_sents(debug=debug)
    _sents_to_txt('orig/trump_all_data.txt', sents)
    splits = _sents_to_train_dev_txt(sents, file_name="negative", debug=debug, new_line=new_line,
                                     random_state=random_state, returns=returns, test_size=test_size)
    if returns:
        # The helper returns (train, dev, test), and None when returns is
        # False, so unpack only here. The test split is already on disk;
        # this function's documented result is just (train, dev).
        train, dev, _test = splits
        return train, dev
    return None

#### English Data

In [36]:
def _gen_en_sents(k = 62399):
    """ Helper function to generate subsample of en sentences
    Regenerates the English corpus file, then reads en_text_NEW.txt,
    shuffles its lines (unseeded np.random) and keeps the first k.
    Args:
        k (int): Subsample size
    Returns:
        sents (list): Up to k shuffled lines, each still ending in its
            line terminator
    """
    gen_en_text_orig()
    # NOTE(review): gen_en_text_orig() is seen writing en_all_data.txt, not
    # the en_text_NEW.txt read here -- confirm which file is intended.
    # Context manager closes the handle even if readlines() raises.
    with open("../data/text_data/orig/en_text_NEW.txt", "r") as f:
        sents = f.readlines()
    np.random.shuffle(sents)
    #unused_sents = sents[k:]
    #_sents_to_txt("../data/yelp/test/positive.txt", unused_sents, debug=True, new_line=False)
    return sents[:k]

def gen_en_train_dev(k, debug=True, new_line=False, random_state=42, returns=True, test_size=0.12):
    """ Main function to generate train, dev and test set for En Data
    Also writes the subsample to ../data/text_data/orig/en_all_data.txt and,
    through _sents_to_train_dev_txt, the split files named "positive".
    Args:
        k (int): Subsample size
        debug (bool): Whether to print information
        new_line (bool): Whether to add a newline char to sentences
        random_state (int): random state for data split
        returns (bool): Whether to return the train, dev and test sets
        test_size (float): Fraction of sentences held out of the train set
    Returns:
        train (list): Train set
        dev (list): Dev set
        test (list): Test set
        (only when `returns` is True, otherwise None)
    """
    sents = _gen_en_sents(k = k)
    if debug: print("Num Final Sentences:", len(sents))
    # NOTE(review): the sentences come from readlines() and so presumably
    # still end in '\n'; this call adds another one by default -- confirm.
    _sents_to_txt('../data/text_data/orig/en_all_data.txt', sents)
    splits = _sents_to_train_dev_txt(sents, file_name="positive", debug=debug, test_size=test_size,
                                     random_state=random_state, new_line=new_line, returns=returns)
    if returns:
        # Unpack only here: the helper returns None when returns is False,
        # and unpacking that unconditionally raised a TypeError.
        train, dev, test = splits
        return train, dev, test
    return None

## Workflow

In [55]:
#trump_train, trump_dev = gen_trump_train_dev(debug=True)

In [76]:
# Load the combined Trump corpus and split it into sentences.
sents = _txt_file_to_sents('../data/text_data/orig/trump_all_data.txt', debug=True)

Converted trump_all_data.txt | Sentences:  84320


In [77]:
#sents

In [78]:
# Lower-case every sentence.
sents = list(map(str.lower, sents))

In [79]:
# Drop trailing newlines and collapse every run of consecutive newlines into
# one. Two chained replace('\n\n', '\n') calls only handled runs of up to four
# newlines, so longer runs still left blank lines inside a sentence.
sents = [re.sub(r'\n{2,}', '\n', sent.rstrip("\n")) for sent in sents]

In [80]:
# Optional dump of the cleaned sentences, one per line (disabled):
#_sents_to_txt('../data/text_data/orig/trump_all_data_sent_per_line_lower.txt', sents)
# Split the Trump sentences and write them as the "negative" train/dev/test
# files, one sentence per line.
train, dev, test = _sents_to_train_dev_txt(sents, file_name="negative",debug=True, new_line=True,
                                     random_state=42, returns=True, test_size=0.12)

Num Train: 74201 | Num Dev: 9276 | Num Test: 843


In [43]:
#en_train, en_dev = gen_en_train_dev(k = 84320, debug=True)

In [67]:
# Load the English corpus, keep its first 84320 sentences and lower-case them.
sents = _txt_file_to_sents('../data/text_data/orig/en_all_data.txt', debug=True)
sents = [sent.lower() for sent in sents[:84320]]

Converted en_all_data.txt | Sentences:  84353


In [72]:
# Drop trailing newlines and collapse every run of consecutive newlines into
# one. Two chained replace('\n\n', '\n') calls only handled runs of up to four
# newlines, so longer runs still left blank lines inside a sentence.
sents = [re.sub(r'\n{2,}', '\n', sent.rstrip("\n")) for sent in sents]

In [75]:
# Optional dump of the lower-cased English sentences (disabled):
#_sents_to_txt('orig/en_all_data_lower.txt', sents)
# Split the English sentences and write them as the "positive" train/dev/test
# files, one sentence per line.
train, dev, test = _sents_to_train_dev_txt(sents, file_name="positive",debug=True, new_line=True,
                                     random_state=42, returns=True, test_size=0.12)

Num Train: 74201 | Num Dev: 9276 | Num Test: 843


In [42]:
# Update File used to build vocab
# (rewrites ../data/yelp/full_text.txt from the yelp train and dev files)
_gen_full_text_txt()