In [1]:
import sys
import os
import pandas as pd
import json
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import re
import string

In [2]:
# Base directory for all data files. The paths below are appended to it with
# plain string concatenation, so it must keep its trailing slash.
WORK_DIR = "/home/nguyen/"
# Paths relative to WORK_DIR. They carry no leading slash, so WORK_DIR + path
# has a single separator and os.path.join(WORK_DIR, path) does not discard
# WORK_DIR.
RAW_DATA = "data/labeled_tweets.txt"
PROCESSED_DATA = "data/processed_tweets.csv"

In [3]:
# English stop-word list from the NLTK corpus, held as a set so that the
# membership tests in preprocess() are constant-time.
stop_words = {*stopwords.words('english')}


In [4]:
# pre-process tweet
def preprocess(text, isStopWords = False):
    """Normalise a raw tweet into a space-separated string of lower-case words.

    Steps, in order: tokenise with ``TweetTokenizer``, lower-case, drop HTML
    tags, drop @mentions (and a retweet marker ``rt`` directly preceding one),
    drop URLs, strip ASCII punctuation, keep only purely alphabetic tokens,
    and optionally drop English stop words.

    Args:
        text: The tweet text.
        isStopWords: When False (the default) tokens found in the module-level
            ``stop_words`` set are removed; when True they are kept.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    tokens = [token.lower() for token in TweetTokenizer().tokenize(text)]

    # remove html tags
    html_regex = re.compile(r"<[^>]+>")
    tokens = [token for token in tokens if not html_regex.match(token)]

    # remove mentions. The tokenizer emits "rt" and "@user" as separate
    # tokens, so a pattern spanning both can never match a single token;
    # match the mention token itself and discard a directly preceding "rt".
    mention_regex = re.compile(r"@\w+")
    without_mentions = []
    for token in tokens:
        if mention_regex.fullmatch(token):
            if without_mentions and without_mentions[-1] == 'rt':
                without_mentions.pop()
            continue
        without_mentions.append(token)
    tokens = without_mentions

    # remove urls (tokens are already lower-case, hence [a-z] / [0-9a-f])
    url_regex = re.compile(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+')
    tokens = [token for token in tokens if not url_regex.match(token)]

    # remove punctuations
    table = str.maketrans('', '', string.punctuation)
    tokens = [token.translate(table) for token in tokens]

    # remove all tokens that are not alphabetic; this also discards tokens
    # that became empty once their punctuation was stripped
    tokens = [token for token in tokens if token.isalpha()]

    # remove stop words
    if not isStopWords:
        tokens = [token for token in tokens if token not in stop_words]

    return ' '.join(tokens)


In [11]:
# Read raw dataset, process and print to file
def read_and_print_data():
    """Preprocess every labelled tweet in the raw file and save the result as CSV.

    Each non-blank line of ``WORK_DIR+RAW_DATA`` is split on tabs: the first
    field is the label and the second is a JSON object whose ``full_text``
    entry holds the tweet. The text is passed through ``preprocess`` and the
    (text, label) pairs are written to ``WORK_DIR+PROCESSED_DATA`` with the
    columns ``tweets`` and ``labels``. Every 1000th processed record is
    printed as a progress indicator.
    """
    data = []
    # The context manager releases the handle even if a line fails to parse;
    # iterating the handle streams the file instead of loading it whole.
    with open(WORK_DIR+RAW_DATA, "r", encoding="utf-8") as file:
        for line in file:
            # A blank line (e.g. a trailing newline at end of file) has no
            # second tab-separated field and would raise IndexError below.
            if not line.strip():
                continue
            tweet = line.split("\t")
            label = tweet[0]
            text = json.loads(tweet[1])
            text = preprocess(str(text['full_text']))
            data.append([text, label])
            if len(data) % 1000 == 0:
                print("Line {}: {}, label: {}\n".format(len(data), text, label))

    output = pd.DataFrame(data, columns=['tweets', 'labels'])
    output.to_csv(WORK_DIR+PROCESSED_DATA, index=False)

In [None]:
# Run the whole pipeline: read the raw tweets, preprocess them and write the
# CSV. Every 1000th processed tweet is printed below as a progress sample.
read_and_print_data()


Line 1000: michael bloomberg says election better forum remove trump office formal impeachment inquiry, label: Politics

Line 2000: texas congressman pulls endorsement julián castro support joe biden, label: Politics

Line 3000: cherokee nation names first ever delegate congress, label: Politics

Line 4000: u officials huddle facebook google tech giants talk election, label: Politics

Line 5000: trump cutting medicare give tax breaks billionaires, label: Politics

Line 6000: trump consistent criticism iran pushes u point potential conflict, label: Politics

Line 7000: trump hits back ally denounces weakness iran, label: Politics

Line 8000: news analysis violence escalates across asia washington chosen inaction governments ignoring trump administration calls calm, label: Politics

Line 9000: power went de blasio, label: Politics

Line 10000: pete buttigieg criticized responses elizabeth warren provided asked whether medicare plan raise middleclass taxes, label: Politics

Line 11000: pr