In [1]:
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import words
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# data loading
# NOTE(review): `path` is an absolute directory on one specific Windows machine, so this
# cell only runs there as written; consider a configurable or relative data directory.
# `path` has to end with a separator because the file name is appended by plain string
# concatenation.
path = 'C:/Users/48668/Desktop/DM/'
data = pd.read_json(path + 'data.json')
data.head()

Unnamed: 0,created_at,id_str,retweet_count,source,text
0,2020-03-30 20:50:35,1244728753617620992,14441,Twitter for iPhone,White House news conference at 5:00 P.M. Easte...
1,2020-03-30 17:46:15,1244682364284014592,15520,Twitter for iPhone,https://t.co/2hKJkP5Z6N
2,2020-03-30 17:11:59,1244673740866191360,19753,Twitter for iPhone,"On #NationalDoctorsDay, we recognize the remar..."
3,2020-03-30 17:05:33,1244672122414338048,39114,Twitter for iPhone,https://t.co/nzWJ8ViwbZ
4,2020-03-30 11:17:10,1244584449309892608,43360,Twitter for iPhone,Nancy Pelosi and the Democrats delayed the Wor...


In [16]:

"""
This is preprocessing section. Following steps were taken:

    1. punctuation removal with '!' exception
    2. numbers removal
    3. url removals
    4. normalization to lowercase with exception for isupper()
    5. string to tokens
    6. stopwords removal
    7. stemming
    8. normalization of elongated words
"""

# English stopwords from the NLTK corpus; remove_stop() checks lowercased tokens against this list
stop_words = list(stopwords.words('english'))

def remove_stop(tokens):
    """Return the tokens whose lowercase form is not listed in ``stop_words``.

    Surviving tokens keep their original casing and their original order.
    """
    return [token for token in tokens if token.lower() not in stop_words]


# cast every value of the text column to str so the string helpers below can be applied to it
# NOTE(review): the earlier comment spoke of float64 values - presumably some rows hold NaN;
# confirm, because astype(str) would turn those into the literal text 'nan'
data['text'] = data['text'].astype(str)

# remove punctuation
def remove_punctuations(text):
    """Delete every character of ``string.punctuation`` from ``text``, except '!'."""
    # the exclamation mark is deliberately kept, so take it out of the deletion set
    unwanted = string.punctuation.replace('!', '')
    return text.translate(str.maketrans('', '', unwanted))

# remove numbers 
def remove_numbers(text):
    """Strip every run of the digits 0-9 out of ``text``."""
    digit_run = re.compile('[0-9]+')
    return digit_run.sub('', text)

# replace url with "url"
def replace_urls(text):
    """Replace each "http" followed by non-whitespace characters with the token "url"."""
    url_like = re.compile(r"http\S+")
    return url_like.sub("url", text)

# remove special characters

def remove_special(tokens):
    """Drop every token that is exactly one of the stray symbol characters below.

    Tokens that merely contain one of these characters are kept.
    """
    unwanted = ('�', 'â', '¦', '€', '¤', 'à', '‡', '™', '¸', 'Ø')
    return [tok for tok in tokens if tok not in unwanted]

# if word starts with uppercase --> lowercase, if all chars are uppercase --> do nothing
def lower_case(tokens):
    """Lowercase each token unless ``str.isupper()`` is true for it.

    Fully uppercase tokens such as 'RT' are returned unchanged; everything
    else (including mixed-case tokens) is lowercased.
    """
    normalised = []
    for token in tokens:
        normalised.append(token if token.isupper() else token.lower())
    return normalised

# print(lower_case(['AAAAA', 'army', 'Army'])) # test function
# print(replace_urls(remove_punctuations(remove_numbers('740 test test 3 99ma http://cnn.com')))) # test function

#remove elongated words
# Filled on first use: building a set from the NLTK word list is expensive, so it
# is done once here instead of on every call to remove_elongated().
_ELONGATED_VOCAB_CACHE = {}


def remove_elongated(text, vocabulary=None):
    """Collapse repeated characters in words that are not known vocabulary words.

    ``text`` is split on whitespace. A word found in the vocabulary is kept
    unchanged; in any other word each run of the same character (compared
    case-insensitively) is reduced to its first character, e.g.
    'greeeeat' -> 'great'.

    Parameters
    ----------
    text : str
        The raw string to normalise.
    vocabulary : container of str, optional
        Words that must be left untouched. Defaults to the NLTK ``words``
        corpus, which is loaded once and cached.

    Returns
    -------
    list of str
        One entry per whitespace-separated word of ``text``.
    """
    if vocabulary is None:
        if 'words' not in _ELONGATED_VOCAB_CACHE:
            _ELONGATED_VOCAB_CACHE['words'] = frozenset(words.words())
        vocabulary = _ELONGATED_VOCAB_CACHE['words']

    # (.)\1+ matches a character followed by one or more repetitions of itself
    repeated_char = re.compile(r'(?i)(.)\1+')
    return [
        word if word in vocabulary else repeated_char.sub(r'\1', word)
        for word in text.split()
    ]

#test = 'Aweeeesome president Trump greeeeat good Ameerica '
#print(remove_elongated(test))

# Text-level cleaning. Every helper takes one string and returns one string, so it is
# applied to the 'text' column itself rather than row by row over the whole frame.
data['text'] = data['text'].apply(remove_numbers)
data['text'] = data['text'].apply(remove_punctuations)
# digits and punctuation are already gone here, so a link arrives as a single
# "http..." chunk that replace_urls turns into the token "url"
data['text'] = data['text'].apply(replace_urls)


# tokenize
tknzr = TweetTokenizer()
data['tokens'] = data['text'].apply(tknzr.tokenize)

# remove stop words
data['tokens'] = data['tokens'].apply(remove_stop)

# remove special char
data['tokens'] = data['tokens'].apply(remove_special)

# lower_case
data['tokens'] = data['tokens'].apply(lower_case)

# elongated words (step currently disabled)
#data['text'] = data.apply(lambda x: remove_elongated(x['text']), axis=1)


# stemming
ps = PorterStemmer()
data['tokens'] = data['tokens'].apply(lambda tokens: [ps.stem(w) for w in tokens])


In [17]:
# preview the first rows to check the cleaned 'text' column and the new 'tokens' column
data.head()

Unnamed: 0,created_at,id_str,retweet_count,source,text,tokens
0,2020-03-30 20:50:35,1244728753617620992,14441,Twitter for iPhone,White House news conference at PM Eastern Tha...,"[white, hous, news, confer, PM, eastern, thank..."
1,2020-03-30 17:46:15,1244682364284014592,15520,Twitter for iPhone,url,[url]
2,2020-03-30 17:11:59,1244673740866191360,19753,Twitter for iPhone,On NationalDoctorsDay we recognize the remarka...,"[nationaldoctorsday, recogn, remark, men, amp,..."
3,2020-03-30 17:05:33,1244672122414338048,39114,Twitter for iPhone,url,[url]
4,2020-03-30 11:17:10,1244584449309892608,43360,Twitter for iPhone,Nancy Pelosi and the Democrats delayed the Wor...,"[nanci, pelosi, democrat, delay, worker, helps..."


In [18]:
# collect the tokens of every row into one corpus-wide frequency distribution
def buildVocabulary(data):
    """Count every token found in the ``tokens`` column of ``data``.

    Returns an ``nltk.FreqDist`` that maps each token to the number of times
    it occurs across all rows.
    """
    corpus_tokens = [token for row_tokens in data["tokens"] for token in row_tokens]
    return nltk.FreqDist(corpus_tokens)

# frequency distribution over all tokens of the corpus; the following cells read it through `a`
a = buildVocabulary(data)

In [20]:
# show the 100 most frequent tokens together with their counts
print("Most common words:")
print(a.most_common(100))

# every distinct token becomes one feature; extract() iterates over this key view
word_features = a.keys()
print('\nTotal number of features: ' + str(len(word_features)))

Most common words:
[('!', 9400), ('url', 4321), ('great', 2728), ('RT', 2138), ('amp', 2133), ('democrat', 1450), ('presid', 1415), ('peopl', 1319), ('countri', 1155), ('thank', 1136), ('state', 1042), ('get', 1026), ('year', 906), ('new', 902), ('trump', 891), ('border', 888), ('job', 872), ('news', 867), ('fake', 860), ('big', 844), ('want', 833), ('go', 824), ('time', 815), ('work', 804), ('american', 786), ('mani', 771), ('would', 729), ('republican', 726), ('make', 710), ('never', 701), ('vote', 681), ('US', 673), ('today', 662), ('america', 633), ('even', 620), ('good', 613), ('look', 611), ('much', 600), ('one', 580), ('unit', 561), ('realdonaldtrump', 561), ('media', 550), ('like', 535), ('come', 524), ('hous', 521), ('total', 519), ('back', 518), ('deal', 512), ('done', 505), ('win', 503), ('senat', 498), ('nation', 497), ('noth', 495), ('dem', 490), ('must', 487), ('impeach', 486), ('day', 485), ('china', 480), ('elect', 474), ('report', 457), ('donâ', 445), ('see', 435), ('c

In [21]:
def extract(tokens):
    """Build the bag-of-words presence features for one token list.

    Parameters
    ----------
    tokens : iterable of str
        The tokens of a single document.

    Returns
    -------
    dict
        One ``'contains(<word>)'`` key per entry of the module-level
        ``word_features``, mapped to True when that word occurs in ``tokens``
        and False otherwise.
    """
    # build the lookup set once instead of once per vocabulary word
    present = set(tokens)
    return {'contains(%s)' % word: (word in present) for word in word_features}

In [None]:
# One feature dict per row, computed from that row's token list.
# NOTE: extract() returns one entry per vocabulary word, so every row stores a dict
# as large as the whole vocabulary - expect this cell to be slow and memory-hungry.
data['word_features'] = data['tokens'].apply(extract)

In [None]:
# preview the first rows, now including the 'word_features' column
data.head()