In [1]:
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import words
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

In [28]:
# data loading
path = 'C:/Users/48668/Desktop/FSS2020/DM/'
data = pd.read_json(path + 'data.json')
data.head()

Unnamed: 0,created_at,id_str,retweet_count,source,text
0,2020-03-30 20:50:35,1244728753617620992,14441,Twitter for iPhone,White House news conference at 5:00 P.M. Easte...
1,2020-03-30 17:46:15,1244682364284014592,15520,Twitter for iPhone,https://t.co/2hKJkP5Z6N
2,2020-03-30 17:11:59,1244673740866191360,19753,Twitter for iPhone,"On #NationalDoctorsDay, we recognize the remar..."
3,2020-03-30 17:05:33,1244672122414338048,39114,Twitter for iPhone,https://t.co/nzWJ8ViwbZ
4,2020-03-30 11:17:10,1244584449309892608,43360,Twitter for iPhone,Nancy Pelosi and the Democrats delayed the Wor...


In [29]:
# extract hashtags
def hashtags(text):
    pat = re.compile(r"#(\w+)")
    hashtags = pat.findall(text)
    return hashtags

data['hashtags'] = data.apply(lambda x: hashtags(x['text']), axis=1)

# if tweet contains hashtag (boolean)
data['if_has_hashtag'] = data.apply(lambda x: 1 if len(x['hashtags']) > 0 else 0, axis=1)

# number of hashtags
data['no_hashtag'] = data.apply(lambda x: len(x['hashtags']), axis=1)

# changing float64 to string
data['text'] = data['text'].astype(str)

# remove punctuation
def remove_punctuations(text):
    for punctuation in string.punctuation:
        if punctuation != '!': # leave exclamation mark
            text = text.replace(punctuation, '')
    return text

data['text'] = data.apply(lambda x: remove_punctuations(x['text']), axis=1)

# remove numbers 
def remove_numbers(text):
    return re.sub('[0-9]+', '', text)

data['text'] = data.apply(lambda x: remove_numbers(x['text']), axis=1)


# replace url with "url"
def replace_urls(text):
    return re.sub(r"http\S+", "url", text)

data['text'] = data.apply(lambda x: replace_urls(x['text']), axis=1)

# remove special characters

def remove_special(text):
    for char in ['€','�','‡','†','‰','™','•']:
        text = text.replace(char, '')
    return text

data['text'] = data.apply(lambda x: remove_special(x['text']), axis=1)

# if word starts with uppercase --> lowercase, if all chars are uppercase --> do nothing
def lower_case(text):
    to_lowercase = lambda text: " ".join(word if (word.isupper() == True & len(word) >= 1) else word.lower()
            for word in text.split())
    lowercase = to_lowercase(text)   
    return lowercase

data['text'] = data.apply(lambda x: lower_case(x['text']), axis=1)

#remove elongated words
def remove_elongated(text):
    el = []
    setofwords = set(words.words())
    for word in text.split():
        if word in setofwords:
            pass
        else:
            word=re.sub(r'(?i)(.)\1+', r'\1', word)
        el.append(word)
    return el


#data['text'] = data.apply(lambda x: remove_elongated(x['text']), axis=1)


In [30]:
data.head()

Unnamed: 0,created_at,id_str,retweet_count,source,text,hashtags,if_has_hashtag,no_hashtag
0,2020-03-30 20:50:35,1244728753617620992,14441,Twitter for iPhone,white house news conference at pm eastern than...,[],0,0
1,2020-03-30 17:46:15,1244682364284014592,15520,Twitter for iPhone,url,[],0,0
2,2020-03-30 17:11:59,1244673740866191360,19753,Twitter for iPhone,on nationaldoctorsday we recognize the remarka...,[NationalDoctorsDay],1,1
3,2020-03-30 17:05:33,1244672122414338048,39114,Twitter for iPhone,url,[],0,0
4,2020-03-30 11:17:10,1244584449309892608,43360,Twitter for iPhone,nancy pelosi and the democrats delayed the wor...,[],0,0


In [31]:
# TF-IDF vector extraction

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer

tknzr = TweetTokenizer()

def tokenize(text):
    tokens = tknzr.tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

vectorizer = TfidfVectorizer(tokenizer=tokenize,stop_words='english')
word_features = vectorizer.fit_transform(data['text'])

vectorizer.get_feature_names()
word_features = pd.DataFrame(word_features.todense(), columns = vectorizer.get_feature_names())
word_features.shape
word_features.head(10)

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,!,aaa,aap,ab,abaco,abandon,abba,abbott,abc,abcnew,...,—,‘,’,‚,“,”,„,…,‹,›
0,0.1073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.053141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.059762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.146762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
