In [295]:
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import emoji
import spacy

In [297]:
# Load the raw DataFrame from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df_raw = pd.read_pickle('df_raw.pickle')

In [4]:
# Export the raw DataFrame to CSV so it can be kept locally and loaded into a SQL database.
# NOTE(review): this is an absolute, machine-specific path, so the cell fails on any other machine;
# consider a configurable data directory instead.
df_raw.to_csv('/Users/juliaqiao/Documents/Metis/Proj_4_storage/df_raw.csv')

### Preprocessing

In [102]:
# List the column names to see what the raw data contains.
df_raw.columns

Index(['user', 'date', 'outlinks', 'content', 'url'], dtype='object')

In [298]:
# Move the 'url' column to the third position so it is easier to view
# alongside the leading columns.
cols = list(df_raw.columns)
url_position = cols.index('url')
cols.insert(2, cols.pop(url_position))
df_raw = df_raw.reindex(columns=cols)


In [14]:
# get dictionary value of key: username
# handle = [d[0].get('username') for d in df_raw.user if d]

In [303]:
def preprocess(tweet):
    """
    Takes in a tweet and performs initial text cleaning/preprocessing.

    Steps, in order:
      1. coerce the input to ``str``;
      2. strip HTML-style tags such as ``<b>`` or ``</a>``;
      3. strip URLs (anything starting with ``http`` up to the next whitespace);
      4. strip @-mentions;
      5. drop the ``#`` of hashtags but keep the hashtag text;
      6. strip ASCII digits;
      7. strip every remaining character that is not a word character,
         whitespace or a comma (this removes emoji and most punctuation).

    Case is deliberately preserved so proper nouns can still be detected later.
    Whitespace is not collapsed, so removed spans can leave repeated spaces.

    Parameters
    ----------
    tweet : any
        The tweet text; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text.
    """
    # Make sure we are working with a string (e.g. NaN becomes 'nan').
    text = str(tweet)
    # Not lowercasing here: proper nouns matter and casing is needed to detect them later.
    # Tags must be stripped BEFORE the punctuation pass below; once '<' and '>'
    # have been deleted a tag can no longer be recognised. The pattern requires a
    # letter after '<' so text such as "<3" is not mistaken for a tag.
    text = re.sub(r'</?[A-Za-z][^<>]*>', '', text)
    # Remove URLs.
    text = re.sub(r'http\S+', '', text)
    # Remove @ mentions (raw string avoids the invalid "\S" escape warning).
    text = re.sub(r'@\S+', '', text)
    # Remove the '#' of hashtags but keep the hashtag content.
    text = re.sub(r'#', '', text)
    # Remove digits.
    text = re.sub(r'[0-9]+', '', text)
    # Remove emoji and punctuation: anything that is not a word character,
    # whitespace or a comma.
    text = re.sub(r'[^\w\s,]', '', text)

    return text

In [304]:
# Clean every tweet's text and store the result in a new 'clean' column.
df_raw['clean'] = df_raw['content'].map(preprocess)

### Tokenize

In [305]:
# Split each cleaned tweet into a list of word tokens.
# Note: this overwrites 'clean' (string -> list of tokens), so the cell
# must not be run twice without re-running the preprocessing cell first.
tt = TreebankWordTokenizer()
tokenized = df_raw['clean'].apply(tt.tokenize)
df_raw['clean'] = tokenized

In [306]:
# Spot check: show the tokens of the last row.
df_raw['clean'].iloc[-1]

['The',
 'coronavirus',
 'pandemic',
 'is',
 'devastating',
 'US',
 'restaurant',
 'chains',
 'But',
 'explains',
 'why',
 'financially',
 'stronger',
 'companies',
 'should',
 'see',
 'the',
 'light',
 'of',
 'day',
 'in',
 'the',
 'near',
 'future',
 'WSJWhatsNow']

### Lemmatize

In [307]:
# Replace every token with its WordNet lemma (e.g. 'chains' -> 'chain').
# NOTE(review): the sample outputs further down contain tokens such as 'ha' and 'u',
# presumably "has" and "us" lemmatized as if they were plural nouns; passing a
# part-of-speech tag to lemmatize() may avoid this. Confirm before changing.
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the lemma of each token, preserving order."""
    return [lemmatizer.lemmatize(token) for token in tokens]


df_raw['clean'] = df_raw['clean'].apply(_lemmatize_tokens)

In [308]:
# Spot check: show the last row again to see the effect of lemmatization.
df_raw['clean'].iloc[-1]

['The',
 'coronavirus',
 'pandemic',
 'is',
 'devastating',
 'US',
 'restaurant',
 'chain',
 'But',
 'explains',
 'why',
 'financially',
 'stronger',
 'company',
 'should',
 'see',
 'the',
 'light',
 'of',
 'day',
 'in',
 'the',
 'near',
 'future',
 'WSJWhatsNow']

### Remove Stop Words

In [309]:
# Load NLTK's default English stop-word list (the removal itself happens in the next cell).
stop_words = list(stopwords.words('english'))

In [310]:
# Drop every token that exactly matches a stop word.
# The comparison is case-sensitive, so capitalised forms such as 'The' or 'But'
# are kept here (casing is being preserved for proper-noun detection).
# A set is built once so each membership test is a hash lookup instead of a
# scan through the whole list; the resulting tokens are unchanged.
stop_word_lookup = frozenset(stop_words)
df_raw['clean'] = df_raw['clean'].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_lookup]
)

In [311]:
# Spot check: show the last row again to see which stop words were removed.
df_raw['clean'].iloc[-1]

['The',
 'coronavirus',
 'pandemic',
 'devastating',
 'US',
 'restaurant',
 'chain',
 'But',
 'explains',
 'financially',
 'stronger',
 'company',
 'see',
 'light',
 'day',
 'near',
 'future',
 'WSJWhatsNow']

We did not use a stemmer; the lemmatizer gave better results.

Remove any leftover punctuation:


In [312]:
# Tokens consisting only of punctuation or clitic fragments that should be dropped.
punctuation = [
    '.', '/', ',', '%', '!', ':', "'s", "'ve", '"', '*',
    '“', "’", "`", '?', '”', "``", "‘",
]
df_raw['clean'] = df_raw['clean'].apply(
    lambda tokens: [token for token in tokens if token not in punctuation]
)

In [313]:
# Spot-check a random handful of rows; the fixed seed keeps the displayed
# sample reproducible across runs.
df_raw['clean'].sample(20, random_state=42)

6578     [This, health, scare, ha, created, nation, met...
16710    [Driven, belief, disruption, progress, cost, I...
7764     [If, President, Trump, continues, run, populis...
14408    [For, decade, told, desalination, would, one, ...
6442     [The, country, wonder, whether, politician, st...
7123     [Thousands, Migrants, Attempt, Cross, Europe, ...
14387    [If, Turkeys, president, get, way, fourth, tra...
12761    [Two, dam, Michigan, breached, day, heavy, rai...
8487     [As, lockdown, end, cycling, look, set, helpin...
19247    [What, Meryl, Streep, The, Post, teach, u, wom...
5402     [Nikolas, share, price, day, investment, firm,...
10644    [In, Opinion, Kids, still, miss, sleepover, Sc...
2715     [Because, staying, close, home, key, safety, p...
2108     [Are, eager, help, shape, The, Economists, dig...
12112    [Indigenous, asylumseekers, face, disproportio...
8752     [This, animal, liberation, group, actually, wa...
7200     [Teenage, brain, run, raw, emotion, Thats, nec.

### Remove any additional words like pronouns and other fillers

In [314]:
# Extra filler words (pronouns, determiners, common function words) to drop in
# addition to the NLTK stop words. Both capitalised and lowercase forms are
# listed because the tokens have not been lowercased.
# Fix: commas were missing between "hers"/"Hers" and between "is"/"was"; Python
# silently concatenated them into "hersHers" and "iswas", so those four words
# were never filtered.
# NOTE(review): "Came"/"come" is not a matching upper/lower pair like the other
# entries; confirm whether "Come" and "came" were intended as well.
pronoun_words = [
    'The', "the", "I", "he", "He", "She", "she", "her", "Her",
    "We", "we", "Me", "me", "Us", "us", "it", "It",
    "Them", "them", "They", "they", "There", "there",
    "that", "That", "This", "this", "You", "you",
    "who", "Who", "whom", "Whom", "whose", "Whose",
    "what", "What", "which", "Which",
    "my", "My", "mine", "Mine", "your", "Your", "yours", "Yours",
    "his", "His", "hers", "Hers", "its", "Its", "It's", "it's",
    "Is", "is", "was", "Was", "In", "in",
    "across", "Across", "rather", "Rather", "roughly", "Roughly",
    "why", "Why", "where", "Where", "here", "Here",
    "A", "a", "Some", "some", "few", "Few", "None", "none",
    "more", "More", "Yes", "yes", "No", "no", "If", "if",
    "Let", "let", "Let's", "let's", "Came", "come", "go", "Go",
]

In [315]:
# Drop every token that exactly matches one of the filler words.
# A set is built once so each membership test is a hash lookup instead of a
# scan through the whole list; the resulting tokens are unchanged.
pronoun_lookup = frozenset(pronoun_words)
df_raw['clean'] = df_raw['clean'].apply(
    lambda tokens: [token for token in tokens if token not in pronoun_lookup]
)

In [319]:
# Spot-check a random handful of rows after filler-word removal; the fixed
# seed keeps the displayed sample reproducible across runs.
df_raw['clean'].sample(20, random_state=42)

18307    [Gioncarlo, Valentines, photo, series, Soft, F...
19524    [Yet, Massachusetts, like, many, state, ha, co...
2618     [past, decade, ha, witnessed, dangerous, trend...
17317    [Tracing, killer, Crime, Punishment, tour, St,...
5634     [beginning, presidency, unique, logic, charact...
8592     [Trump, expected, sign, drug, pricing, executi...
7321     [Post, making, analysis, US, climate, data, ac...
13227    [coronavirus, pandemic, ha, taken, incalculabl...
10342    [never, reduce, risk, infection, zero, But, st...
18469    [Mitski, obsessive, fan, growing, independents...
12387    [Movies, crossword, topic, contention, Marriag...
767      [Moroccan, sex, crime, trial, fuel, fear, crac...
5648     [Watch, full, interview, ECB, president, speak...
7576     [weekend, brings, Leap, Year, sale, thats, rea...
6699     [Trumps, assault, US, Postal, Service, give, D...
833      [Borowitz, Report, Donald, Trump, said, much, ...
9777     [ECB, push, eurozone, bad, bank, clean, soured.

### Vectorize

In [320]:
# The vectorizer expects one string per document, so glue each token list
# back together with single spaces.
df_raw['clean_vec'] = df_raw['clean'].apply(' '.join)


In [321]:
# Display the re-joined documents that will be fed to the vectorizer.
df_raw['clean_vec']

0                           Atlantic Daily Will decade new
1        Theres plenty thats going wrong Trump four thi...
2        Trump try steal election people surely protest...
3        Trump campaign electionsecurity operation key ...
4        Even Joe Biden win decisively next week coming...
                               ...                        
19995    General Mills make Cheerios cereal Yoplait yog...
19996    From When viral panic subsides Fed rethink mor...
19997            government want send moneybut soon arrive
19998    frozen asparagus said one disappointed shopper...
19999    coronavirus pandemic devastating US restaurant...
Name: clean_vec, Length: 200000, dtype: object

In [322]:
# TF-IDF vectorizer with scikit-learn's default settings.
# NOTE(review): the defaults appear to lowercase the text (the feature names displayed
# further down are all lowercase), which would discard the casing that preprocess()
# deliberately preserves; confirm this is intended.
tfidf = TfidfVectorizer()


In [323]:
# Learn the vocabulary from the cleaned documents and build the document-term matrix.
doc_term_matrix = tfidf.fit_transform(df_raw['clean_vec'])

In [324]:
# Convert the document-term matrix to a dense DataFrame with one column per term.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back for older releases.
# NOTE(review): toarray() materialises every zero, so memory use grows with
# (documents x vocabulary size); keep the matrix sparse if this becomes a problem.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
df = pd.DataFrame(doc_term_matrix.toarray(), columns=feature_names)

In [325]:
# Preview the first rows of the dense document-term DataFrame.
df.head()

Unnamed: 0,__,___,____,_____,_______________________,__________________________,_________________________________,__zz,_zz,_ツ_,...,ﬂora,ﬂow,ﬂuﬀy,ﬂy,ﬂygskam,ﬂying,𝐀𝐯𝐚𝐢𝐥𝐚𝐛𝐥𝐞,𝐔𝐩𝐝𝐚𝐭𝐞𝐬,𝘧𝘰𝘳,𝘩𝘪𝘮
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [326]:
# Peek at a slice of the vocabulary (column positions 40000 up to, not including, 41000).
df.columns[40000:41000]

Index(['lucretiuss', 'lucy', 'ludic', 'ludicrous', 'ludicrously',
       'ludicrousness', 'ludicrousshe', 'ludlow', 'ludosky', 'ludovic',
       ...
       'marcel', 'marcela', 'marcelino', 'marcella', 'marcellis', 'marcelo',
       'march', 'marcha', 'marchand', 'marcharse'],
      dtype='object', length=1000)

In [293]:
# Persist the dense document-term DataFrame to the working directory.
# NOTE(review): the frame is dense, so the resulting file may be very large.
df.to_pickle("doc_term_matrix_0.pickle")

### Scratch

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

class DenseTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose transform methods return dense, labelled DataFrames.

    Both ``transform`` and ``fit_transform`` delegate to the parent class and
    then convert the resulting matrix into a ``pandas.DataFrame`` with one
    column per vocabulary term.
    """

    def _to_frame(self, matrix):
        """Densify ``matrix`` and label its columns with the vocabulary terms."""
        # get_feature_names() was removed in scikit-learn 1.2; use its
        # replacement when available and fall back for older releases.
        if hasattr(self, "get_feature_names_out"):
            terms = self.get_feature_names_out()
        else:
            terms = self.get_feature_names()
        return pd.DataFrame(matrix.toarray(), columns=terms)

    def transform(self, raw_documents, copy=True):
        """Transform documents with the fitted vocabulary and return a DataFrame.

        ``copy`` is kept in the signature so existing callers keep working, but
        it is not forwarded: newer scikit-learn releases no longer accept a
        ``copy`` argument on ``TfidfVectorizer.transform`` and would raise
        ``TypeError``.
        """
        return self._to_frame(super().transform(raw_documents))

    def fit_transform(self, raw_documents, y=None):
        """Fit on ``raw_documents`` and return their TF-IDF matrix as a DataFrame."""
        return self._to_frame(super().fit_transform(raw_documents, y=y))