In [196]:
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import spacy

In [190]:
df_raw = pd.read_pickle('df_raw.pickle')

In [4]:
#turn to csv so we can store locally and store as database in sql
df_raw.to_csv('/Users/juliaqiao/Documents/Metis/Proj_4_storage/df_raw.csv')

### Preprocessing

In [102]:
df_raw.columns

Index(['user', 'date', 'outlinks', 'content', 'url'], dtype='object')

In [191]:
#reindex the columns for easier viewing
cols = df_raw.columns.tolist()

cols.insert(2, cols.pop(cols.index('url')))

df_raw= df_raw.reindex(columns= cols)


In [14]:
# get dictionary value of key: username
# handle = [d[0].get('username') for d in df_raw.user if d]

In [192]:
def preprocess(tweet):
    """
    Takes in tweet and performs initial text cleaning/preprocessing.
    """
    #make sure doc is string
    tweet=str(tweet)
    #lowercase-- not changing anything to lowercase yet due to proper nouns being very important. want to use Spacy to detect later.
    #tweet = tweet.lower()
    #get rid of urls
    rem_url=re.sub(r'http\S+', '', tweet)
    #gets rid of @ tags
    rem_tag = re.sub('@\S+', '', rem_url)
    #gets rid of # in hashtag but keeps content of hashtag
    rem_hashtag = re.sub('#', '', rem_tag)
    #gets rid of numbers- holding off for now due to numbers being a great way to conserve space in twitter so they may be significant
    #rem_num = re.sub('[0-9]+', '', rem_hashtag)
    #gets rid of special characters
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', rem_hashtag)

    return cleantext

In [193]:
df_raw['clean']=df_raw['content'].map(lambda x:preprocess(x))

### Tokenize

In [194]:
tt = TreebankWordTokenizer()
df_raw['clean'] = df_raw['clean'].apply(tt.tokenize)

 ### Lemmatize 

In [202]:
lemmatizer = WordNetLemmatizer()
df_raw['clean'] = df_raw['clean'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x]) # Lemmatize every word.

In [203]:
#run example
df_raw['clean'].iloc[-1]

['The',
 'coronavirus',
 'pandemic',
 'is',
 'devastating',
 'U.S.',
 'restaurant',
 'chains.',
 'But',
 'explains',
 'why',
 'financially',
 'stronger',
 'company',
 'should',
 'see',
 'the',
 'light',
 'of',
 'day',
 'in',
 'the',
 'near',
 'future.',
 'WSJWhatsNow']

### Remove Stop Words

In [204]:
stop_words = list(stopwords.words('english'))
#stop_words

In [205]:
#stop_words = set(stopwords.words('english'))
df_raw['clean'] = df_raw['clean'].apply(lambda x: [word for word in x if word not in stop_words])

In [207]:
#run example
df_raw['clean'].iloc[-1]

['The',
 'coronavirus',
 'pandemic',
 'devastating',
 'U.S.',
 'restaurant',
 'chains.',
 'But',
 'explains',
 'financially',
 'stronger',
 'company',
 'see',
 'light',
 'day',
 'near',
 'future.',
 'WSJWhatsNow']

We did not use a stemmer. Lemmatizer was better.

Remove any leftover punctuation:


In [237]:
punctuation = ['.', '/', ',', '%', '!', ':', "'s", "'ve", '"', '*', '“', "’", "`", '?', '”', "``", "‘"] 
df_raw['clean'] = df_raw['clean'].apply(lambda x: [word for word in x if word not in punctuation])

In [238]:
df_raw['clean'].sample(20)

18215    [House, affordability, actually, getting, bett...
1146     [Facts, truth, writes., The, truth, Breonna, T...
615                                 [Just, doe, n't, mean]
19380    [Like, surly, teen-ager, new, drama, series, E...
9647     [Cut, eye, flatworm, —, grow, back., Stick, ey...
13355    [The, Svalbard, Satellite, Station, sits, insi...
6303       [What, take, contortionist, Cirque, du, Soleil]
16309    [I, 'm, private, online, tutor, wealthy, famil...
2425     [', I, 'm, extremely, concerned, ', A, former,...
17025    [Former, World, Bank, president, fault, Trump,...
8999     [Advocates, believe, CBG, could, become, viabl...
10764    [Opinion, The, problem, relying, precedent, pr...
19511    [Brands, scrambling, amend, remove, rework, ad...
12697    [Miss, today, AppleEvent, Here, breakdown, cat...
16606    [Justice, Dept., scrutinizes, White, House-con...
17072    [In, one, many, provocative, grammar, take, su...
13376    [The, Parthenon, Marbles, British, Museum, sin.

### Vectorize

In [239]:
df_raw['clean_vec'] = df_raw['clean'].apply(lambda x: ' '.join(x))


In [240]:
df_raw['clean_vec']

0                 The Atlantic Daily Will decade new 1850s
1        There plenty going wrong Trump. Here four thin...
2        If Trump try steal election people surely prot...
3        The Trump campaign election-security operation...
4        Even Joe Biden win decisively next week coming...
                               ...                        
19995    General Mills make Cheerios cereal Yoplait yog...
19996    From When viral panic subsides Fed rethink mor...
19997       The government want send money—but soon arrive
19998    They frozen asparagus said one disappointed sh...
19999    The coronavirus pandemic devastating U.S. rest...
Name: clean_vec, Length: 200000, dtype: object

In [241]:
tfidf = TfidfVectorizer()


In [256]:
doc_term_matrix = tfidf.fit_transform(df_raw['clean_vec'])

In [257]:
df = pd.DataFrame(doc_term_matrix.toarray(), columns=tfidf.get_feature_names())

In [258]:
df.head()

Unnamed: 0,00,000,00000001,000003,0000047,000km,000kph,000m,000th,001,...,ﬂora,ﬂow,ﬂuﬀy,ﬂy,ﬂygskam,ﬂying,𝐀𝐯𝐚𝐢𝐥𝐚𝐛𝐥𝐞,𝐔𝐩𝐝𝐚𝐭𝐞𝐬,𝘧𝘰𝘳,𝘩𝘪𝘮
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [255]:
df.columns[2000:2100]

Index(['abril', 'abroad', 'abrogation', 'abrupt', 'abruptly', 'abruzzi', 'abs',
       'absence', 'absent', 'absentee', 'absenteeism', 'absentia', 'absinthe',
       'absolute', 'absolutely', 'absolutes', 'absolution', 'absolutism',
       'absolve', 'absolved', 'absolves', 'absorb', 'absorbable', 'absorbed',
       'absorber', 'absorbing', 'absorbs', 'absorption', 'abstain',
       'abstained', 'abstaining', 'abstains', 'abstemious', 'abstinence',
       'abstract', 'abstracted', 'abstraction', 'abstruse', 'absurd',
       'absurdism', 'absurdist', 'absurdity', 'absurdly', 'abt', 'abu',
       'abubakar', 'abuela', 'abundance', 'abundant', 'abundantly', 'abus',
       'abuse', 'abused', 'abuser', 'abusers', 'abuses', 'abusing', 'abusive',
       'abusiveness', 'abusó', 'abut', 'abuzz', 'aby', 'abysmal', 'abyss',
       'ac', 'aca', 'academia', 'academic', 'academically', 'academics',
       'academy', 'acadia', 'acapulco', 'acb', 'acc', 'accede', 'accel',
       'accelerant', 'acceler

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

class DenseTfidfVectorizer(TfidfVectorizer):

    def transform(self, raw_documents, copy=True):
        X = super().transform(raw_documents, copy=copy)
        df = pd.DataFrame(X.toarray(), columns=self.get_feature_names())
        return df

    def fit_transform(self, raw_documents, y=None):
        X = super().fit_transform(raw_documents, y=y)
        df = pd.DataFrame(X.toarray(), columns=self.get_feature_names())
        return df