In [93]:
import pandas as pd
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [59]:
nlp = spacy.load('en_core_web_sm', disable=[ 'parser', 'ner'])

In [50]:
df_raw = pd.read_pickle('df_raw.pickle')

In [51]:
#reindex the columns for easier viewing
cols = df_raw.columns.tolist()

cols.insert(2, cols.pop(cols.index('url')))

df_raw= df_raw.reindex(columns= cols)

### Preprocessing

In [52]:
def preprocess(tweet):
    """
    Takes in tweet and performs initial text cleaning/preprocessing.
    """
    #make sure doc is string
    tweet=str(tweet)
    #lowercase-- not changing anything to lowercase yet due to proper nouns being very important. want to use Spacy to detect later.
    #tweet = tweet.lower()
    #get rid of urls
    rem_url=re.sub(r'http\S+', '', tweet)
    #gets rid of @ tags
    rem_tag = re.sub('@\S+', '', rem_url)
    #gets rid of # in hashtag but keeps content of hashtag
    rem_hashtag = re.sub('#', '', rem_tag)
    #gets rid of numbers
    rem_num = re.sub('[0-9]+', '', rem_hashtag)
    #gets rid of special characters
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', rem_hashtag)

    return cleantext

In [53]:
df_raw['clean']=df_raw['content'].map(lambda x:preprocess(x))

In [54]:
df_raw['clean'].iloc[-1]

'The coronavirus pandemic is devastating U.S. restaurant chains. But  explains why financially stronger companies should see the light of day in the near future. WSJWhatsNow  '

### Tagging and Lemmatizing

We only want the nouns (and proper nouns) in each tweet for topic modeling, as they are the essence of article subjects. Let's tag the nouns and return the lemmatized versions of tehm in a single step.

Because we also have so much data, we will incoporate the NLP pipeline in order to shorten the processing time.

In [75]:
def noun_lemmatize_pipe(doc):
    """
    Takes in tweet and returns only the lemmatized version of nouns (including proper nouns).
    """
    lemma_list = [token.lemma_ for token in doc
                  if token.pos_ == "NOUN" or token.pos_ =="PROPN"] 
    return lemma_list

#create a pipeline in order to shorten processing time
def preprocess_pipe(texts):
    """
    Inputs noun_lemmative_pipe function into NLP pipeline for faster processing.
    """
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=20):
        preproc_pipe.append(noun_lemmatize_pipe(doc))
    return preproc_pipe

In [76]:
df_test = df_raw.head(60)

In [80]:
#let's time our function
%%time
#apply function and create a new column to house the outputs
df_raw['clean_lemmatized'] = preprocess_pipe(df_raw['clean'])


CPU times: user 2min 47s, sys: 738 ms, total: 2min 48s
Wall time: 2min 48s


In [83]:
df_raw[['content', 'clean', 'clean_lemmatized']].head(3)

Unnamed: 0,content,clean,clean_lemmatized
0,The Atlantic Daily: Will this decade be the ne...,The Atlantic Daily: Will this decade be the ne...,"[Atlantic, Daily, decade]"
1,There's plenty that's going wrong for Trump. H...,There's plenty that's going wrong for Trump. H...,"[plenty, Trump, thing, campaign, gap, Joe, Bid..."
2,"If Trump tries to steal the election, people w...","If Trump tries to steal the election, people w...","[Trump, election, people, coup, strategy, write]"


We've successfully filtered out the nouns and proper nouns, lemmitized them, while making sure our function runs on optimized time!

### Remove Additional Words

Let's filter out any additional words that may appear in the tweets but aren't related to article subjects, like the names of the publications and common headline section titles.

In [87]:
removal_words= ["Times", "Wall", "Street", "Journal", "New", "Yorker", "York", "Medium", "Wired", "Financial", "Washington", "Post", "Business", "Insider", "Economist", "The", "Atlantic", "Daily", "Weekly" ]

In [88]:
df_raw['clean_lemmatized'] = df_raw['clean_lemmatized'].apply(lambda x: [word for word in x if word not in removal_words])

In [89]:
df_raw[['content', 'clean', 'clean_lemmatized']].head(3)

Unnamed: 0,content,clean,clean_lemmatized
0,The Atlantic Daily: Will this decade be the ne...,The Atlantic Daily: Will this decade be the ne...,[decade]
1,There's plenty that's going wrong for Trump. H...,There's plenty that's going wrong for Trump. H...,"[plenty, Trump, thing, campaign, gap, Joe, Bid..."
2,"If Trump tries to steal the election, people w...","If Trump tries to steal the election, people w...","[Trump, election, people, coup, strategy, write]"


### Vectorize
Now let's rejoin our twice cleaned, lemmatized list of nouns and pronouns!

In [91]:
df_raw['clean_final'] = df_raw['clean_lemmatized'].apply(lambda x: ' '.join(x))

In [92]:
df_raw[['content', 'clean', 'clean_lemmatized', 'clean_final']].head(3)

Unnamed: 0,content,clean,clean_lemmatized,clean_final
0,The Atlantic Daily: Will this decade be the ne...,The Atlantic Daily: Will this decade be the ne...,[decade],decade
1,There's plenty that's going wrong for Trump. H...,There's plenty that's going wrong for Trump. H...,"[plenty, Trump, thing, campaign, gap, Joe, Bid...",plenty Trump thing campaign gap Joe Biden report
2,"If Trump tries to steal the election, people w...","If Trump tries to steal the election, people w...","[Trump, election, people, coup, strategy, write]",Trump election people coup strategy write


In [95]:
#define vectorizer
tfidf = TfidfVectorizer()
#fit on fully cleaned dataframe column
doc_term_matrix = tfidf.fit_transform(df_raw['clean_final'])
#turn matrix into a dataframe with words as columns
matrix_df = pd.DataFrame(doc_term_matrix.toarray(), columns=tfidf.get_feature_names())

In [96]:
matrix_df

Unnamed: 0,00,000,03,07,08,09,0s,10,100,100bn,...,ﬂoor,ﬂora,ﬂow,ﬂuﬀy,ﬂygskam,ﬂying,𝐀𝐯𝐚𝐢𝐥𝐚𝐛𝐥𝐞,𝐔𝐩𝐝𝐚𝐭𝐞𝐬,𝘧𝘰𝘳,𝘩𝘪𝘮
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [107]:
matrix_df.columns[1:200]

Index(['000', '03', '07', '08', '09', '0s', '10', '100', '100bn', '104bn',
       ...
       '356bn', '35bn', '36', '360bn', '360i', '369bn', '36bn', '37', '370bn',
       '37b'],
      dtype='object', length=199)

### Next Steps
This doc-term matrix with Spacy tagging and lemmitization is much better than only using NLTK. We've cut down by 1000 terms/columns! 

However, we still need to clean out the columns that have special characters and numbers we weren't able to catch the first time around