### Models:
- LSA + CountVectorize 
- LSA + TF-IDF 

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.util import ngrams
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jennihawk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#got tweets at 1:15 Pacific Time 10/14/22
tweets = pd.read_csv('/Users/jennihawk/Documents/Data Science Projects/Project_NLP/TweetBatch3.csv')
tweets

Unnamed: 0,text,cleaned
0,@ReallyAmerican1 #Roevember and\n#ForThePeople...,roevember and forthepeople and votebluein2022...
1,RT @sandibachom: IS THIS THING ON???!!This is ...,rt is this thing on this is pathetic acting se...
2,RT @sandibachom: IS THIS THING ON???!!This is ...,rt is this thing on this is pathetic acting se...
3,RT @tleehumphrey: Today is the beginning of th...,rt today is the beginning of the inquiry into ...
4,RT @AdamKinzinger: Mitch McConnell.\nKevin McC...,rt mitch mcconnell kevin mccarthy they both kn...
...,...,...
34988,RT @Adrian_Fontes: The January 6th committee j...,rt fontes the january 6th committee just concl...
34989,#January6thCommitteeHearings and everyone runn...,january6thcommitteehearings and everyone runn...
34990,RT @sandibachom: IS THIS THING ON???!!This is ...,rt is this thing on this is pathetic acting se...
34991,So they are gonna subpoena Trump I am guessing...,so they are gonna subpoena trump am guessing t...


### Remove stop words from cleaned column

In [3]:
stop_words = stopwords.words('english')
stop_words.extend(['rt', 'january', 'january6thcommitteehearings', 'ja', 'january6thcomm'])
tweets['cleaned'] = tweets['cleaned'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [4]:
#tweets.head()

### Tokenize Words in Cleaned Column

In [5]:
tweets['tokenized'] = tweets.apply(lambda row: nltk.word_tokenize(row['cleaned']), axis=1)

In [6]:
tweets.head()

Unnamed: 0,text,cleaned,tokenized
0,@ReallyAmerican1 #Roevember and\n#ForThePeople...,roevember forthepeople votebluein2022 standwit...,"[roevember, forthepeople, votebluein2022, stan..."
1,RT @sandibachom: IS THIS THING ON???!!This is ...,thing pathetic acting sec defense chris miller...,"[thing, pathetic, acting, sec, defense, chris,..."
2,RT @sandibachom: IS THIS THING ON???!!This is ...,thing pathetic acting sec defense chris miller...,"[thing, pathetic, acting, sec, defense, chris,..."
3,RT @tleehumphrey: Today is the beginning of th...,today beginning inquiry trudeau gov use emerge...,"[today, beginning, inquiry, trudeau, gov, use,..."
4,RT @AdamKinzinger: Mitch McConnell.\nKevin McC...,mitch mcconnell kevin mccarthy knew trump resp...,"[mitch, mcconnell, kevin, mccarthy, knew, trum..."


## Countvectorizer + LSA 

### Countvectorizer
- a method to convert text to numerical data
- By default, Countvectorizer converts the text to lowercase and uses word-level tokenization.
- Countervectorize will do word tokenization not other types of tokenization. CountVectorizer takes list. 

In [7]:
corpus = list(tweets.cleaned)

In [8]:
#turn corpus into a document-term matrix
vectorizer = CountVectorizer()
doc_term = vectorizer.fit_transform(corpus)
doc_term_df = pd.DataFrame(doc_term.toarray(), columns=vectorizer.get_feature_names_out())

In [9]:
doc_term_df.shape

(34993, 9399)

In [10]:
#word_count = sum([len(d.split(' ')) for d in corpus]) > 100000
#word_count

#### Reduce Dimensionality 
- LSA is just SVD on the document term matrix

In [11]:
#TruncatedSVD breaks everything down into designated number of topics. Components = number of topics
lsa = TruncatedSVD(4)
lsa.fit(doc_term)
### END SOLUTION
lsa

TruncatedSVD(n_components=4)

In [12]:
topic_term = lsa.components_.round(3)
topic_term

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0., -0., -0., ..., -0., -0., -0.],
       [-0., -0.,  0., ..., -0., -0., -0.]])

#### Turn topic_term into dataframe

In [31]:
topic_term_df = pd.DataFrame(topic_term.round(3),
                index = ["component_1", "component_2", "component_3", "component_4"],
                columns = vectorizer.get_feature_names_out())
#topic_term_df

#### Easier to read: Function displays the top terms in each topic 

In [14]:
def display_topics(model, feature_names, no_top_words, topic_names = None): 
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix + 1)
        else:
            print("\nTopic: ", topic_names[ix])
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    print("\n")
    return model, feature_names, no_top_words

#### Get 8 Top Terms for each Topic

In [32]:
# number is the number of top terms you want to see
output = display_topics(lsa, vectorizer.get_feature_names_out(), 8)
output


Topic  1
trump, knew, responsible, called, mcconnell, mccarthy, mitch, kevin

Topic  2
trump, violence, democracy, 6th, attack, mark, summoned, excuse

Topic  3
mark, democracy, gop, country, maga, without, love, overthrowing

Topic  4
lost, stone, roger, video, help, coup, yet, new




(TruncatedSVD(n_components=4),
 array(['00', '000', '01', ..., 'zoom', 'zuma', 'zzy'], dtype=object),
 8)

#### Take a look at the top words in the topics, and using your human brain, name them.

In [16]:
display_topics(lsa, vectorizer.get_feature_names(), 8, ['Mitch_Kevin_Knew', 'Hamill_Democracy1','Hamill_Democracy2','Roger_Stone_Video']);


Topic:  Mitch_Kevin_Knew
trump, knew, responsible, called, mcconnell, mccarthy, mitch, kevin

Topic:  Hamill_Democracy1
trump, violence, democracy, 6th, attack, mark, summoned, excuse

Topic:  Hamill_Democracy2
mark, democracy, gop, country, maga, without, love, overthrowing

Topic:  Roger_Stone_Video
lost, stone, roger, video, help, coup, yet, new




#### Determine which topics are in each document. Transform the original doc_term matrix into a document-topic matrix.

In [17]:
doc_topic = lsa.transform(doc_term)
doc_topic.shape

(34993, 4)

#### Turn the doc_topic matrix into a dataframe
- Numbers tell you how much of each topic is in the document.

In [18]:
doc_topic_df = pd.DataFrame(doc_topic.round(5), index = corpus, columns = ['Mitch_Kevin_Knew', 'Hamill_Democracy1','Hamill_Democracy2','Roger_Stone_Video'])
doc_topic_df.head(50)

Unnamed: 0,Mitch_Kevin_Knew,Hamill_Democracy1,Hamill_Democracy2,Roger_Stone_Video
roevember forthepeople votebluein2022 standwithukraine peopleoverpolitics oathbreakermaga lgbtqhistorymonth inflationreductionact republicanwaronseniors socialsecurityisourmoney,0.00019,0.00038,-2e-05,-0.00015
thing pathetic acting sec defense chris miller person deploy national,0.00289,0.00711,-0.00261,0.0022
thing pathetic acting sec defense chris miller person deploy national,0.00289,0.00711,-0.00261,0.0022
today beginning inquiry trudeau gov use emergencies act freedomconvoy wha,0.00358,0.00789,-0.00321,-0.00147
mitch mcconnell kevin mccarthy knew trump responsible called backed,2.70572,-1.23921,0.16604,0.02889
shit heavy take care,0.00239,0.00445,-0.00191,-0.00155
another graphic designed talented team go ahead use share tweet truth pl,0.00374,0.00733,-0.00326,-0.0021
new video trump lost yet decided hatch coup help mark meadows roger stone rudy giuliani,0.98676,1.67205,1.39896,3.01901
still cant get fact nancy pelosi sh ts wanted smoke donald trump,0.73402,0.71543,-0.27109,-0.15885
maga gop mark hamill correct love country without creating violence overthrowing democracy,0.15235,0.66286,3.02904,-1.46592


## LSA + TF-IDF 

### TF-IDF Vectorizer
- Instead of counts filling up matrix each value will be a TF-IDF value which is the term frequency or the percent of the document that consists of a particular term times the inverse document frequency, which is how rare the term is. 

In [33]:
cv_tfidf = TfidfVectorizer()
tf_doc_term = cv_tfidf.fit_transform(corpus).toarray()
tf_doc_term_df = pd.DataFrame(tf_doc_term, columns = cv_tfidf.get_feature_names_out())

In [20]:
tf_doc_term.shape

(34993, 9399)

In [21]:
tf_doc_term_df.iloc[:, : 100]

Unnamed: 0,00,000,01,05,08,09,10,100,1000,1000x,...,29,292,29m,2a,2d,2day,2ndwin,2pro,30,30k
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### LSA: Reduce Dimensionality 
- LSA is just SVD on the document term matrix

In [22]:
lsa2 = TruncatedSVD(4)
lsa2.fit(tf_doc_term)
lsa2

TruncatedSVD(n_components=4)

In [23]:
tf_topic_term = lsa2.components_.round(3)
tf_topic_term

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### Turn topic_term into dataframe

In [24]:
tf_topic_term_df = pd.DataFrame(tf_topic_term.round(3),
                index = ["component_1", "component_2", "component_3", "component_4"],
                columns = cv_tfidf.get_feature_names_out())
tf_topic_term_df

Unnamed: 0,00,000,01,05,08,09,10,100,1000,1000x,...,yup,zacksnyder,zelazny,zero,zim,zimbabwe,zone,zoom,zuma,zzy
component_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,-0.0,0.0,...,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Easier to read: Function displays the top terms in each topic 

In [25]:
def display_topics(model, feature_names, no_top_words, topic_names = None): 
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix + 1)
        else:
            print("\nTopic: ", topic_names[ix])
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    print("\n")
    return model, feature_names, no_top_words

#### Get 8 terms for each topic

In [36]:
# number is the number of top terms you want to see
output = display_topics(lsa2, cv_tfidf.get_feature_names_out(), 8)
output


Topic  1
kevin, backed, mccarthy, mitch, mcconnell, responsible, called, knew

Topic  2
mark, democracy, violence, hamill, creating, correct, overthrowing, love

Topic  3
decided, hatch, rudy, giuliani, meadows, lost, yet, stone

Topic  4
demands, oath, unanimously, deserves, history, voted, subpoena, testify




(TruncatedSVD(n_components=4),
 array(['00', '000', '01', ..., 'zoom', 'zuma', 'zzy'], dtype=object),
 8)

#### Name the topics

In [27]:
display_topics(lsa2, cv_tfidf.get_feature_names(), 8, ['Mitch_Kevin_Knew', 'Hamill_Democracy','Meadows_Guiliani_Stone','Subpoena_Trump']);


Topic:  Mitch_Kevin_Knew
kevin, backed, mccarthy, mitch, mcconnell, responsible, called, knew

Topic:  Hamill_Democracy
mark, democracy, violence, hamill, creating, correct, overthrowing, love

Topic:  Meadows_Guiliani_Stone
decided, hatch, rudy, giuliani, meadows, lost, yet, stone

Topic:  Subpoena_Trump
demands, oath, unanimously, deserves, history, voted, subpoena, testify




#### Determine which topics are in each document. Transform the original doc_term matrix into a document-topic matrix and save it as doc_topic.

In [28]:
doc_topic2 = lsa2.transform(doc_term)
doc_topic2.shape

(34993, 4)

#### Turn doc_topic matrix into a dataframe

In [29]:
doc_topic_df2 = pd.DataFrame(doc_topic2.round(5), index = corpus, columns = ["Mitch_Kevin_Knew", "Hamill_Democracy","Meadows_Guiliani_Stone","Subpoena_Trump"])
doc_topic_df2

Unnamed: 0,Mitch_Kevin_Knew,Hamill_Democracy,Meadows_Guiliani_Stone,Subpoena_Trump
roevember forthepeople votebluein2022 standwithukraine peopleoverpolitics oathbreakermaga lgbtqhistorymonth inflationreductionact republicanwaronseniors socialsecurityisourmoney,0.00002,0.00016,0.00011,0.00038
thing pathetic acting sec defense chris miller person deploy national,0.00020,0.00288,0.00910,0.00836
thing pathetic acting sec defense chris miller person deploy national,0.00020,0.00288,0.00910,0.00836
today beginning inquiry trudeau gov use emergencies act freedomconvoy wha,0.00026,0.00242,0.00319,0.00389
mitch mcconnell kevin mccarthy knew trump responsible called backed,2.98070,0.01766,0.03323,0.05555
...,...,...,...,...
fontes 6th committee concluded final hearing ahead midterms brave leaders remind us,0.00846,0.05987,0.04541,0.10605
everyone running,0.00124,0.00272,0.00378,0.00762
thing pathetic acting sec defense chris miller person deploy national,0.00020,0.00288,0.00910,0.00836
gonna subpoena trump guessing stop calling 45th start calling 5th,0.23377,0.22353,0.21675,0.53459
