In [51]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.util import ngrams
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jennihawk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
#got tweets at 1:15 Pacific Time 10/14/22
tweets = pd.read_csv('/Users/jennihawk/Documents/Data Science/NLP_Unsupervised Learning/Project_NLP/TweetBatch3.csv')
tweets

Unnamed: 0,text,cleaned
0,@ReallyAmerican1 #Roevember and\n#ForThePeople...,roevember and forthepeople and votebluein2022...
1,RT @sandibachom: IS THIS THING ON???!!This is ...,rt is this thing on this is pathetic acting se...
2,RT @sandibachom: IS THIS THING ON???!!This is ...,rt is this thing on this is pathetic acting se...
3,RT @tleehumphrey: Today is the beginning of th...,rt today is the beginning of the inquiry into ...
4,RT @AdamKinzinger: Mitch McConnell.\nKevin McC...,rt mitch mcconnell kevin mccarthy they both kn...
...,...,...
34988,RT @Adrian_Fontes: The January 6th committee j...,rt fontes the january 6th committee just concl...
34989,#January6thCommitteeHearings and everyone runn...,january6thcommitteehearings and everyone runn...
34990,RT @sandibachom: IS THIS THING ON???!!This is ...,rt is this thing on this is pathetic acting se...
34991,So they are gonna subpoena Trump I am guessing...,so they are gonna subpoena trump am guessing t...


### Remove stop words from cleaned column

In [53]:
stop_words = stopwords.words('english')
tweets['cleaned'] = tweets['cleaned'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [54]:
#tweets.head()

### Tokenize Words in Cleaned Column

In [55]:
tweets['tokenized'] = tweets.apply(lambda row: nltk.word_tokenize(row['cleaned']), axis=1)

In [56]:
tweets.head()

Unnamed: 0,text,cleaned,tokenized
0,@ReallyAmerican1 #Roevember and\n#ForThePeople...,roevember forthepeople votebluein2022 standwit...,"[roevember, forthepeople, votebluein2022, stan..."
1,RT @sandibachom: IS THIS THING ON???!!This is ...,rt thing pathetic acting sec defense chris mil...,"[rt, thing, pathetic, acting, sec, defense, ch..."
2,RT @sandibachom: IS THIS THING ON???!!This is ...,rt thing pathetic acting sec defense chris mil...,"[rt, thing, pathetic, acting, sec, defense, ch..."
3,RT @tleehumphrey: Today is the beginning of th...,rt today beginning inquiry trudeau gov use eme...,"[rt, today, beginning, inquiry, trudeau, gov, ..."
4,RT @AdamKinzinger: Mitch McConnell.\nKevin McC...,rt mitch mcconnell kevin mccarthy knew trump r...,"[rt, mitch, mcconnell, kevin, mccarthy, knew, ..."


## Countvectorizer + LSA 

### Countvectorizer
- a method to convert text to numerical data
- By default, Countvectorizer converts the text to lowercase and uses word-level tokenization.
- if using 'tokenize' column  for loops or list comprehension will help handle lists of lists. CountVectorizer takes list. 
- Countervectorize will do word tokenization not other types of tokenization

In [57]:
corpus = list(tweets.cleaned)

In [70]:
#turn corpus into a document-term matrix
vectorizer = CountVectorizer()
doc_term = vectorizer.fit_transform(corpus)
doc_term_df = pd.DataFrame(doc_term.toarray(), columns=vectorizer.get_feature_names())



In [67]:
#dtm.head()

In [71]:
doc_term_df.shape

(34993, 9404)

In [72]:
doc_term_df.iloc[:, : 100]

Unnamed: 0,00,000,01,05,08,09,10,100,1000,1000x,100s,100yds,101,102,10am,10h,10percenttothebigguy,10pm,10th,11,118,1199,11thhour,12,12deoctubre,...,215,216,22,23,23pm,24,240,246,25,256,25th,26,261,271,28,29,292,29m,2a,2d,2day,2ndwin,2pro,30,30k
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34988,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
34989,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
34990,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
34991,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### LSA: Reduce Dimensionality 
- LSA is just SVD on the document term matrix

In [81]:
#2 components: breaks everything down into two topics
lsa = TruncatedSVD(2)
lsa.fit(doc_term)
### END SOLUTION
lsa

In [79]:
topic_term = lsa.components_.round(3)
topic_term

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### Turn topic_term into dataframe

In [80]:
topic_term_df = pd.DataFrame(topic_term.round(3),
                index = ["component_1", "component_2"],
                columns = vectorizer.get_feature_names())
topic_term_df



Unnamed: 0,00,000,01,05,08,09,10,100,1000,1000x,100s,100yds,101,102,10am,10h,10percenttothebigguy,10pm,10th,11,118,1199,11thhour,12,12deoctubre,...,yikes,yo,yoga,yordan,yorhappy,york,youngkin,yourint,youtube,yovanovitch,yowza,yr,yrs,yu,yuge,yup,zacksnyder,zelazny,zero,zim,zimbabwe,zone,zoom,zuma,zzy
component_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.001,-0.0,0.0,0.0,0.0,0.027,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.003,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Easier to read: Function displays the top terms in each topic 

In [82]:
def display_topics(model, feature_names, no_top_words, topic_names = None): 
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix + 1)
        else:
            print("\nTopic: ", topic_names[ix])
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    print("\n")
    return model, feature_names, no_top_words

In [87]:
# number is the number of top terms you want to see
output = display_topics(lsa, vectorizer.get_feature_names(), 10)
output


Topic  1
trump, rt, knew, responsible, called, mcconnell, mccarthy, mitch, kevin, backed

Topic  2
trump, rt, democracy, violence, january6thcommitteehearings, mark, january, decided, 6th, attack




(TruncatedSVD(),
 ['00',
  '000',
  '01',
  '05',
  '08',
  '09',
  '10',
  '100',
  '1000',
  '1000x',
  '100s',
  '100yds',
  '101',
  '102',
  '10am',
  '10h',
  '10percenttothebigguy',
  '10pm',
  '10th',
  '11',
  '118',
  '1199',
  '11thhour',
  '12',
  '12deoctubre',
  '12h',
  '13',
  '139',
  '13th',
  '14',
  '147',
  '14a',
  '14th',
  '14x',
  '15',
  '150',
  '154',
  '157',
  '15s',
  '16',
  '17',
  '1776',
  '18',
  '180',
  '182',
  '1857',
  '187',
  '187minutes',
  '19',
  '1961',
  '1973',
  '1980',
  '1984',
  '1a',
  '1pm',
  '1st',
  '20',
  '2007',
  '2008',
  '2011',
  '2015',
  '2016',
  '2017',
  '2018',
  '2019',
  '202',
  '2020',
  '2021',
  '2022',
  '2022midterms',
  '2023',
  '2024',
  '2030',
  '20th',
  '21',
  '215',
  '216',
  '22',
  '23',
  '23pm',
  '24',
  '240',
  '246',
  '25',
  '256',
  '25th',
  '26',
  '261',
  '271',
  '28',
  '29',
  '292',
  '29m',
  '2a',
  '2d',
  '2day',
  '2ndwin',
  '2pro',
  '30',
  '30k',
  '31',
  '31st',
  '32'

#### Take a look at the top words in the two topics, and using your human brain, name them.

In [88]:
display_topics(lsa, vectorizer.get_feature_names(), 5, ['NotSure1', 'NotSure2']);


Topic:  NotSure1
trump, rt, knew, responsible, called

Topic:  NotSure2
trump, rt, democracy, violence, january6thcommitteehearings




#### Last step: task is to figure out which topics are in each document. Transform the original doc_term matrix into a document-topic matrix and save it as doc_topic.

In [89]:
doc_topic = lsa.transform(doc_term)
doc_topic.shape

(34993, 2)

#### Turn the doc_topic matrix into a dataframe

In [91]:
doc_topic_df = pd.DataFrame(doc_topic.round(5), index = corpus, columns = ["NotSure1", "NotSure2"])
doc_topic_df

Unnamed: 0,NotSure1,NotSure2
roevember forthepeople votebluein2022 standwithukraine peopleoverpolitics oathbreakermaga lgbtqhistorymonth inflationreductionact republicanwaronseniors socialsecurityisourmoney january6thcommitteehearings,0.07867,0.12598
rt thing pathetic acting sec defense chris miller person deploy national,0.59016,0.34938
rt thing pathetic acting sec defense chris miller person deploy national,0.59016,0.34938
rt today beginning inquiry trudeau gov use emergencies act freedomconvoy wha,0.52434,0.25453
rt mitch mcconnell kevin mccarthy knew trump responsible called backed,2.72227,-1.56010
...,...,...
rt fontes january 6th committee concluded final hearing ahead midterms brave leaders remind us,0.64235,0.45669
january6thcommitteehearings everyone running,0.08295,0.12551
rt thing pathetic acting sec defense chris miller person deploy national,0.59016,0.34938
gonna subpoena trump guessing stop calling 45th start calling 5th january6thcommitteehearings,0.68333,0.52330


## TF-IDF + LSA - not done yet

### TF-IDF Vectorizer
- Instead of counts filling up matrix each value will be a TF-IDF value which is the term frequency or the percent of the document that consists of a particular term times the inverse document frequency, which is how rare the term is. 

In [62]:
cv_tfidf = TfidfVectorizer()
X_tfidf = cv_tfidf.fit_transform(corpus).toarray()
tf_idf = pd.DataFrame(X_tfidf, columns = cv_tfidf.get_feature_names())



In [63]:
#tf_idf.head()

In [64]:
tf_idf.iloc[:, : 100]

Unnamed: 0,00,000,01,05,08,09,10,100,1000,1000x,100s,100yds,101,102,10am,10h,10percenttothebigguy,10pm,10th,11,118,1199,11thhour,12,12deoctubre,...,215,216,22,23,23pm,24,240,246,25,256,25th,26,261,271,28,29,292,29m,2a,2d,2day,2ndwin,2pro,30,30k
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
