In [1]:
import json
import os
import random
import time
from collections import defaultdict

import nltk
import pandas as pd
import tweepy
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

In [2]:
# Uncomment and run this cell if gensim (imported further down for the LDA model) is not installed.
#! python -mpip install gensim

In [3]:
# Read the API bearer token from the TWITTER_BEARER_TOKEN environment variable so
# the real credential never has to be saved inside the notebook. The placeholder
# string is kept as the fallback for readers who prefer to paste their token here.
bearer = os.environ.get("TWITTER_BEARER_TOKEN", "ENTER YOUR BEARER TOKEN")

In [4]:
# Build the tweepy client from the bearer token. wait_on_rate_limit=True asks tweepy
# to wait when a rate limit is hit instead of raising (see the tweepy.Client docs).
client = tweepy.Client(bearer_token=bearer, wait_on_rate_limit=True)

def get_tweets(client, input_query, n=1000):
    """
    Yield up to ``n`` recent tweets matching ``input_query``.

    Each yielded tweet has its author's user record attached under the
    ``"author"`` key of ``tweet.data``.

    Parameters
    ----------
    client : tweepy.Client
        Client whose ``search_recent_tweets`` method is paginated.
    input_query : str
        Search query passed through to the API unchanged.
    n : int, default 1000
        Maximum number of tweets to yield. When ``n`` is zero or negative
        nothing is requested and nothing is yielded.

    Yields
    ------
    The tweet objects returned by the paginator. ``tweet.data["author"]``
    holds the matching user's data, or ``None`` when the response did not
    include that user.
    """
    if n <= 0:
        return
    page_size = 100
    # One page more than strictly needed, as slack in case a page comes back
    # with fewer than page_size tweets; the early return below stops at n.
    pages = n // page_size + 1
    yielded = 0
    for tweet_batch in tweepy.Paginator(client.search_recent_tweets, input_query,
                                        tweet_fields=["created_at", "public_metrics", "entities"],
                                        expansions=["author_id"],
                                        user_fields=["username", "public_metrics"],
                                        max_results=page_size, limit=pages):
        # user data is sent in a package alongside the returned tweets; it can
        # be absent when the page has no results
        users = (tweet_batch.includes or {}).get("users", [])
        user_lookup = {u.id: u.data for u in users}
        # an empty page carries data=None rather than an empty list
        for tweet in tweet_batch.data or []:
            data = tweet.data
            # attach the author's user record to the tweet payload
            data["author"] = user_lookup.get(tweet.author_id)
            yielded += 1
            yield tweet
            # stop exactly at the nth tweet otherwise the api will return the rest
            # of the data from the same page.
            if yielded == n:
                return

# Collecting Tweets for Virginia Tech, Harvard, and Stanford

Please note that we use three different topics in this case just for clarity. You can collect a large set of tweets from Virginia Tech and still be able to do the topic modeling as follows.  This will probably take some time to run.

In [5]:
input_queries = ['"Virginia Tech"', "Harvard", "Stanford"]

# Number of tweets to download per query; passed to get_tweets below.
download_tweet_count = 1000
# NOTE(review): `seen` is not used in this cell; kept in case other cells rely on it.
seen = {}
# Gather every query's records in one list and build the DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies it on every pass.
all_records = []
for input_query in input_queries:
    # Download, skipping retweets, English-language tweets only
    input_query_nort = "{} -is:retweet lang:en".format(input_query)
    print(input_query)
    q_dataset = []
    for i, tweet in enumerate(get_tweets(client, input_query_nort, n=download_tweet_count)):
        data = tweet.data
        # remember which query produced this tweet
        data["topic"] = input_query
        q_dataset.append(data)
        if (i + 1) % 100 == 0:
            print("processed {} tweets: saved {}".format(i + 1, len(q_dataset)))
    all_records.extend(q_dataset)
# Flatten nested fields (e.g. author.username) into dotted column names.
dataset = pd.json_normalize(all_records)
dataset.to_json("tweets-lda.jsonl", orient="records", lines=True)


"Virginia Tech"
processed 100 tweets: saved 100
processed 200 tweets: saved 200
processed 300 tweets: saved 300
processed 400 tweets: saved 400
processed 500 tweets: saved 500
processed 600 tweets: saved 600
processed 700 tweets: saved 700
processed 800 tweets: saved 800
processed 900 tweets: saved 900
processed 1000 tweets: saved 1000
Harvard
processed 100 tweets: saved 100
processed 200 tweets: saved 200
processed 300 tweets: saved 300
processed 400 tweets: saved 400
processed 500 tweets: saved 500
processed 600 tweets: saved 600
processed 700 tweets: saved 700
processed 800 tweets: saved 800
processed 900 tweets: saved 900
processed 1000 tweets: saved 1000
Stanford
processed 100 tweets: saved 100
processed 200 tweets: saved 200
processed 300 tweets: saved 300
processed 400 tweets: saved 400
processed 500 tweets: saved 500
processed 600 tweets: saved 600
processed 700 tweets: saved 700
processed 800 tweets: saved 800
processed 900 tweets: saved 900
processed 1000 tweets: saved 1000


In [6]:
dt = pd.read_json("tweets-lda.jsonl", lines=True)


def _hashtag_tags(raw):
    """Return the tag strings for one tweet's ``entities.hashtags`` value.

    ``raw`` is a list of hashtag dicts (each with a ``"tag"`` key) when the
    tweet has hashtags. Any non-list value (None, NaN) means the tweet has
    none and yields an empty list. Entries that are already plain strings are
    passed through, so re-running the cell does not fail.
    """
    if not isinstance(raw, list):
        return []
    return [h["tag"] if isinstance(h, dict) else h for h in raw]


# Drop unneeded metadata and keep just the list of tags; the same approach works
# for other list-like fields.
dt["entities.hashtags"] = dt["entities.hashtags"].apply(_hashtag_tags)
dt

Unnamed: 0,text,edit_history_tweet_ids,id,created_at,author_id,topic,entities.mentions,entities.annotations,public_metrics.retweet_count,public_metrics.reply_count,...,author.id,author.name,author.public_metrics.followers_count,author.public_metrics.following_count,author.public_metrics.tweet_count,author.public_metrics.listed_count,author.username,entities.urls,entities.hashtags,entities.cashtags
0,@RSIRob @scs_real He’s at Virginia Tech now. O...,[1637901189865177096],1637901189865177088,2023-03-20 19:37:29+00:00,1391907070031630336,"""Virginia Tech""","[{'start': 0, 'end': 7, 'username': 'RSIRob', ...","[{'start': 26, 'end': 38, 'probability': 0.979...",0,0,...,1391907070031630336,Grant Montgomery 2.0 🇺🇸🇮🇱🇯🇵,4508,4510,10791,1,monsgomeric,,[],
1,Virginia Tech women's basketball advances to t...,[1637899898627448833],1637899898627448832,2023-03-20 19:32:21+00:00,45960001,"""Virginia Tech""",,"[{'start': 0, 'end': 18, 'probability': 0.8393...",1,0,...,45960001,Collegiate Times,20332,327,18488,448,CollegiateTimes,"[{'start': 112, 'end': 135, 'url': 'https://t....",[],
2,The NCAA Women's Basketball Tournament banned ...,[1637899418279059458],1637899418279059456,2023-03-20 19:30:26+00:00,237286926,"""Virginia Tech""",,"[{'start': 4, 'end': 37, 'probability': 0.5539...",0,0,...,237286926,96.3 WROV,896,529,16209,9,ROVRocks,"[{'start': 136, 'end': 159, 'url': 'https://t....",[],
3,DaVinci Bead Virginia Tech – Jewelry Bracelet ...,[1637899291661598726],1637899291661598720,2023-03-20 19:29:56+00:00,3238156627,"""Virginia Tech""",,,0,0,...,3238156627,Wear Your Own Techs,3684,3768,139373,36,wyot23,"[{'start': 104, 'end': 127, 'url': 'https://t....",[],
4,Virginia Tech Fans Sing Metallica’s ‘Enter San...,[1637898477525950464],1637898477525950464,2023-03-20 19:26:42+00:00,43113373,"""Virginia Tech""",,"[{'start': 0, 'end': 12, 'probability': 0.912,...",0,0,...,43113373,WCYY,3996,679,17159,88,WCYY,"[{'start': 84, 'end': 107, 'url': 'https://t.c...",[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,@mikepompeo You think Stanford has rednecks th...,[1637881583066652672],1637881583066652672,2023-03-20 18:19:34+00:00,1256022482643046400,Stanford,"[{'start': 0, 'end': 11, 'username': 'mikepomp...","[{'start': 22, 'end': 29, 'probability': 0.731...",0,0,...,1256022482643046400,JT🌊,260,592,492,1,JThappadude,,[],
2996,#空放 #天津 #兰州 #大连 Stanford Brown Walter Oni...,[1637881571901595648],1637881571901595648,2023-03-20 18:19:31+00:00,2714020032,Stanford,,,0,0,...,2714020032,同城 约 啪 ·上海深圳成都苏州无锡南通盐城常州南京佛山东莞广州武汉青岛济南济宁潍坊温州老九...,0,0,490,0,wessfd,"[{'start': 63, 'end': 86, 'url': 'https://t.co...","[空放, 天津, 兰州, 大连]",
2997,@ChrisCoble @RichardGrenell Can keep sending D...,[1637881523595558914],1637881523595558912,2023-03-20 18:19:20+00:00,1179965994346917888,Stanford,"[{'start': 0, 'end': 11, 'username': 'ChrisCob...","[{'start': 167, 'end': 178, 'probability': 0.5...",0,0,...,1179965994346917888,United & Indivisable,74,396,12606,1,Happyboston1,"[{'start': 218, 'end': 241, 'url': 'https://t....",[],
2998,@KeenanPeachy I’m going to tell my kid to writ...,[1637881455861702672],1637881455861702656,2023-03-20 18:19:04+00:00,3956516777,Stanford,"[{'start': 0, 'end': 13, 'username': 'KeenanPe...","[{'start': 162, 'end': 169, 'probability': 0.8...",0,0,...,3956516777,Hossein Khorashadi,19,185,3436,1,hkhorashadi1,,[],


In [7]:
# Preview ten randomly chosen tweets, restricted to a handful of columns.
dt.loc[:, [
    'created_at',
    'text',
    'entities.hashtags',
    'author.username',
    'author.public_metrics.followers_count',
    'topic',
]].sample(10)

Unnamed: 0,created_at,text,entities.hashtags,author.username,author.public_metrics.followers_count,topic
1064,2023-03-20 19:25:55+00:00,@INTPhilosopher @emrazz Harvard is asking the ...,[],79Merlot,344,Harvard
2534,2023-03-20 18:59:53+00:00,Rory Margaret Caroline Tony #长沙资源 #长沙 Stanford...,"[长沙资源, 长沙]",ChirafisiSummer,0,Stanford
1438,2023-03-20 17:47:06+00:00,"Ron graduated from Yale 💀🦴 &amp; Harvard, and ...",[],dade_only,55,Harvard
2679,2023-03-20 18:48:35+00:00,Bridget Bob Gale Ruskin Freda Grant Stanford M...,[布里斯托],josiahpuglas_2,0,Stanford
346,2023-03-19 23:22:48+00:00,Winner of Toledo-Lady Vols on Monday night wil...,[],TeresaMWalker,14856,"""Virginia Tech"""
2209,2023-03-20 19:24:36+00:00,@FNCOriginals @FoxNews No judges should accept...,[],cyberiano42,2,Stanford
605,2023-03-19 14:51:14+00:00,West Virginia and Virginia Tech are the latest...,[],jakeweingarten,34510,"""Virginia Tech"""
14,2023-03-20 18:53:01+00:00,Merrimack transfer Ziggy Reid tells TPR that h...,[],ThePortalReport,10086,"""Virginia Tech"""
2214,2023-03-20 19:24:17+00:00,Cedric Child Muriel Sheridan #青岛资源 #青岛 Stanfor...,"[青岛资源, 青岛]",CottinghamRoss,0,Stanford
1338,2023-03-20 18:15:03+00:00,With the Big Ten and PWR as tight as they were...,[],YostBuilt,3260,Harvard


In [8]:
# Full text of the tweet whose index label is 1.
dt.loc[1, "text"]

"Virginia Tech women's basketball advances to the Sweet 16 of the NCAA Tournament for the first time since 1999.\nhttps://t.co/mi4tyVYRPI"

# Processing only the text of the tweets

In [9]:
# Pull the raw tweet texts out of the frame as an array of documents.
all_docs = dt.loc[:, 'text'].values
print(all_docs[0])

@RSIRob @scs_real He’s at Virginia Tech now. Ohio State didn’t work out.


# Word Tokenization using TweetTokenizer
This tokenizer is customized for tokenizing tweet data. Try using a different tokenizer to see how the result of your LDA will change.

In [10]:
import string

# Characters to strip: the ASCII punctuation set.
exclude = set(string.punctuation)
# Translation table mapping each excluded character to None, i.e. deleting it.
strip_punct = str.maketrans(dict.fromkeys(map(ord, exclude)))
tokenizer = TweetTokenizer()
# For each tweet: lowercase, tokenize, re-join with spaces, delete punctuation
# characters, then split on whitespace to get the final token list.
tokenized = [
    ' '.join(tokenizer.tokenize(doc.lower())).translate(strip_punct).split()
    for doc in all_docs
]
print(tokenized[1])

['virginia', 'tech', 'womens', 'basketball', 'advances', 'to', 'the', 'sweet', '16', 'of', 'the', 'ncaa', 'tournament', 'for', 'the', 'first', 'time', 'since', '1999', 'httpstcomi4tyvyrpi']


# Stop word removal
Certain parts of English speech, like prepositions and conjunctions ("in", "for", "and"), carry no meaning for a topic model. These terms are called stop words and need to be removed from our token list.

In [11]:
# English stop words plus tweet-specific extras: 'rt' and the ellipsis character.
sws = set(stopwords.words('english')) | {'rt', "…"}


def _is_content_token(token):
    """True when the token is not a stop word, is longer than 2 characters and is not a link."""
    return token not in sws and len(token) > 2 and not token.startswith("http")


# One filtered token list per tweet, in the same order as `tokenized`.
sws_removed = [
    [token for token in sent if _is_content_token(token)]
    for sent in tokenized
]
print(sws_removed[1])

['virginia', 'tech', 'womens', 'basketball', 'advances', 'sweet', 'ncaa', 'tournament', 'first', 'time', 'since', '1999']


# Gensim Library
The result of our cleaning stage is, for each tweet, a tokenized list of words with the stop words removed. We looped through all our documents and appended each cleaned list to our sws_removed variable. So now sws_removed is a list of lists, one list for each of our original tweets.

To generate an LDA model, we need to understand how frequently each term occurs within each document. To do that, we need to construct a document-term matrix with a package called gensim:

In [12]:
from gensim import corpora, models
# Build the vocabulary: every distinct token in the cleaned tweets gets an integer id.
dictionary = corpora.Dictionary(sws_removed)
# Prune the vocabulary. Per the gensim docs, no_below=5 drops tokens that appear in
# fewer than 5 documents and no_above=0.3 drops tokens that appear in more than 30%
# of the documents.
dictionary.filter_extremes(no_below=5, no_above=0.3)
# Per the gensim docs, reassigns word ids so there are no gaps left after pruning.
dictionary.compactify()

The Dictionary() function traverses data, assigning a unique integer id to each unique token while also collecting word counts and relevant statistics. To see each token's unique integer id, try print(dictionary.token2id).

Next, each tokenized tweet must be converted into a bag-of-words using our dictionary:

In [13]:
# Convert every cleaned tweet into its bag-of-words vector, preserving tweet order.
corpus = list(map(dictionary.doc2bow, sws_removed))

The doc2bow() function converts a tokenized document into a bag-of-words using the dictionary. The result, corpus, is a list of vectors, one per tweet. Each tweet vector is a series of tuples. As an example, print(corpus[1]) results in the following:

In [14]:
# Bag-of-words vector of the tweet at index 1 (the second tweet).
print(corpus[1])

[(2, 1), (3, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)]


This list of tuples represents our second tweet (index 1). The tuples are (term ID, term frequency) pairs. doc2bow() only includes terms that actually occur: terms that do not occur in a tweet will not appear in that tweet's vector.

# LDA model
corpus is a document-term matrix and now we are ready to generate an LDA model:

In [15]:
# Train a 3-topic LDA model. random_state fixes the seed so the topics are
# reproducible; without it each run starts from a different random
# initialisation and produces different topics.
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=20, random_state=42)

The LdaModel class is described in detail in the gensim documentation.
Parameters used in our example:
num_topics: required. An LDA model requires the user to determine how many topics should be generated. Our document set is small, so we’re only asking for three topics.
id2word: required. The LdaModel class requires our previous dictionary to map ids to strings.
passes: optional. The number of laps the model will take through corpus. The greater the number of passes, the more accurate the model will be. A lot of passes can be slow on a very large corpus.

# Examining the results
Our LDA model is now stored as ldamodel. We can review our topics with the print_topic and print_topics methods:

In [16]:
# Request exactly as many topics as the model has, with the five most probable
# words for each, rather than a hard-coded count larger than the model's.
ldamodel.print_topics(num_topics=ldamodel.num_topics, num_words=5)

[(0,
  '0.031*"tech" + 0.031*"virginia" + 0.019*"state" + 0.009*"new" + 0.007*"college"'),
 (1,
  '0.043*"virginia" + 0.043*"tech" + 0.013*"game" + 0.012*"ncaa" + 0.009*"hokies"'),
 (2,
  '0.048*"harvard" + 0.013*"law" + 0.009*"brown" + 0.009*"like" + 0.008*"school"')]

What does this mean? Each generated topic is separated by a comma. Within each topic are the five most probable words to appear in that topic. Even though our document set is small the model is reasonable.

In [17]:
# Inspect every 500th tweet: its topic mixture (most probable topic first)
# followed by the cleaned tokens the model saw.
for doc_idx in range(0, len(corpus), 500):
    ranked = sorted(
        ldamodel.get_document_topics(corpus[doc_idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    cleaned_text = " ".join(sws_removed[doc_idx])
    print("{}: {}\n".format(ranked, cleaned_text))

[(0, 0.71576947), (2, 0.21987227), (1, 0.064358294)]: rsirob scsreal virginia tech ohio state work

[(1, 0.48604637), (0, 0.42221296), (2, 0.09174066)]: elizabeth kitley currently tied ieva kublina hokies alltime blocks leader 256 needs one break virginia tech 551 nonconference foes cassell coliseum kenny brooks techs seeking secondever sweet berth 1999

[(1, 0.7421453), (2, 0.18004228), (0, 0.077812426)]: harvard grime release pro carpet cleaning prespray traffic lane cleaner case ebay

[(2, 0.9382218), (0, 0.031068841), (1, 0.030709373)]: excellent possible explanation desantiss actions statement regarding president trump would navy jag harvard guy ron parttime bona fide fed

[(1, 0.9030132), (2, 0.04933226), (0, 0.047654513)]: stanford arthur vito haydn 青岛资源 aaron london

[(0, 0.903958), (2, 0.048402734), (1, 0.047639202)]: louise harrison stanford smith 成都旅游 raymond grote



Now let's try with two topics:

In [18]:
# Retrain with two topics. random_state fixes the seed so the topics are
# reproducible; without it each run produces different topics.
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20, random_state=42)

In [19]:
# Show both topics of the two-topic model with the five most probable words for each.
ldamodel.print_topics(num_topics=2, num_words=5)

[(0,
  '0.038*"harvard" + 0.012*"law" + 0.009*"school" + 0.008*"university" + 0.007*"like"'),
 (1,
  '0.049*"virginia" + 0.049*"tech" + 0.014*"state" + 0.011*"ncaa" + 0.011*"hokies"')]