In [1]:
import pandas as pd
import tweepy
import json

In [2]:
theme = 'WorldCup'

In [3]:
with open("credentials.json") as f:
    credentials = json.load(f)

In [4]:
client = tweepy.Client(bearer_token=credentials['BEARER_TOKEN'])

In [5]:
tweets_df = pd.DataFrame()
users_df = pd.DataFrame()

In [6]:
queries = ['#QatarWorldCup2022', '#Qatar2022', '#FIFAWorldCup']

In [7]:
indices = pd.read_csv(f'./data/{theme}/tweets.csv', usecols=['id'])

In [8]:
params = {
    'max_results': 100,
    'limit': 200,
    'tweet_fields': ['id', 'text', 'public_metrics', 'author_id', 'created_at', 'lang'],
    'user_fields': ['username', 'created_at', 'public_metrics', 'protected', 'verified'],
    'expansions': ['author_id', 'entities.mentions.username'],
    'since_id': indices.max()
}

In [9]:
tweet_list = list()
users_list = list()

In [10]:
for query in queries:
    try:
        for chunk in tweepy.Paginator(client.search_recent_tweets, query=f'{query} -is:retweet', **params):
            for tweets in chunk.data:
                tweet = [query,
                         tweets.id,
                         tweets.author_id,
                         tweets.text,
                         tweets.public_metrics['like_count'],
                         tweets.public_metrics['reply_count'],
                         tweets.public_metrics['retweet_count'],
                         tweets.public_metrics['quote_count'],
                         tweets.created_at,
                         tweets.lang]

                mentions = None
                if tweets.entities is not None:
                    mentions = [mention['id'] for mention in tweets.entities['mentions']]
                tweet.append(mentions)

                tweet_list.append(tweet)

            for user in chunk.includes['users']:
                users_list.append([user.id,
                                   user.username,
                                   user.created_at,
                                   user.public_metrics['followers_count'],
                                   user.public_metrics['following_count'],
                                   user.public_metrics['tweet_count'],
                                   user.protected,
                                   user.verified])

        temp_tweets = pd.DataFrame(tweet_list,
                                   columns=['hashtag', 'id', 'author_id', 'text', 'like_count', 'reply_count',
                                            'retweet_count',
                                            'quote_count',
                                            'created_at', 'lang', 'mentions'])
        temp_users = pd.DataFrame(users_list,
                                  columns=['id', 'username', 'created_at', 'followers_count', 'following_count',
                                           'tweet_count', 'protected', 'verified'])

        tweets_df = pd.concat([tweets_df, temp_tweets])
        users_df = pd.concat([users_df, temp_users])
    except tweepy.TooManyRequests:
        print('WARNING: Tweet download stopped due to TooManyRequest exception. Wait 15 minutes...')
        break



In [11]:
tweets_df

Unnamed: 0,hashtag,id,author_id,text,like_count,reply_count,retweet_count,quote_count,created_at,lang,mentions
0,#QatarWorldCup2022,1596459905355218950,144659936,@GhanaBlackstars need to win the next 6 games ...,0,0,0,0,2022-11-26 11:04:36+00:00,en,[1444999898429075456]
1,#QatarWorldCup2022,1596459839676289025,1046295976468611073,@ChaifullyYours Australia\n\n@ChaifullyYours #...,0,0,0,0,2022-11-26 11:04:21+00:00,en,"[779652781397291009, 779652781397291009, 13578..."
2,#QatarWorldCup2022,1596459832286105600,1544762824894939136,@k_i_j99 السعوديه 🇸🇦١\nبولندا 🇵🇱 0\nمااعرف لعي...,0,0,0,0,2022-11-26 11:04:19+00:00,ar,[371234863]
3,#QatarWorldCup2022,1596459821129404416,2171865272,Let's predict the result of #BELMAR ‼︎\n\n#FIF...,0,0,0,0,2022-11-26 11:04:16+00:00,en,
4,#QatarWorldCup2022,1596459760085536769,1596207239060152321,بتوفيق بإذن لله \nللّجار الشقيق والعزيز علي قل...,0,0,0,0,2022-11-26 11:04:02+00:00,ar,
...,...,...,...,...,...,...,...,...,...,...,...
39472,#Qatar2022,1596319413795684352,1579998556026724352,#Qatar2022 #FIFAWorldCup #FIFAWorldCup2022 #Wo...,0,0,0,0,2022-11-26 01:46:21+00:00,qht,
39473,#Qatar2022,1596319399707029504,1568351925891899397,Mañana juega ARGENTINAA \n#Qatar2022,0,0,0,0,2022-11-26 01:46:17+00:00,es,
39474,#Qatar2022,1596319397924458497,920379223881154561,Así fue el rendimiento de @sujenoel durante el...,2,0,0,0,2022-11-26 01:46:17+00:00,es,[139214841]
39475,#Qatar2022,1596319395772764160,1319577175679365120,"@FoxNews On November 17, Luc Despins took the ...",0,0,0,0,2022-11-26 01:46:16+00:00,en,[1367531]


In [12]:
users_df

Unnamed: 0,id,username,created_at,followers_count,following_count,tweet_count,protected,verified
0,144659936,KwakuAkyeampong,2010-05-16 23:53:27+00:00,385,4983,11296,False,False
1,1444999898429075456,GhanaBlackstars,2021-10-04 12:17:06+00:00,247959,120,1857,False,True
2,1046295976468611073,itsazeena,2018-09-30 07:09:34+00:00,2216,2029,63711,False,False
3,779652781397291009,ChaifullyYours,2016-09-24 12:04:43+00:00,1816,12,900,False,False
4,1357899954690269186,Rubalsandhu5,2021-02-06 03:52:44+00:00,0,1,0,False,False
...,...,...,...,...,...,...,...,...
44260,138994780,JoaquinGPeralta,2010-05-01 06:49:10+00:00,207,148,9180,False,False
44261,139214841,sujenoel,2010-05-01 22:56:06+00:00,1308,268,536,False,False
44262,1367531,FoxNews,2007-03-17 19:01:26+00:00,23138070,262,510907,False,True
44263,499044422,QuintanaRooHoy1,2012-02-21 18:21:57+00:00,16063,2277,158670,False,False


In [13]:
tweets_df.to_csv(f'./data/{theme}/tweets.csv', mode='a', index=False, header=False)
users_df.to_csv(f'./data/{theme}/users.csv', mode='a', index=False, header=False)