In [1]:
import pandas as pd
import tweepy
import json
import os.path

In [3]:
# Read the API credentials from the local JSON file
with open("credentials.json") as credentials_file:
    credentials = json.load(credentials_file)

In [4]:
# Build the tweepy client, authenticating with the bearer token from the credentials file
bearer_token = credentials['BEARER_TOKEN']
client = tweepy.Client(bearer_token=bearer_token)

In [5]:
# Start with two empty dataframes: one accumulates tweets, the other their users
tweets_df, users_df = pd.DataFrame(), pd.DataFrame()

In [6]:
# Hashtags to search for, one search query per entry
queries = [
    '#QatarWorldCup2022',
    '#Qatar2022',
    '#FIFAWorldCup',
]

In [7]:
# Load only the 'id' column of the tweets collected so far
# (NOTE: reading fails if ./data/tweets.csv does not exist yet)
ids = pd.read_csv(
    './data/tweets.csv',
    usecols=['id'],
)

In [8]:
# Search parameters
# Largest tweet id collected so far. Select the 'id' column explicitly so the
# result is a scalar: calling .max() on the whole dataframe yields a Series.
last_collected_id = ids['id'].max()

params = {
    'max_results': 100,  # tweets requested per page
    'limit': 200,        # passed on to tweepy.Paginator together with the other entries
    'tweet_fields': ['id', 'text', 'public_metrics', 'author_id', 'created_at', 'lang'],
    'user_fields': ['username', 'created_at', 'public_metrics', 'protected', 'verified'],
    'expansions': ['author_id', 'entities.mentions.username'],  # expansions needed to collect tweets' author details
}

# Restrict the search to tweets newer than the last collected one. When no id
# is stored yet the maximum is NaN, so the restriction is left out entirely
# instead of sending an invalid value.
if pd.notna(last_collected_id):
    params['since_id'] = int(last_collected_id)

In [9]:
import time

In [10]:
# Collecting tweets for each given hashtag
def _tweet_row(query, tweet):
    """Flatten one tweet into a row matching the tweets dataframe columns.

    The last element is the list of mentioned users' ids, or None when the
    tweet carries no mention entities.
    """
    entities = tweet.entities
    mentions = None
    # entities may be missing altogether, or present without a 'mentions' key
    if entities is not None and 'mentions' in entities:
        mentions = [mention['id'] for mention in entities['mentions']]

    metrics = tweet.public_metrics
    return [query,
            tweet.id,
            tweet.author_id,
            tweet.text,
            metrics['like_count'],
            metrics['reply_count'],
            metrics['retweet_count'],
            metrics['quote_count'],
            tweet.created_at,
            tweet.lang,
            mentions]


def _user_row(user):
    """Flatten one user into a row matching the users dataframe columns."""
    metrics = user.public_metrics
    return [user.id,
            user.username,
            user.created_at,
            metrics['followers_count'],
            metrics['following_count'],
            metrics['tweet_count'],
            user.protected,
            user.verified]


for query in queries:
    # Rows gathered for this hashtag. Created outside the try block so that
    # pages fetched before a rate-limit error are still stored below.
    tweet_list = []
    users_list = []
    rate_limited = False

    try:
        # Getting tweets for a given hashtag without retweets
        for chunk in tweepy.Paginator(client.search_recent_tweets, query=f'{query} -is:retweet', **params):
            # chunk.data is None when a page contains no tweets
            for tweet in chunk.data or []:
                tweet_list.append(_tweet_row(query, tweet))

            # The 'users' key is absent when the page includes no expanded users
            # (assumes chunk.includes is a dict, as in tweepy's Response)
            for user in chunk.includes.get('users', []):
                users_list.append(_user_row(user))

    except tweepy.TooManyRequests:
        print('WARNING: Tweet download stopped due to TooManyRequest exception. Wait 15 minutes...')
        rate_limited = True

    # Creating temporary dataframe out of collected tweets
    temp_tweets = pd.DataFrame(tweet_list,
                               columns=['hashtag', 'id', 'author_id', 'text', 'like_count', 'reply_count',
                                        'retweet_count', 'quote_count', 'created_at', 'lang', 'mentions'])

    # Creating temporary dataframe out of collected users
    temp_users = pd.DataFrame(users_list,
                              columns=['id', 'username', 'created_at', 'followers_count', 'following_count',
                                       'tweet_count', 'protected', 'verified'])

    # Concatenating temporary dataframes with final ones
    tweets_df = pd.concat([tweets_df, temp_tweets])
    users_df = pd.concat([users_df, temp_users])

    # Wait only after the partial results are stored, so an interrupted wait loses nothing
    if rate_limited:
        time.sleep(15 * 60)



In [11]:
# Displaying collected tweets: the bare expression on the last line makes the
# notebook render the dataframe as this cell's output
tweets_df

Unnamed: 0,hashtag,id,author_id,text,like_count,reply_count,retweet_count,quote_count,created_at,lang,mentions
0,#QatarWorldCup2022,1605584655431892995,1447603056359714817,#Deportes | El mundial de Qatar 2022 se carac...,1,0,0,0,2022-12-21 15:23:06+00:00,es,
1,#QatarWorldCup2022,1605584631612329985,1485652252002066437,Hadji: « La FRMF a fait le bon choix en parian...,0,0,0,0,2022-12-21 15:23:01+00:00,fr,
2,#QatarWorldCup2022,1605584449877549058,1447604020462370822,#Deportes | El mundial de Qatar 2022 se carac...,0,0,0,0,2022-12-21 15:22:17+00:00,es,
3,#QatarWorldCup2022,1605584342943776768,1055130038092345344,#Deportes | El mundial de Qatar 2022 se carac...,0,0,0,0,2022-12-21 15:21:52+00:00,es,
4,#QatarWorldCup2022,1605583974289375232,191106942,"📢 Popularité du couple exécutif, #QatarWorldCu...",0,0,0,0,2022-12-21 15:20:24+00:00,fr,"[296824934, 34294667]"
...,...,...,...,...,...,...,...,...,...,...,...
19955,#Qatar2022,1605037030064685057,1530154136,代表チームの強さと人口は無関係である\n#FIFAWorldCup #Qatar2022 #...,7,0,1,1,2022-12-20 03:07:02+00:00,ja,
19956,#Qatar2022,1605037015233310720,1363653333479788549,Argentine dairy industry #DitchDairy @dairy_tr...,0,0,0,0,2022-12-20 03:06:59+00:00,en,"[1370074650038910983, 1246189682150367233]"
19957,#Qatar2022,1605036970660462602,1522221998151061509,This time @TeamMessi Argentina ⚽❤✌ \n\n 📕 #Dex...,0,0,0,0,2022-12-20 03:06:48+00:00,en,[1058376110]
19958,#Qatar2022,1605036968588484609,1504760689650552863,The real GOAT 🐐 congrats LEO @TeamMessi😭🇦🇷 \n\...,0,0,0,0,2022-12-20 03:06:48+00:00,en,[1058376110]


In [12]:
# Displaying collected users: the bare expression on the last line makes the
# notebook render the dataframe as this cell's output
users_df

Unnamed: 0,id,username,created_at,followers_count,following_count,tweet_count,protected,verified
0,1447603056359714817,24horasYucatan,2021-10-11 16:40:53+00:00,3819,149,15217,False,False
1,1485652252002066437,MarocV_Com,2022-01-24 16:34:46+00:00,320,3,7737,False,False
2,1447604020462370822,24horasCampeche,2021-10-11 16:44:38+00:00,2473,110,13930,False,False
3,1055130038092345344,24horasqroo,2018-10-24 16:12:59+00:00,6364,1187,76743,False,False
4,191106942,BVA_France,2010-09-15 16:36:24+00:00,17606,605,7295,False,False
...,...,...,...,...,...,...,...,...
24738,1363653333479788549,Gabrielagladys5,2021-02-22 00:54:46+00:00,40,108,7815,False,False
24739,1370074650038910983,dairy_truth,2021-03-11 18:10:43+00:00,1278,428,3462,False,False
24740,1246189682150367233,MilkedFilm,2020-04-03 21:36:08+00:00,793,52,111,False,False
24741,1522221998151061509,KishanSadhashiv,2022-05-05 14:31:08+00:00,2,18,163,False,False


In [13]:
# Saving collected data
# Make sure the target directory exists before writing into it
os.makedirs('./data', exist_ok=True)

for frame, path in ((tweets_df, './data/tweets.csv'), (users_df, './data/users.csv')):
    # Decide per file whether to append: a file that is missing or empty gets
    # created with a header row, an existing one only receives the new rows.
    # Checking each file separately keeps users.csv correct even when only
    # tweets.csv is already present (or the other way round).
    has_content = os.path.exists(path) and os.path.getsize(path) > 0
    frame.to_csv(path, mode='a' if has_content else 'w', index=False, header=not has_content)