# Notebook to scrape tweets containing specific keywords

This notebook will ... . Running this notebook requires an internet connection and a twitter developer account.

Note that this must be run locally using the dev version of snscrape as found in environment.yml (running via a local runtime on google colab will not work)

To run jupyter notebook using a particular environment do in the terminal: 

<code>conda install -c anaconda ipykernel<code>
<code>python -m ipykernel install --user --name=env name<code>
    
Then select the environment from the kernel dropdown menu

In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
from tqdm import tqdm

In [2]:
# Just pandas display options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


Tweets are collected from 23:59 on the date in question until the number of tweets to download is reached. It is best then to get the same number of tweets each day. Note that there are **A LOT** of tweets about Ukraine now (obviously); about 10 per second even at midnight (in English)! Assuming this rate of tweeting is constant, which it probably isn't, we would have about 864000 tweets per day. 

In [3]:
# A function to create a pair of lists for days to search
def create_date_lists(since_initial, until_final):
    """
    Creates a a pair of lists for since and until dates of the form
    since_list = ['2022-02-24', '2022-02-25', '2022-02-26']
    until_list = ['2022-02-25', '2022-02-26', '2022-02-27']
    """
    import datetime

    diff = datetime.datetime.strptime(until_final, "%Y-%m-%d") - datetime.datetime.strptime(since_initial, "%Y-%m-%d")
    diff = diff.days

    since_initial_datetime = datetime.datetime.strptime(since_initial, "%Y-%m-%d")
    until_initial_datetime = since_initial_datetime + datetime.timedelta(days=1)
    until_initial_datetime = datetime.datetime.strftime(until_initial_datetime, "%Y-%m-%d")

    diff = datetime.datetime.strptime(until_final, "%Y-%m-%d") - datetime.datetime.strptime(since_initial, "%Y-%m-%d")
    diff = diff.days

    since_list = []
    until_list = []
    for day in range(diff):
        since_plus_day = since_initial_datetime + datetime.timedelta(days=day)
        since_plus_day = datetime.datetime.strftime(since_plus_day, "%Y-%m-%d")
        since_list.append(since_plus_day)

        until_plus_day = since_initial_datetime + datetime.timedelta(days=day+1)
        until_plus_day = datetime.datetime.strftime(until_plus_day, "%Y-%m-%d")
        until_list.append(until_plus_day)
        
    return since_list, until_list

Now we can download tweets and put them into a dataframe. We can download tweets in English, Russian and Ukrainian, here for 'ukraine', 'украина' (Ukraine in Russian) or 'україни' (Ukraine in Ukrainian).

In [25]:
# Each word seperated by a space is treated as a unique keyword here
keywords_en = 'mcdonalds russia' 
keywords_ru = 'макдональдс россия'
keywords_uk = 'макдональдс росія'

# We can get a number of tweets from each day 
num_tweets = 5000 # Per day
start_date = '2022-02-24'
end_date = '2022-03-17'

# Use our function to get a pair of lists for each day
since_list, until_list = create_date_lists(start_date, end_date)

def scrape_tweets_by_keywords(keyword, lang, since_list, until_list):
    # Loop through each since and until date in the since and until lists, create a dataframe for each day and stitch them together
    tweets_lang_df = pd.DataFrame(columns=['Datetime', 'Tweet Id', 'Text', 'Username', 'Like Count', 'Display Name', 'Language'])
    for day, (since,until) in enumerate(list(zip(since_list, until_list))):
        print("Day:", day)
        tweets_list = []
        for i,tweet in enumerate(tqdm(sntwitter.TwitterSearchScraper(keyword + ' since:' + since + ' until:' + until + ' lang:' + lang).get_items())):
            if i > num_tweets:
                break
            tweets_list.append([tweet.date, tweet.id, tweet.content, tweet.user.username, tweet.likeCount, tweet.user.displayname, tweet.lang])

        # Creating a dataframe from the tweets list above
        tweets_lang_day_df = pd.DataFrame(tweets_list, columns=['Datetime', 'Tweet Id', 'Text', 'Username', 'Like Count', 'Display Name', 'Language'])
        
        # Stitch the daily dataframes together
        tweets_lang_df = pd.concat([tweets_lang_df, tweets_lang_day_df])
        tweets_lang_df.reset_index(drop=True, inplace=True)
        
    return tweets_lang_df 


tweets_en_df = scrape_tweets_by_keywords(keywords_en, 'en', since_list, until_list)
tweets_ru_df = scrape_tweets_by_keywords(keywords_ru, 'ru', since_list, until_list)
tweets_uk_df = scrape_tweets_by_keywords(keywords_uk, 'uk', since_list, until_list)
tweets_en_df.head(1000)

Day: 0


137it [00:13, 10.10it/s]


Day: 1


105it [00:19,  5.32it/s]


Day: 2


94it [00:17,  5.25it/s]


Day: 3


111it [00:07, 14.32it/s]


Day: 4


96it [00:08, 10.77it/s]


Day: 5


186it [00:18,  9.93it/s]


Day: 6


241it [00:13, 18.17it/s]


Day: 7


294it [00:15, 18.39it/s]


Day: 8


1348it [00:51, 25.96it/s]


Day: 9


2171it [01:30, 23.94it/s]


Day: 10


2202it [01:24, 26.11it/s]


Day: 11


1907it [01:05, 29.29it/s]


Day: 12


5001it [02:54, 28.59it/s]


Day: 13


2313it [01:21, 28.45it/s]


Day: 14


909it [00:34, 26.50it/s]


Day: 15


556it [00:19, 28.06it/s]


Day: 16


328it [00:12, 25.86it/s]


Day: 17


323it [00:12, 26.17it/s]


Day: 18


326it [00:12, 25.20it/s]


Day: 19


224it [00:08, 25.44it/s]


Day: 20


233it [00:08, 27.73it/s]


Day: 0


0it [00:01, ?it/s]


Day: 1


0it [00:01, ?it/s]


Day: 2


0it [00:01, ?it/s]


Day: 3


1it [00:01,  1.22s/it]


Day: 4


0it [00:01, ?it/s]


Day: 5


1it [00:01,  1.49s/it]


Day: 6


0it [00:01, ?it/s]


Day: 7


0it [00:01, ?it/s]


Day: 8


0it [00:01, ?it/s]


Day: 9


1it [00:01,  1.31s/it]


Day: 10


0it [00:01, ?it/s]


Day: 11


0it [00:01, ?it/s]


Day: 12


3it [00:01,  1.84it/s]


Day: 13


10it [00:01,  6.69it/s]


Day: 14


1it [00:01,  1.73s/it]


Day: 15


0it [00:01, ?it/s]


Day: 16


0it [00:01, ?it/s]


Day: 17


2it [00:01,  1.34it/s]


Day: 18


0it [00:01, ?it/s]


Day: 19


2it [00:01,  1.45it/s]


Day: 20


0it [00:01, ?it/s]


Day: 0


0it [00:01, ?it/s]


Day: 1


0it [00:01, ?it/s]


Day: 2


0it [00:01, ?it/s]


Day: 3


0it [00:01, ?it/s]


Day: 4


0it [00:01, ?it/s]


Day: 5


0it [00:01, ?it/s]


Day: 6


0it [00:01, ?it/s]


Day: 7


0it [00:01, ?it/s]


Day: 8


0it [00:01, ?it/s]


Day: 9


0it [00:01, ?it/s]


Day: 10


0it [00:01, ?it/s]


Day: 11


1it [00:01,  1.45s/it]


Day: 12


1it [00:01,  1.67s/it]


Day: 13


1it [00:01,  1.64s/it]


Day: 14


1it [00:01,  1.28s/it]


Day: 15


0it [00:01, ?it/s]


Day: 16


0it [00:01, ?it/s]


Day: 17


0it [00:01, ?it/s]


Day: 18


0it [00:02, ?it/s]


Day: 19


0it [00:01, ?it/s]


Day: 20


0it [00:01, ?it/s]


Unnamed: 0,Datetime,Tweet Id,Text,Username,Like Count,Display Name,Language
0,2022-02-24 23:50:15+00:00,1496995923376762929,"There are over 850 McDonalds locations in Russia, and Russia is getting sanctioned. How does McDonalds - an American company - legally opportune in a country under these sanctions? Sounds like they just lost @McDonaldsCorp https://t.co/A5WwPfh3Gc",honestduane,1,(((Duane))) - 🧙‍♂️🖖🦁,en
1,2022-02-24 23:10:35+00:00,1496985942456139777,#StandWithUkraine sanctions of Russia that do not cut off the sale of gas and oil are the same as ending the sale of salads at #McDonalds's. This is their economy and they can be put to their knee.,PZacsek,0,ambiguousBIG,en
2,2022-02-24 23:00:44+00:00,1496983462263328774,"@mrvintageestate For the first time, two countries with McDonalds in them have gone to war.\n\nI can't believe that's a legitimate sentence.\n\n#ukraine #russia #ukrainerussiaconflict https://t.co/IEl0cvkPUw",Wakato_owo,5,"Wakato, Mlarmch!!",en
3,2022-02-24 22:59:41+00:00,1496983198978519050,@Cisco @Apple @Microsoft @MDLZ @PepsiCo @McDonalds @Boeing @fordm @exxonmobil \nGet out of Russia. https://t.co/qyb1hROGWg,KaMac76,0,🌍KaterinaVonTramp😷🌊,en
4,2022-02-24 22:57:27+00:00,1496982636996317184,@lotus12282633 @DonnaJo1002 @fab4screamie @Valkyrry @PattiKimble @ShercoSherrill @network_gal @PRubesa @NoreneD39766667 @perrigoat @Swags_D17 @grupp_arthur @1stCAV_AM_1972 @AndrewTRei1 @JonMesser7 @elisabeth_pal @kim_f86 @Robyn_CherCrew @TheAwkwardLefty @dorismelmore1 @Jeeneree @GlennVotesBlue @mitch_gorman @JameJoh86430339 @TrishBeMe @AveryBa68752542 @unewmeb41 @WebbL03 @MoreWhit @dac37799066 @nadiepetah @rlongman1 @MVRaiderFan @debbie_american @Cherfan4ever1 @Meiguo_nan @gregdemarco1964 @TXhoneydew @RestlessNews @judeaa @markie_kmarky8 @kasseykrammer @SaulTgh @boris3324 @christraynor @Impeach66051636 @yazzyqz @Itridpm @DanerE_57 @jheil American companies need to get out of Russia — McDonalds &amp; Coke &amp; others,agavecorn,3,agave corn 💙#TheCountryIsSickFromLackOfJustice,en
5,2022-02-24 22:50:09+00:00,1496980798234402820,@FinnLove11 @adinqiang @pragmatictexan @CNN We asked Russia to Join Nato they said no. Yes America is bad we put our Nato Cannons on your territory and force all NATO countries to bow down and follow us. We vote for you too. We make you have Netflix McDonalds against your will. We are bad /s,Lutraboy1,0,Lutraboy,en
6,2022-02-24 22:49:09+00:00,1496980545338998784,Hey @McDonalds are you still in Russia?,BartoszBauza,1,Bartosz Bauza,en
7,2022-02-24 22:44:15+00:00,1496979312859181057,Russia invaded the Ukraine because McDonalds wont bring back the McRib,DownLoadedHats,3,Noah Wingfield,en
8,2022-02-24 22:37:05+00:00,1496977508339703809,@lisanandy @McDonalds @Starbucks &amp; @amazon Should pull out of Russia and Belarus. That would cause uproar amongst Russians and Belarusians. Think they would prefer to have these things rather than #MadVlad,t5vdubb,0,VDubstar,en
9,2022-02-24 22:33:54+00:00,1496976709626777600,@TraceyKent My Father was part of the McDonalds team that developed Russia. He cannot stand Vodka or Balalaika music to this day!,ChefWendyB,1,Nova Scotian Ontarian,en


### Getting user data

This is done with tweepy. To make this work you will need to sign up for a twitter developers account and put your keys into a key config file. User data which we will want includes:
- Number of followers
- Number following (called friends in the API)
- Potentially the names of follows
- Potentially the number following
- Potentially the location
Whilst it would be great to get the names of at least some of a users followers and friends, getting them is severely limited by tweepys limit of 15 friends/followers requests of a user every 15 minutes. We can however get the number of friends/followers which has a much higher rate limit, supposedly around 300 requests every 15 minutes but it actually appears to be higher in practice, somewhere around 900/10 mins but this fluctuates.

In [32]:
import json
import tweepy

# Load my API keys
with open("../twitter_keys_config.json") as cfg_file:
    config = json.load(cfg_file)
    
# Setup tweepy API
consumer_key = config["API_key"]
consumer_secret = config["API_secret"]
access_token = config["access_token"]
access_token_secret = config["access_secret"]

# authorization of consumer key and consumer secret
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)

# set access to user's access key and access secret
auth.set_access_token(access_token, access_token_secret)

# calling the api
api = tweepy.API(auth, wait_on_rate_limit = True)

In [37]:
# Get number of followers function
def get_num_followers_and_friends(username):
    user = api.get_user(screen_name = username)
    followers_count = user.followers_count
    friends_count = user.friends_count
    
    return followers_count, friends_count

# Iterrate through a dataframe and add columns for num friends and num followers
def add_num_followers_and_friends(tweets_df): 
    for idx, tweet in tqdm(tweets_df.iterrows()):
        username = tweet['Username']

        try:
            num_followers, num_friends = get_num_followers_and_friends(username)

        # Exception for if tweepy cuts you off for too many requests
        # EVEN though wait_on_rate_limit = True should address this!!
        except tweepy.errors.TweepyException:
            wait_time = 60
            print("Tweepy rate limit reached and native tweepy solution failed, waiting", wait_time, "s")
            time.sleep(wait_time)
            
        # Exception for if an account has been deleted since scraping tweets
        except tweepy.errors.NotFound:
            print("Username", username, "not found, skipping")
            num_followers, num_friends = 0,0 # TODO Make this zero or some error value?
            
        
        tweets_df.loc[idx, 'Num Followers'] = num_followers
        tweets_df.loc[idx, 'Num Friends'] = num_friends
    
    return tweets_df

tweets_en_df = add_num_followers_and_friends(tweets_en_df)
tweets_ru_df = add_num_followers_and_friends(tweets_ru_df)
tweets_uk_df = add_num_followers_and_friends(tweets_uk_df)

900it [06:07,  2.94it/s]Rate limit reached. Sleeping for: 533


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


1801it [21:53,  2.86it/s]Rate limit reached. Sleeping for: 548


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


2702it [37:31,  2.81it/s] Rate limit reached. Sleeping for: 571


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


3603it [53:37,  2.85it/s] Rate limit reached. Sleeping for: 567


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


4504it [1:09:18,  3.06it/s] Rate limit reached. Sleeping for: 587


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


5405it [1:25:38,  2.80it/s] Rate limit reached. Sleeping for: 569


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


6306it [1:42:14,  2.91it/s] Rate limit reached. Sleeping for: 572


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


7207it [1:58:11,  3.06it/s] Rate limit reached. Sleeping for: 577
8107it [2:13:24,  2.89it/s] Rate limit reached. Sleeping for: 566


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


9008it [2:29:13,  2.77it/s] Rate limit reached. Sleeping for: 578


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


9909it [2:45:40,  3.03it/s] Rate limit reached. Sleeping for: 553


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


10272it [2:58:13,  2.85it/s]

Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


10810it [3:02:39,  2.98it/s]Rate limit reached. Sleeping for: 496
11710it [3:16:06,  2.76it/s] Rate limit reached. Sleeping for: 589


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


12530it [3:32:02,  2.89it/s] 

Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


12611it [3:33:45,  2.31it/s]Rate limit reached. Sleeping for: 491


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


13512it [3:48:48,  1.94it/s] Rate limit reached. Sleeping for: 549
14392it [4:03:25,  2.68it/s] 

Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


14412it [4:04:32,  2.76it/s]Rate limit reached. Sleeping for: 506


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


15313it [4:19:04,  3.04it/s] Rate limit reached. Sleeping for: 596


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


16214it [4:35:13,  2.14it/s] Rate limit reached. Sleeping for: 588


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


17115it [4:51:33,  3.06it/s] Rate limit reached. Sleeping for: 570
18015it [5:07:39,  2.79it/s] Rate limit reached. Sleeping for: 506
18084it [5:16:30,  2.97it/s] 

Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


18915it [5:22:19,  3.08it/s]Rate limit reached. Sleeping for: 527


Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


18976it [5:32:29,  2.25it/s] 

Tweepy rate limit reached and native tweepy solution failed, waiting 60 s


19105it [5:34:13,  1.05s/it]
79it [00:26,  2.94it/s]
8it [00:02,  2.91it/s]


In [36]:
dir(tweepy.errors)

['BadRequest',
 'Forbidden',
 'HTTPException',
 'NotFound',
 'TooManyRequests',
 'TweepyException',
 'TwitterServerError',
 'Unauthorized',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'requests']

### Save dataframes

In [46]:
tweets_en_df.to_csv('data/tweets_raw_en_df_' + keywords_en.replace(' ', '_') + str(num_tweets) + 'dailytweets_' + start_date + '_to_' + end_date + '.csv')
tweets_ru_df.to_csv('data/tweets_raw_ru_df_' + keywords_ru.replace(' ', '_') + str(num_tweets) + 'dailytweets_' + start_date + '_to_' + end_date + '.csv')
tweets_uk_df.to_csv('data/tweets_raw_uk_df_' + keywords_uk.replace(' ', '_') + str(num_tweets) + 'dailytweets_' + start_date + '_to_' + end_date + '.csv')

In [45]:
keywords_en.replace(' ', '_')

'mcdonalds_russia'