In [3]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timezone
import json
import os
import dotenv
import pickle
import time

In [4]:
# The .env file lives one directory above the notebook's working directory.
ENV_PATH = os.path.join(os.pardir, ".env")

# Load the variables declared in the .env file into the process environment.
dotenv.load_dotenv(dotenv_path=ENV_PATH)

# Twitter API bearer token; a missing variable raises KeyError right here.
BEARER = os.environ["TWITTER_BEARER_TOKEN"]


In [5]:
# Authorization header passed to the requests.get() calls in this notebook.
headers = {"Authorization": "Bearer " + BEARER}

In [6]:
# Default collection window. Both datetimes are naive midnight values;
# astimezone() attaches the machine's local UTC offset before the value is
# rendered as an ISO-8601 string, so the exact instants depend on where the
# notebook is run.
start_time = datetime(2022, 1, 1).astimezone().isoformat()
end_time = datetime(2022, 12, 31).astimezone().isoformat()

In [115]:
# Full-archive tweet-counts endpoint.
tweets_count = "https://api.twitter.com/2/tweets/counts/all"

# Daily tweet counts for the BBCNews account over the default window.
# NOTE(review): only a single request is issued and no next_token is followed
# here -- confirm the endpoint returns the whole window in one page, otherwise
# the total computed from this response covers only part of the year.
params = dict(
    query="from:BBCNews",
    start_time=start_time,
    end_time=end_time,
    granularity="day",
)

response = requests.get(tweets_count, headers=headers, params=params)

In [116]:
# Add up the per-bucket 'tweet_count' values found in the counts response.
count = sum(bucket['tweet_count'] for bucket in response.json()["data"])

In [117]:
# Show the total accumulated from the counts response as this cell's output.
count

2491

In [7]:
json_list = []  # notebook-level name; get_tweets() below fills its own local list


def get_tweets(screen_name, start_time=start_time, end_time=end_time):
    """Download all tweets by ``screen_name`` from the full-archive search endpoint.

    Requests ``/2/tweets/search/all`` 500 results at a time and follows
    ``meta.next_token`` until the API stops returning one or reports an
    empty page. Rate-limit (429) and overload (503) errors are retried after
    a 15-minute pause.

    Parameters
    ----------
    screen_name : str
        Account handle, interpolated into the ``from:`` search query.
    start_time, end_time : str
        Values sent as the ``start_time`` / ``end_time`` request parameters.
        The defaults are the notebook-level values at the moment this
        function is defined.

    Returns
    -------
    list of dict
        One decoded JSON response per retrieved page, in retrieval order.

    Raises
    ------
    RuntimeError
        When the API answers with an error payload whose status is neither
        429 nor 503. The pages collected so far are pickled to
        ``{screen_name}_datafile.txt`` before the exception is raised.
    """
    json_list = []
    search_tweets = "https://api.twitter.com/2/tweets/search/all"
    page_token = None
    while True:

        params = {
            "query": f"from:{screen_name}",
            "tweet.fields": "entities,public_metrics,created_at",
            # None on the first request; requests leaves None-valued
            # parameters out of the query string.
            "pagination_token": page_token,
            "max_results": 500,
            "expansions": "author_id",
            "start_time": start_time,
            "end_time": end_time,
        }

        response = requests.get(search_tweets, params=params, headers=headers)
        payload = response.json()  # decode the body once per page

        # A top-level "title" key is treated as the marker of an error payload.
        if "title" in payload:
            status = payload.get("status")
            if status == 429:
                print("Rate limit reached. waiting 15 minutes")
                time.sleep(900)
                continue
            if status == 503:
                print("Service overloaded. waiting 15 minutes")
                time.sleep(900)
                continue

            # Any other error: save what has been collected, then actually
            # stop instead of falling through to the "meta" lookup below.
            print("Encountered unknown error pickling current results and stopping program")
            with open(f'{screen_name}_datafile.txt', 'wb') as fh:
                pickle.dump(json_list, fh)
            raise RuntimeError(
                f"Twitter API returned an unexpected error for {screen_name!r}: {payload}"
            )

        if payload["meta"]["result_count"] == 0:
            break

        json_list.append(payload)
        time.sleep(1)  # short pause between consecutive page requests

        # No next_token in the metadata means this was the last page.
        page_token = payload["meta"].get("next_token")
        if page_token is None:
            break
    return json_list


In [184]:
# Twitter handles passed one by one to get_tweets() by the download loop.
media_list = [
    "bbcnews",
    "itvnews",
    "Channel4News",
    "guardian",
    "MailOnline",
    "thetimes",
    "Telegraph",
    "TheSun",
    "SkyNews",
]

In [185]:
# Download each account's tweets for the default window and write one CSV
# per account under ../data/media_tweets/.
for i in media_list:

    results = get_tweets(i)

    # Flatten every page's "data" array into its own DataFrame.
    df_list = [pd.json_normalize(j["data"]) for j in results]

    # pd.concat raises ValueError on an empty list, so an account that
    # returned no pages is skipped instead of aborting the whole run
    # (the MP download loop guards the same case).
    if not df_list:
        continue

    total_df = pd.concat(df_list, ignore_index=True)

    total_df.to_csv(f'../data/media_tweets/{i}.csv')
    total_df = None  # drop the reference to the frame before the next download

Service overloaded. waiting 15 minutes
Service overloaded. waiting 15 minutes
Service overloaded. waiting 15 minutes


In [8]:
# MP table; the download loop reads its twitter_username, start_time and
# end_time columns.
mp_df = pd.read_csv(filepath_or_buffer="../data/2019_MPs.csv")

In [14]:

# Download each MP's tweets for that MP's own time window and write one CSV
# per account under ../data/mp_tweets/. Every row of mp_df is visited,
# including the first one (positional index 0).
for _, mp_row in mp_df.iterrows():

    screen_name = mp_row['twitter_username']
    if pd.isna(screen_name):
        continue  # no Twitter handle recorded for this row

    # Loop-local names keep the notebook-level start_time / end_time (the
    # default collection window) from being overwritten by the last row.
    mp_start = mp_row['start_time']
    mp_end = mp_row['end_time']
    results = get_tweets(screen_name, mp_start, mp_end)

    # Flatten every page's "data" array into its own DataFrame.
    df_list = [pd.json_normalize(page["data"]) for page in results]
    if not df_list:
        continue  # nothing was returned for this account; no file is written

    total_df = pd.concat(df_list, ignore_index=True)
    total_df.to_csv(f'../data/mp_tweets/{screen_name}.csv')


Rate limit reached. waiting 15 minutes
Rate limit reached. waiting 15 minutes
Service overloaded. waiting 15 minutes
Rate limit reached. waiting 15 minutes
Service overloaded. waiting 15 minutes
