In [1]:
import pandas as pd
import requests
import json
import datetime
import time
import csv

In [2]:
# read user data:
user_data = pd.read_csv('../data/twitter/user_ids.csv')
twitter_handles = user_data['username'].tolist()

# read all articles and their matched tweet (if available; articles for which
# no tweet was found are also present in the dataset)
dtype = {'text': str, 'author_id': str, 'conversation_id': str, 'id': str, 'entities': str, 'attachments': str, 'referenced_tweets': str, 'withheld': str}
parse_dates = ['created_at']

# Collect one frame per outlet and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previously loaded rows on every iteration.
outlet_frames = []
for handle in twitter_handles:
    try:
        outlet_frames.append(pd.read_csv(f'../data/twitter/article_tweets/{handle}.csv', dtype=dtype, parse_dates=parse_dates))
    except FileNotFoundError:
        # outlets without a file are reported and skipped
        print(f'no file for outlet: {handle}')

# pd.concat raises on an empty list, so fall back to an empty frame when no
# outlet file was found at all
all_articles = pd.concat(outlet_frames, ignore_index=True) if outlet_frames else pd.DataFrame(columns=[])

no file for outlet: BoingBoing
no file for outlet: comicsandsdaily
no file for outlet: EveningTimesCC
no file for outlet: NewYorkSun


In [3]:
# keep only the rows that carry a tweet 'id'; a missing 'id' marks an article
# for which no tweet was found
has_tweet = all_articles['id'].notna()
article_tweets = all_articles[has_tweet]

### Set up API

In [4]:
# define all necessary functions:
def connect_to_twitter(token):
    """Build the request headers that carry a bearer token.

    Parameters
    ----------
    token : the bearer token to embed in the ``Authorization`` header.

    Returns
    -------
    dict with a single ``Authorization`` entry of the form ``Bearer <token>``.
    """
    return {"Authorization": f"Bearer {token}"}

def make_request(headers, params, url, timeout=30):
    """Send a GET request and return the decoded JSON body.

    Parameters
    ----------
    headers : dict of HTTP headers sent with the request.
    params : dict of query-string parameters.
    url : endpoint to query.
    timeout : seconds to wait for the server before giving up. Without a
        timeout a stalled connection would block the caller indefinitely.

    Returns
    -------
    The response body parsed from JSON.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    return response.json()

def make_df(response):
    """Turn the ``data`` entry of a response into a DataFrame.

    Parameters
    ----------
    response : mapping with a ``'data'`` key.

    Returns
    -------
    pandas.DataFrame built from ``response['data']``.

    Raises
    ------
    KeyError
        If ``response`` has no ``'data'`` entry.
    """
    records = response['data']
    frame = pd.DataFrame(records)
    return frame

In [5]:
# read credentials: a CSV whose first row holds the token in a 'bearer_token'
# column. Point the path at your own credentials file. The read must stay
# active -- with it commented out, `creds` is undefined and the next statement
# fails with a NameError on a fresh kernel.
creds = pd.read_csv('../../creds/CredentialsAcademicAPI.csv') # read own credentials

# define bearer_token:
bearer_token = creds.iloc[0]['bearer_token']

In [6]:
# build the authorization headers that are sent with every API request
headers = connect_to_twitter(token=bearer_token)




### Collecting quoted retweets of the article tweets

In [7]:
# ids of the matched tweets
tweet_ids = article_tweets['id'].tolist()

# creation timestamps of the matched tweets, converted to pydatetime
start_dates = [stamp.to_pydatetime() for stamp in article_tweets['created_at'].tolist()]

# every search window ends 60 days after its start, capped at the cutoff
cut_off = '2021-11-23 00:00:00+00:00' # cutoff date (date can't be in future)
cut_off_dt = datetime.datetime.strptime(cut_off, '%Y-%m-%d %H:%M:%S%z')
end_dates = []
for window_start in start_dates:
    window_end = window_start + datetime.timedelta(+60)
    end_dates.append(cut_off_dt if window_end > cut_off_dt else window_end)

# both lists are stored as strings
start_dates = list(map(str, start_dates))
end_dates = list(map(str, end_dates))

In [8]:
# define url (https, so the bearer token in the Authorization header is never
# sent over an unencrypted connection)
url="https://api.twitter.com/2/tweets/search/all"

In [9]:
# For every matched tweet: search for its id within the tweet's time window,
# follow the pagination tokens, and write all hits to one CSV per tweet.
for tweet_id, start, end in zip(tweet_ids, start_dates, end_dates):
    # define params
    params={'query': f'{tweet_id}',
            'start_time': datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S%z').isoformat(),
            'end_time': datetime.datetime.strptime(end, '%Y-%m-%d %H:%M:%S%z').isoformat(),
            'tweet.fields': 'in_reply_to_user_id,author_id,created_at,conversation_id', 
            'expansions': 'referenced_tweets.id,in_reply_to_user_id',
            'max_results': 500} # with academic access: 500 results per response

    # Pages are gathered in a list and concatenated once: DataFrame.append was
    # removed in pandas 2.0 and re-copies the accumulated rows on every call.
    pages = []
    while True:
        response = make_request(headers, params, url)
        time.sleep(4) # pause between consecutive requests

        if 'meta' not in response:
            # the payload carries no result metadata (e.g. an error body);
            # fail with the payload itself instead of a bare KeyError
            raise RuntimeError(f'unexpected API response for tweet {tweet_id}: {response}')
        meta = response['meta']

        if meta['result_count'] > 0:
            pages.append(make_df(response))

        # no token -> this was the last page
        if 'next_token' not in meta:
            break
        params['next_token'] = meta['next_token']

    # a file is only written when at least one result came back
    if pages:
        response_df = pd.concat(pages)
        response_df.to_csv(f'../data/twitter/retweet_collection/{tweet_id}_retweet.csv', index=False)