In [1]:
import pandas as pd
import requests
import json
import datetime
import time

### Set up:

In [2]:
# define all necessary functions:
def connect_to_twitter(token):
    bearer_token = token
    return {"Authorization": "Bearer {}".format(bearer_token)}

def make_request(headers, params, url):
    url=url
    params=params
    return requests.request("GET", url, params=params, headers=headers).json()

def make_df(response):
    return pd.DataFrame(response['data'])


In [3]:
# read credentials:
# creds = pd.read_csv(f'../../creds/CredentialsAcademicAPI.csv') # read own credentials

# define bearer_token:
bearer_token = creds.iloc[0]['bearer_token']

In [4]:
# connect to API
headers = connect_to_twitter(bearer_token)

In [5]:
# read data:
meta_data = pd.read_csv(f'../data/twitter/time_stamps_all.csv')

### 1. Collect Twitter IDs of all Outlets 

In [6]:
# define url:
url="https://api.twitter.com/2/users/by"

# define params: 
all_handles = meta_data['twitter_handle'].tolist()

In [None]:
# create empty dict:
users={}


for handle in all_handles:
    params={'usernames': f'{handle}'} # add Twitter handle for outlets here
    
    users[f'{handle}']=make_request(headers, params, url)

In [7]:
# dict to dataframe:
df_ids = pd.DataFrame(list(users.items()))

In [8]:
# extract user ids for each outlet from dictionary column:
user_ids=[]
for i in range(0, len(df_ids)):
    user_ids.append(df_ids.iloc[i,1]['data'][0]['id'])

In [9]:
df_ids[1]=user_ids
df_ids=df_ids.rename(columns={0: "username", 1: "author_id"})

In [61]:
# save to csv:
#df_ids.to_csv(f'../data/twitter/user_ids.csv', header=True, index=False)
# when reading csv, id column is read as int; df_ids['author_id'].astype(str) to convert to string

### 2. Get Tweets Count

In [352]:
# define url
url="http://api.twitter.com/2/tweets/counts/all"

# define params: 
#all_handles = meta_data['twitter_handle'].tolist() # already defined above
start_dates = meta_data['start_dates'].tolist()
end_dates = meta_data['end_dates'].tolist()

In [353]:
# create empty dict:
dict_tweet_count={}


for handle, start, end in zip(all_handles, start_dates, end_dates):
    params={'query': f'from:{handle}', # add Twitter handle for outlets here
            'start_time': datetime.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S%z').isoformat(), # german time => created at 2021-10-23T22:00:00.000Z means 2021-10-24T00:00:00 GERMAN TIME 
            'end_time':datetime.datetime.strptime(end, '%Y-%m-%dT%H:%M:%S%z').isoformat(),
            'granularity': 'day'}
    
    tweet_count=make_request(headers, params, url)
    df_tweet_count=make_df(tweet_count)
    
    if 'next_token' in  tweet_count['meta']:
        while 'next_token' in  tweet_count['meta']:
            params['next_token'] = tweet_count['meta']['next_token']
            tweet_count=make_request(headers, params, url)
            df_tweet_count=df_tweet_count.append(make_df(tweet_count))
            time.sleep(4) # only 300 requests per 15 minutes
            
            if 'next_token' not in  tweet_count['meta']:
                break
    
    dict_tweet_count[f'{handle}']=df_tweet_count['tweet_count'].sum()

In [354]:
tweet_counts_all=pd.DataFrame(list(dict_tweet_count.items()))

In [356]:
tweet_counts_all=tweet_counts_all.rename(columns={0: "username", 1: "tweet_count"})

In [36]:
#tweet_counts_all.to_csv(f'../data/twitter/tweet_counts.csv', header=True, index=False)

### 3. Search Tweets

In [278]:
# remove outlets with zero tweets (@comicsandsdaily, @EveningTimesCC, @NewYorkSun)
#meta_data

#zero_tweets = ['comicsandsdaily', 'EveningTimesCC', 'NewYorkSun']
#meta_data_filtered = meta_data[~meta_data['twitter_handle'].isin(zero_tweets)]

In [346]:
# split meta_data df, test code on few outlets, then run code for first half, for second half later: 
meta_data_test = meta_data.loc[0:3] # to test run code for first 4 outlets
meta_data_1 = meta_data.loc[4:140] # top half
meta_data_2 = meta_data.loc[141:282] #bottom half

In [308]:
# define url:
url="http://api.twitter.com/2/tweets/search/all"

#### Test run (19thnews - ajc)

In [309]:
# define params:
twitter_handles = meta_data_test['twitter_handle'].tolist()
start_dates = meta_data_test['start_dates'].tolist()
end_dates = meta_data_test['end_dates'].tolist()

In [281]:
# define request parameters
for handle, start, end in zip(twitter_handles, start_dates, end_dates):
    params={'query': f'from:{handle}', # add Twitter handle for outlets here
            'start_time': datetime.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S%z').isoformat(), # german time => created at 2021-10-23T22:00:00.000Z means 2021-10-24T00:00:00 GERMAN TIME 
            'end_time': datetime.datetime.strptime(end, '%Y-%m-%dT%H:%M:%S%z').isoformat(), 
            'tweet.fields': 'author_id,entities,attachments,conversation_id,created_at,referenced_tweets',
            'max_results': 500} # default is 10, max possible is 100
    
    response=make_request(headers, params, url)
    time.sleep(4)
    
    if response['meta']['result_count'] != 0:
        response_df_outlets=make_df(response)
        
        if 'next_token' in  response['meta']:
            while 'next_token' in  response['meta']:
                params['next_token'] = response['meta']['next_token']
                response=make_request(headers, params, url)
                time.sleep(4)
                
                if response['meta']['result_count'] != 0:
                    response_df_outlets=response_df_outlets.append(make_df(response))
                    
                if 'next_token' not in  response['meta']:
                    break
    
        response_df_outlets.to_csv(f'../data/twitter/tweet_collection/{handle}.csv', index=False)

#### First run (AJEnglish - NationalFile)

In [324]:
# define params:
twitter_handles = meta_data_1['twitter_handle'].tolist()
start_dates = meta_data_1['start_dates'].tolist()
end_dates = meta_data_1['end_dates'].tolist()

In [325]:
# define request parameters
for handle, start, end in zip(twitter_handles, start_dates, end_dates):
    params={'query': f'from:{handle}', # add Twitter handle for outlets here
            'start_time': datetime.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S%z').isoformat(), # german time => created at 2021-10-23T22:00:00.000Z means 2021-10-24T00:00:00 GERMAN TIME 
            'end_time': datetime.datetime.strptime(end, '%Y-%m-%dT%H:%M:%S%z').isoformat(), 
            'tweet.fields': 'author_id,entities,attachments,conversation_id,created_at,referenced_tweets',
            'max_results': 500} # default is 10, max possible is 100
    
    response=make_request(headers, params, url)
    time.sleep(4)
    
    if 'title' in response:
        time.sleep(901)
        response=make_request(headers, params, url)
        time.sleep(4)
    
    if response['meta']['result_count'] != 0:
        response_df_outlets=make_df(response)
        
        if 'next_token' in  response['meta']:
            while 'next_token' in  response['meta']:
                params['next_token'] = response['meta']['next_token']
                response=make_request(headers, params, url)
                time.sleep(4)
                
                if response['meta']['result_count'] != 0:
                    response_df_outlets=response_df_outlets.append(make_df(response))
                    
                if 'next_token' not in  response['meta']:
                    break
    
        response_df_outlets.to_csv(f'../data/twitter/tweet_collection/{handle}.csv', index=False)

KeyError: 'meta'

In [337]:
# 'Too Many Requests error'; continue with @FDRLST
# redefine params:

meta_data_1v2 = meta_data_1.loc[84:140]

twitter_handles = meta_data_1v2['twitter_handle'].tolist()
start_dates = meta_data_1v2['start_dates'].tolist()
end_dates = meta_data_1v2['end_dates'].tolist()

In [338]:
# define request parameters
for handle, start, end in zip(twitter_handles, start_dates, end_dates):
    params={'query': f'from:{handle}', # add Twitter handle for outlets here
            'start_time': datetime.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S%z').isoformat(), # german time => created at 2021-10-23T22:00:00.000Z means 2021-10-24T00:00:00 GERMAN TIME 
            'end_time': datetime.datetime.strptime(end, '%Y-%m-%dT%H:%M:%S%z').isoformat(), 
            'tweet.fields': 'author_id,entities,attachments,conversation_id,created_at,referenced_tweets',
            'max_results': 500} # default is 10, max possible is 100
    
    response=make_request(headers, params, url)
    time.sleep(4)
    
    if 'title' in response:
        time.sleep(901)
        response=make_request(headers, params, url)
        time.sleep(4)
    
    if response['meta']['result_count'] != 0:
        response_df_outlets=make_df(response)
        
        if 'next_token' in  response['meta']:
            while 'next_token' in  response['meta']:
                params['next_token'] = response['meta']['next_token']
                response=make_request(headers, params, url)
                time.sleep(4)
                
                if response['meta']['result_count'] != 0:
                    response_df_outlets=response_df_outlets.append(make_df(response))
                    
                if 'next_token' not in  response['meta']:
                    break
    
        response_df_outlets.to_csv(f'../data/twitter/tweet_collection/{handle}.csv', index=False)

ConnectionError: ('Connection aborted.', ConnectionAbortedError(10053, 'Eine bestehende Verbindung wurde softwaregesteuert\r\ndurch den Hostcomputer abgebrochen', None, 10053, None))

In [342]:
# 'Connection aborted'; continue with @Mediaite
# redefine params:

meta_data_1v3 = meta_data_1.loc[129:140]

twitter_handles = meta_data_1v3['twitter_handle'].tolist()
start_dates = meta_data_1v3['start_dates'].tolist()
end_dates = meta_data_1v3['end_dates'].tolist()

In [343]:
# define request parameters
for handle, start, end in zip(twitter_handles, start_dates, end_dates):
    params={'query': f'from:{handle}', # add Twitter handle for outlets here
            'start_time': datetime.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S%z').isoformat(), # german time => created at 2021-10-23T22:00:00.000Z means 2021-10-24T00:00:00 GERMAN TIME 
            'end_time': datetime.datetime.strptime(end, '%Y-%m-%dT%H:%M:%S%z').isoformat(), 
            'tweet.fields': 'author_id,entities,attachments,conversation_id,created_at,referenced_tweets',
            'max_results': 500} # default is 10, max possible is 100
    
    response=make_request(headers, params, url)
    time.sleep(4)
    
    if 'title' in response:
        time.sleep(901)
        response=make_request(headers, params, url)
        time.sleep(4)
    
    if response['meta']['result_count'] != 0:
        response_df_outlets=make_df(response)
        
        if 'next_token' in  response['meta']:
            while 'next_token' in  response['meta']:
                params['next_token'] = response['meta']['next_token']
                response=make_request(headers, params, url)
                time.sleep(4)
                
                if response['meta']['result_count'] != 0:
                    response_df_outlets=response_df_outlets.append(make_df(response))
                    
                if 'next_token' not in  response['meta']:
                    break
    
        response_df_outlets.to_csv(f'../data/twitter/tweet_collection/{handle}.csv', index=False)

#### Second run (NBCNews - zerohedge)

In [348]:
# define params:
twitter_handles = meta_data_2['twitter_handle'].tolist()
start_dates = meta_data_2['start_dates'].tolist()
end_dates = meta_data_2['end_dates'].tolist()

In [351]:
# define request parameters
for handle, start, end in zip(twitter_handles, start_dates, end_dates):
    params={'query': f'from:{handle}', # add Twitter handle for outlets here
            'start_time': datetime.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S%z').isoformat(), # german time => created at 2021-10-23T22:00:00.000Z means 2021-10-24T00:00:00 GERMAN TIME 
            'end_time': datetime.datetime.strptime(end, '%Y-%m-%dT%H:%M:%S%z').isoformat(), 
            'tweet.fields': 'author_id,entities,attachments,conversation_id,created_at,referenced_tweets',
            'max_results': 500} # default is 10, max possible is 100
    
    response=make_request(headers, params, url)
    time.sleep(4)
    
    if 'title' in response:
        time.sleep(901)
        response=make_request(headers, params, url)
        time.sleep(4)
    
    if response['meta']['result_count'] != 0:
        response_df_outlets=make_df(response)
        
        if 'next_token' in  response['meta']:
            while 'next_token' in  response['meta']:
                params['next_token'] = response['meta']['next_token']
                response=make_request(headers, params, url)
                time.sleep(4)
                
                if response['meta']['result_count'] != 0:
                    response_df_outlets=response_df_outlets.append(make_df(response))
                    
                if 'next_token' not in  response['meta']:
                    break
    
        response_df_outlets.to_csv(f'../data/twitter/tweet_collection/{handle}.csv', index=False)