In [1]:
import csv
import datetime
import json
import os
import time

import pandas as pd
import requests

In [9]:
# read user data:
user_data = pd.read_csv('../data/twitter/user_ids.csv')
twitter_handles = user_data['username'].tolist()

# read all articles and their matched tweet (if available; articles for which
# no tweet was found are also present in the dataset)
dtype = {'text': str, 'author_id': str, 'conversation_id': str, 'id': str, 'entities': str, 'attachments': str, 'referenced_tweets': str, 'withheld': str}
parse_dates = ['created_at']

# Collect one frame per outlet and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call.
article_frames = []
for handle in twitter_handles:
    try:
        article_frames.append(
            pd.read_csv(f'../data/twitter/article_tweets/{handle}.csv', dtype=dtype, parse_dates=parse_dates)
        )
    except FileNotFoundError:
        # some outlets have no article file; report them and carry on
        print(f'no file for outlet: {handle}')

# pd.concat raises on an empty list, so fall back to an empty frame when no file was found
if article_frames:
    all_articles = pd.concat(article_frames, ignore_index=True)
else:
    all_articles = pd.DataFrame(columns=[])

no file for outlet: BoingBoing
no file for outlet: comicsandsdaily
no file for outlet: EveningTimesCC
no file for outlet: NewYorkSun


In [12]:
# keep only rows that have a tweet 'id'; rows without one are articles for which no tweet was found
article_tweets = all_articles[all_articles['id'].notna()]

### Set up API

In [9]:
# define all necessary functions:
def connect_to_twitter(token):
    """Return the HTTP headers dict carrying `token` as a Bearer authorization."""
    return {"Authorization": f"Bearer {token}"}

def make_request(headers, params, url, timeout=30):
    """Send a GET request and return the decoded JSON body.

    Parameters
    ----------
    headers : dict
        HTTP headers sent with the request (e.g. the authorization header).
    params : dict
        Query parameters sent with the request.
    url : str
        Endpoint to call.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the collection loop indefinitely.

    Returns
    -------
    The parsed JSON payload of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that failures surface
        here instead of as a missing-key error further downstream.
    """
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()

def make_df(response):
    """Build a DataFrame from the 'data' entry of a response mapping."""
    records = response['data']
    return pd.DataFrame(records)

In [10]:
# read credentials:
# The bearer token is taken from the TWITTER_BEARER_TOKEN environment variable
# when it is set; otherwise it is read from a local credentials file.
# (Previously the read below was commented out, so `creds` was undefined on a fresh kernel.)
bearer_token = os.environ.get('TWITTER_BEARER_TOKEN')

if not bearer_token:
    creds = pd.read_csv('../../creds/CredentialsAcademicAPI.csv') # read own credentials

    # define bearer_token:
    bearer_token = creds.iloc[0]['bearer_token']

In [11]:
# connect to API: build the authorization headers passed to every request
headers = connect_to_twitter(token=bearer_token)

### Comments Count: count the comments posted within the first 60 days (2 months) after each tweet was posted

In [15]:
# extract list of tweet ids:
tweet_ids = article_tweets['id'].tolist()

# tweet creation times as Python datetimes (start of each counting window)
start_dates = [created.to_pydatetime() for created in article_tweets['created_at'].tolist()]

# calculate end date (start + 60 days), capped at the cutoff date
cut_off = '2021-11-23 00:00:00+00:00' # cutoff date (date can't be in future)
cut_off_dt = datetime.datetime.strptime(cut_off, '%Y-%m-%d %H:%M:%S%z') # parsed once instead of on every iteration
window = datetime.timedelta(days=60)
end_dates = [min(start + window, cut_off_dt) for start in start_dates]

# convert both lists to strings:
start_dates = [str(start) for start in start_dates]
end_dates = [str(end) for end in end_dates]


In [16]:
# one row per tweet: its id plus the start and end of its time window
df = pd.DataFrame(dict(tweet_ids=tweet_ids,
                       start_dates=start_dates,
                       end_dates=end_dates))


In [15]:
# define url (HTTPS, so the bearer token in the request headers is never sent in clear text)
url="https://api.twitter.com/2/tweets/counts/all"

### Part 1

In [16]:
# create empty dict: tweet id (as string) -> summed 'tweet_count' over all returned pages
dic_comment_count={}

tweet_ids = df['tweet_ids'].tolist()
start_dates = df['start_dates'].tolist()
end_dates = df['end_dates'].tolist()


for tweet_id, start, end in zip(tweet_ids, start_dates, end_dates):
    params={'query': f'conversation_id:{tweet_id}',
            'start_time': datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S%z').isoformat(),
            'end_time': datetime.datetime.strptime(end, '%Y-%m-%d %H:%M:%S%z').isoformat(),
            'granularity': 'day'}

    comment_count = make_request(headers, params, url) # make request
    time.sleep(4) # only 300 requests per 15 minutes
    count_pages = [make_df(comment_count)] # json to df, one frame per page

    # follow the pagination until the response carries no further 'next_token'
    while 'next_token' in comment_count['meta']:
        params['next_token'] = comment_count['meta']['next_token']
        comment_count = make_request(headers, params, url) # make request
        time.sleep(4) # only 300 requests per 15 minutes
        count_pages.append(make_df(comment_count))

    # concatenate once per tweet (DataFrame.append was removed in pandas 2.0)
    df_comment_count = pd.concat(count_pages)
    dic_comment_count[f'{tweet_id}'] = df_comment_count['tweet_count'].sum()

In [17]:
# dictionary to dataframe: keys become column 'id', values become column 'nr_of_comments'
df_comment_count = pd.DataFrame({
    'id': list(dic_comment_count.keys()),
    'nr_of_comments': list(dic_comment_count.values()),
})

# cast column 'id' as string
df_comment_count = df_comment_count.astype({'id': str})

In [20]:
# write the per-tweet comment counts to disk (with header row, without the index column)
df_comment_count.to_csv('../data/twitter/comments_count.csv', header=True, index=False)

### Search relevant tweets (i.e., tweets with count > 0)

In [41]:
# second name for the comment-count frame (same object, not a copy)
all_comments = df_comment_count

In [None]:
# keep only the tweets whose comment count is not zero
relevant_tweets = all_comments.loc[all_comments['nr_of_comments'].ne(0)]

In [43]:
# add remaining information to tweet id:
# NOTE(review): this cell previously merged against `all_tweets`, a name that is never
# defined in this notebook (it fails on a fresh kernel). `article_tweets` (the articles
# that have a matched tweet) is assumed to be the intended frame - confirm.
df_relevant_tweets = relevant_tweets.merge(article_tweets, how='left', on='id')
df_relevant_tweets

Unnamed: 0,id,nr_of_comments,title,title_manipulated,bias_score,reliability_score,article_url,article_urls_manipulated,adfontes_url,date,...,unwound_urls_manipulated,entities,created_at,conversation_id,text,text_manipulated,author_id,referenced_tweets,attachments,withheld
0,1411877241647206401,2,"COVID Delta variant puts men, people of color ...",covid delta variant puts men people of color a...,-1.00,48.00,https://19thnews.org/2021/07/the-covid-delta-v...,19thnews.org/2021/07/the-covid-delta-variant-p...,https://adfontesmedia.com/19th-news-bias-and-r...,2021-07-02,...,,"{'annotations': [{'start': 96, 'end': 108, 'pr...",2021-07-05 02:39:00+00:00,1411877241647206401,The highly contagious Delta variant of COVID-1...,the highly contagious delta variant of covid-1...,1219278784693768193,,,
1,1410951027017256965,4,Women in the Biden White House earn 99 cents f...,women in the biden white house earn 99 cents f...,-11.00,46.67,https://19thnews.org/2021/07/women-in-the-bide...,19thnews.org/2021/07/women-in-the-biden-white-...,https://adfontesmedia.com/19th-news-bias-and-r...,2021-07-01,...,,"{'annotations': [{'start': 13, 'end': 17, 'pro...",2021-07-02 13:18:33+00:00,1410951027017256965,Women in the Biden administration earn 99 cent...,women in the biden administration earn 99 cent...,1219278784693768193,,,
2,1410713592425598980,13,Women in the Biden White House earn 99 cents f...,women in the biden white house earn 99 cents f...,-11.00,46.67,https://19thnews.org/2021/07/women-in-the-bide...,19thnews.org/2021/07/women-in-the-biden-white-...,https://adfontesmedia.com/19th-news-bias-and-r...,2021-07-01,...,,"{'annotations': [{'start': 13, 'end': 29, 'pro...",2021-07-01 21:35:04+00:00,1410713592425598980,Women in the Biden White House earn 99 cents f...,women in the biden white house earn 99 cents f...,1219278784693768193,,,
3,1410679067943256064,7,Kagan warns Supreme Court has weakened voting ...,kagan warns supreme court has weakened voting ...,-12.00,45.67,https://19thnews.org/2021/07/kagan-dissent-brn...,19thnews.org/2021/07/kagan-dissent-brnovich-dn...,https://adfontesmedia.com/19th-news-bias-and-r...,2021-07-01,...,,"{'annotations': [{'start': 8, 'end': 20, 'prob...",2021-07-01 19:17:53+00:00,1410679067943256064,🧵: The Supreme Court’s decision to make it eas...,🧵 the supreme court’s decision to make it easi...,1219278784693768193,,,
4,1408228472678453248,1,Fate of infrastructure deal hinges on approvin...,fate of infrastructure deal hinges on approvin...,-4.33,46.00,https://19thnews.org/2021/06/american-families...,19thnews.org/2021/06/american-families-plan-in...,https://adfontesmedia.com/19th-news-bias-and-r...,2021-06-24,...,,"{'annotations': [{'start': 0, 'end': 13, 'prob...",2021-06-25 01:00:06+00:00,1408228472678453248,Speaker Pelosi says the House will not vote on...,speaker pelosi says the house will not vote on...,1219278784693768193,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3952,1275866456819982343,4,Flynn Dismissal Order 'Thoroughly Demolishes' ...,flynn dismissal order thoroughly demolishes di...,22.00,27.00,https://www.zerohedge.com/political/appeals-co...,zerohedge.com/political/appeals-court-orders-f...,https://adfontesmedia.com/zerohedge-bias-and-r...,2020-06-25,...,,"{'urls': [{'start': 84, 'end': 107, 'url': 'ht...",2020-06-24 19:00:42+00:00,1275866456819982343,Court-Appointed 'Hit-Judge' Scrambles To File ...,court-appointed hit-judge scrambles to file fl...,18856867,,,
3953,1275794084569976840,17,Flynn Dismissal Order 'Thoroughly Demolishes' ...,flynn dismissal order thoroughly demolishes di...,22.00,27.00,https://www.zerohedge.com/political/appeals-co...,zerohedge.com/political/appeals-court-orders-f...,https://adfontesmedia.com/zerohedge-bias-and-r...,2020-06-25,...,,"{'urls': [{'start': 49, 'end': 72, 'url': 'htt...",2020-06-24 14:13:07+00:00,1275794084569976840,Appeals Court Orders Flynn Judge To Dismiss Ca...,appeals court orders flynn judge to dismiss ca...,18856867,,,
3954,1272583576438083584,23,"US Surgeon General Flip-Flops From ""Stop Buyin...",us surgeon general flip-flops from stop buying...,8.00,33.67,https://www.zerohedge.com/political/us-surgeon...,zerohedge.com/political/us-surgeon-general-fli...,https://adfontesmedia.com/zerohedge-bias-and-r...,2020-06-15,...,,"{'urls': [{'start': 91, 'end': 114, 'url': 'ht...",2020-06-15 17:35:42+00:00,1272583576438083584,"US Surgeon General Flip-Flops From ""Stop Buyin...",us surgeon general flip-flops from stop buying...,18856867,,,
3955,1272593722992140290,17,Morgan Stanley Turns Even More Bullish: Hikes ...,morgan stanley turns even more bullish hikes s...,8.33,41.00,https://www.zerohedge.com/markets/morgan-stanl...,zerohedge.com/markets/morgan-stanley-turns-eve...,https://adfontesmedia.com/zerohedge-bias-and-r...,2020-06-15,...,,"{'annotations': [{'start': 0, 'end': 13, 'prob...",2020-06-15 18:16:01+00:00,1272593722992140290,Morgan Stanley Turns Even More Bullish: Hikes ...,morgan stanley turns even more bullish hikes s...,18856867,,,


In [51]:
# write the relevant tweets to disk (with header row, without the index column)
df_relevant_tweets.to_csv('../data/twitter/relevant_tweets.csv', header=True, index=False)

### Collecting comments to tweets

In [52]:
# tweet ids and creation times of the relevant tweets
tweet_ids = df_relevant_tweets['id'].tolist()

# creation times as Python datetimes
start_dates = [created.to_pydatetime() for created in df_relevant_tweets['created_at'].tolist()]

# end of each window: start + 60 days, but never later than the cutoff
cut_off = '2021-11-23 00:00:00+00:00' # cutoff date (date can't be in future)
latest_end = datetime.datetime.strptime(cut_off, '%Y-%m-%d %H:%M:%S%z')
end_dates = [min(start + datetime.timedelta(+60), latest_end) for start in start_dates]

# both lists as strings
start_dates = list(map(str, start_dates))
end_dates = list(map(str, end_dates))

In [53]:
# define url (HTTPS, so the bearer token in the request headers is never sent in clear text):
url="https://api.twitter.com/2/tweets/search/all"

In [54]:
# For every relevant tweet: fetch all tweets of its conversation within the
# time window and store them in one CSV file per tweet.
for tweet_id, start, end in zip(tweet_ids, start_dates, end_dates):
    # define params
    params={'query': f'conversation_id:{tweet_id}', # add Twitter handle for outlets here
            'start_time': datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S%z').isoformat(),
            'end_time': datetime.datetime.strptime(end, '%Y-%m-%d %H:%M:%S%z').isoformat(),
            'tweet.fields': 'in_reply_to_user_id,author_id,created_at,conversation_id',
            'expansions': 'referenced_tweets.id,in_reply_to_user_id',
            'max_results': 500} # with academic access: 500 results per response

    response = make_request(headers, params, url)
    time.sleep(4) # pause between requests

    # nothing is written for a tweet whose first page is empty
    if response['meta']['result_count'] > 0:
        comment_pages = [make_df(response)]

        # follow the pagination until the response carries no further 'next_token'
        while 'next_token' in response['meta']:
            params['next_token'] = response['meta']['next_token']
            response = make_request(headers, params, url)
            time.sleep(4) # pause between requests

            # a page may be empty; only non-empty pages are turned into frames
            if response['meta']['result_count'] > 0:
                comment_pages.append(make_df(response))

        # concatenate once per tweet (DataFrame.append was removed in pandas 2.0)
        response_df = pd.concat(comment_pages)
        response_df.to_csv(f'../data/twitter/comment_collection/{tweet_id}_comments.csv', index=False)