In [88]:
import pandas as pd
import re
import ast

In [89]:
# Outlet lookup table: one row per outlet with its user name and user id.
user_data = pd.read_csv('../data/twitter/user_ids.csv')

# Pull both columns out as plain Python lists in a single pass.
twitter_handles, user_ids = (
    user_data[column].to_list() for column in ('username', 'author_id')
)

#### all matched tweets: all tweets found that refer to a rated article

In [90]:
# Read the per-outlet CSVs of matched tweets and stack them into one DataFrame.
# The columns listed in `dtype` are read as str; 'created_at' is parsed as a date.
dtype={'text': str, 'author_id': str, 'conversation_id': str, 'id': str, 'entities': str, 'attachments': str, 'referenced_tweets': str, 'withheld': str}
parse_dates=['created_at']

# Collect the individual frames and concatenate once at the end.
# DataFrame.append re-copied the accumulated frame on every iteration and was
# removed in pandas 2.0, so the loop no longer relies on it.
outlet_frames = []
for handle in twitter_handles:
    try:
        outlet_frames.append(pd.read_csv(f'../data/twitter/article_tweets/{handle}.csv', dtype=dtype, parse_dates=parse_dates))

    except FileNotFoundError:
        # No CSV exists for this outlet; report it and carry on.
        print(f'no articles found for {handle}')

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
if outlet_frames:
    matched_tweets_all = pd.concat(outlet_frames, ignore_index=True)
else:
    matched_tweets_all = pd.DataFrame(columns=[])

no articles found for BoingBoing
no articles found for comicsandsdaily
no articles found for EveningTimesCC
no articles found for NewYorkSun


In [91]:
# save to csv: 
#matched_tweets_all.to_csv('data/matched_tweets_all.csv', index=None, header=True)

#### all relevant tweets: tweets that have at least 1 comment

In [92]:
# Load the file of relevant tweets (tweets with at least one comment).
# Every column named here is read as str; 'created_at' is parsed as a date.
dtype = dict.fromkeys(
    ['text', 'author_id', 'conversation_id', 'id', 'entities', 'attachments', 'referenced_tweets', 'withheld'],
    str,
)
parse_dates = ['created_at']

relevant_tweets_all = pd.read_csv(
    '../data/twitter/relevant_tweets.csv',
    dtype=dtype,
    parse_dates=parse_dates,
)

# The ids are used to locate the per-tweet comment files.
relevant_tweets_ids = relevant_tweets_all['id'].to_list()
print(f'number of tweets that have at least one comment: {len(relevant_tweets_ids)}')

number of tweets that have at least one comment: 3957


#### all comments: all comments to all relevant tweets

In [93]:
# Read the comment file of every relevant tweet and collapse them into one DataFrame.
# The columns listed in `dtype` are read as str; 'created_at' is parsed as a date.
dtype={'text': str, 'author_id': str, 'conversation_id': str, 'id': str, 'entities': str, 'attachments': str, 'referenced_tweets': str, 'withheld': str}
parse_dates=['created_at']

# Collect the individual frames and concatenate once at the end
# (DataFrame.append was removed in pandas 2.0 and copied the frame on every iteration).
comment_frames = []
for tweet_id in relevant_tweets_ids:
    try:
        # The path previously read '..data/...' (missing '/'), so no file could ever be
        # found and the FileNotFoundError handler silently skipped every tweet.
        comment_frames.append(pd.read_csv(f'../data/twitter/comment_collection/{tweet_id}_comments.csv', dtype=dtype, parse_dates=parse_dates))

    except FileNotFoundError:
        # No comment file for this tweet id; skip it.
        continue

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
if comment_frames:
    comments_all = pd.concat(comment_frames, ignore_index=True)
else:
    comments_all = pd.DataFrame(columns=[])

In [94]:
# save to csv: 
#comments_all.to_csv('../data/twitter/comments_all.csv', index=None, header=True)

#### all retweets: all retweets to the relevant tweets:

In [95]:
# Read the retweet file of every matched tweet and collapse them into one DataFrame.
# Rows of matched_tweets_all without an 'id' have no matched tweet, so there is
# no retweet file to look up for them.
matched_tweets = matched_tweets_all.dropna(subset=['id'])
matched_tweets_ids = matched_tweets['id'].tolist()

# The columns listed in `dtype` are read as str; 'created_at' is parsed as a date.
dtype={'text': str, 'author_id': str, 'conversation_id': str, 'id': str, 'referenced_tweets': str}
parse_dates=['created_at']

# Collect the individual frames and concatenate once at the end
# (DataFrame.append was removed in pandas 2.0 and copied the frame on every iteration).
retweet_frames = []
for tweet_id in matched_tweets_ids:
    try:
        retweet_frames.append(pd.read_csv(f'../data/twitter/retweet_collection/{tweet_id}_retweet.csv', dtype=dtype, parse_dates=parse_dates))

    except FileNotFoundError:
        # No retweet file for this tweet id; skip it.
        continue

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
if retweet_frames:
    retweets_all = pd.concat(retweet_frames, ignore_index=True)
else:
    retweets_all = pd.DataFrame(columns=[])

In [96]:
# keep only entries where type = quoted; if type = retweet it is a retweet of the quoted retweet; I'm not interested in simple retweets
def replace_brackets(text):
    """Return ``text`` without a leading '[' and without a trailing ']'.

    Brackets anywhere else in the string are left untouched.
    """
    return re.sub(r'^\[|\]$', '', text)

def split_string(text):
    """Split ``text`` at every ', ' that directly follows a closing brace.

    Returns a list of substrings; the braces stay with their substring and the
    ', ' separators are removed. A comma-whitespace pair that does not follow
    '}' (for example inside a dict literal) is not a split point.
    """
    # Raw string: '\}' and '\s' are regex escapes, not valid Python string
    # escapes, and in a normal string literal they trigger an invalid-escape warning.
    return re.split(r"(?<=\}),\s", text)

In [97]:
# Rows without a 'referenced_tweets' value can't be assigned to an 'original' tweet, so drop them.
quoted_retweets_all = retweets_all.dropna(subset=['referenced_tweets']).reset_index(drop=True)

# Normalise the column in one chain:
#   1. cast to str,
#   2. strip the square brackets at the beginning and end,
#   3. split the string into a list of 1 or 2 entries.
quoted_retweets_all['referenced_tweets'] = (
    quoted_retweets_all['referenced_tweets']
    .astype(str)
    .apply(replace_brackets)
    .apply(split_string)
)

In [98]:
# Pull the 'referenced_tweets' column out as a list (each element is itself a list of strings).
referenced_tweets = quoted_retweets_all['referenced_tweets'].tolist()

# For every row take the first entry (it carries the type: quoted or retweet)
# and evaluate that string literal into a Python object (a dictionary).
types = [ast.literal_eval(entries[0]) for entries in referenced_tweets]

In [99]:
# Turn the list of parsed entries into a DataFrame and rename its 'id' column
# to 'tweet_id' in the same expression.
types_df = pd.DataFrame(types).rename(columns={'id': 'tweet_id'})

In [100]:
# Attach the columns of types_df to the retweets, aligned on the row index
# (left join, which is DataFrame.join's default).
quoted_retweets_all = quoted_retweets_all.join(types_df, how='left')

In [101]:
# Keep only the rows whose 'type' is 'quoted'.
# The mask must be built from quoted_retweets_all itself; the previous code
# indexed an undefined name `quoted`, which raises NameError.
quoted_retweets_all = quoted_retweets_all[quoted_retweets_all['type'] == 'quoted']

# save to csv: 
#quoted_retweets_all.to_csv('../data/twitter/quoted_retweets_all.csv', header=True, index=None)

### Combine all collected tweets into one DF:

#### 1. matched_tweets_all: contains all rated articles and (if exists) the corresponding tweet(s)
#### 2. comments_all: contains all the comments belonging to the matched tweets
#### 3. quoted_retweets_all: contains all the quoted retweets belonging to the matched tweets

In [102]:
# Prepare matched_tweets_all for the join: remove the columns listed below.
matched_drop_columns = [
    'text', 'title_manipulated', 'article_urls_manipulated', 'date',
    'expanded_urls', 'expanded_urls_manipulated', 'unwound_urls', 'unwound_urls_manipulated',
    'entities', 'created_at', 'conversation_id', 'text_manipulated',
    'author_id', 'referenced_tweets', 'attachments', 'withheld',
]
matched_tweets_all = matched_tweets_all.drop(columns=matched_drop_columns)

# Display the trimmed frame as the cell output.
matched_tweets_all

Unnamed: 0,title,bias_score,reliability_score,article_url,adfontes_url,outlet,twitter_handle,id
0,"COVID Delta variant puts men, people of color ...",-1.00,48.00,https://19thnews.org/2021/07/the-covid-delta-v...,https://adfontesmedia.com/19th-news-bias-and-r...,19th News,19thnews,1411877241647206401
1,"Two states have killed the ‘tampon tax,' but a...",-9.33,46.67,https://19thnews.org/2021/07/two-states-have-k...,https://adfontesmedia.com/19th-news-bias-and-r...,19th News,19thnews,1410728425459896324
2,Women in the Biden White House earn 99 cents f...,-11.00,46.67,https://19thnews.org/2021/07/women-in-the-bide...,https://adfontesmedia.com/19th-news-bias-and-r...,19th News,19thnews,1410951027017256965
3,Women in the Biden White House earn 99 cents f...,-11.00,46.67,https://19thnews.org/2021/07/women-in-the-bide...,https://adfontesmedia.com/19th-news-bias-and-r...,19th News,19thnews,1410713592425598980
4,Kagan warns Supreme Court has weakened voting ...,-12.00,45.67,https://19thnews.org/2021/07/kagan-dissent-brn...,https://adfontesmedia.com/19th-news-bias-and-r...,19th News,19thnews,1410680098039074817
...,...,...,...,...,...,...,...,...
9856,The Cowboy State Is Hurting As Low Oil Prices ...,6.00,39.00,https://www.zerohedge.com/energy/cowboy-state-...,https://adfontesmedia.com/zerohedge-bias-and-r...,ZeroHedge,zerohedge,1272588789756633098
9857,Morgan Stanley Turns Even More Bullish: Hikes ...,8.33,41.00,https://www.zerohedge.com/markets/morgan-stanl...,https://adfontesmedia.com/zerohedge-bias-and-r...,ZeroHedge,zerohedge,1272593722992140290
9858,The Pandemic Moonshot: Printing Money Until Th...,12.50,36.00,https://www.zerohedge.com/markets/pandemic-moo...,https://adfontesmedia.com/zerohedge-bias-and-r...,ZeroHedge,zerohedge,1272598504523755521
9859,"Don't Get Distracted by Dividends, Focus on Value",0.67,37.00,https://www.zerohedge.com/news/2021-04-30/dont...,https://adfontesmedia.com/zerohedge-bias-and-r...,ZeroHedge,zerohedge,


In [103]:
# Prepare comments_all for the join: remove the columns listed below.
# (A stray `comments_all.dtypes` expression used to sit here; it was neither
# displayed nor assigned, so it has been removed.)
drop = ['author_id', 'in_reply_to_user_id', 'created_at', 'referenced_tweets', 'withheld']
comments_all = comments_all.drop(columns=drop)

# Show the row count as the cell output (the recorded output of this cell is a count).
len(comments_all)

122743


In [104]:
# Drop fully identical comment rows and renumber the index.
comments_all = comments_all.drop_duplicates(ignore_index=True)

# Show the row count as the cell output (the recorded output of this cell is a count).
len(comments_all)

122740


In [105]:
# Prepare quoted_retweets_all for the join: remove the columns listed below.
drop = ['created_at', 'referenced_tweets', 'author_id', 'conversation_id', 'in_reply_to_user_id', 'withheld', 'type']
quoted_retweets_all = quoted_retweets_all.drop(columns=drop)

# Show the row count as the cell output (the recorded output of this cell is a count).
len(quoted_retweets_all)

53140


In [106]:
# Drop fully identical quoted-retweet rows and renumber the index.
quoted_retweets_all = quoted_retweets_all.drop_duplicates(ignore_index=True)

# Show the row count as the cell output (the recorded output of this cell is a count).
len(quoted_retweets_all)

53138


In [107]:
# Comments: inner-join on the matched tweet's id == the comment's conversation_id.
# After the merge the comment's own id ('id_right') becomes 'id', the
# conversation id becomes 'tweet_id', and the left-hand 'id_left' is dropped.
join_comments = (
    matched_tweets_all
    .merge(comments_all, how='inner', left_on='id', right_on='conversation_id', suffixes=('_left', '_right'))
    .rename(columns={'conversation_id': 'tweet_id', 'id_right': 'id'})
    .drop(columns=['id_left'])
)

# Quoted retweets: inner-join on the matched tweet's id == the retweet's tweet_id,
# then keep the retweet's own id as 'id' and drop the left-hand 'id_left'.
join_retweets = (
    matched_tweets_all
    .merge(quoted_retweets_all, how='inner', left_on='id', right_on='tweet_id', suffixes=('_left', '_right'))
    .rename(columns={'id_right': 'id'})
    .drop(columns=['id_left'])
)

In [108]:
# Final column layout of the combined frame.
column_order = ['id', 'text', 'tweet_id', 'title', 'outlet', 'twitter_handle', 'article_url', 'adfontes_url', 'bias_score', 'reliability_score']

# Stack comments and quoted retweets on top of each other (row-wise).
stacked = pd.concat([join_comments, join_retweets], axis=0)

# Select the columns in the order above, sort by outlet name ignoring case,
# and renumber the index.
all_tweets = (
    stacked[column_order]
    .sort_values(by='outlet', key=lambda outlets: outlets.str.lower())
    .reset_index(drop=True)
)

In [109]:
# Remove fully identical rows, then renumber the index from 0.
all_tweets = all_tweets.drop_duplicates().reset_index(drop=True)

In [111]:
# save to csv:
#all_tweets.to_csv('../data/twitter/all_tweets_final.csv', header=True, index=None)