In [1]:
#import libraries
import pandas as pd
from pathlib import Path

## Importing Data & Making Dataframes
The code below was run on a sample of tweets from May 1st, 2020 for the hour of 00:00:00 (the `SAMPLE-coronavirus-tweet-id-2020-05-01-00.json` file loaded in the next cell). The tweet ids have been retrieved and hydrated from https://github.com/echen102/COVID-19-TweetIDs.

In [3]:
# Point at the sampled tweet JSON for 2020-05-01, hour 00, inside the
# data_samples folder two directories above this notebook.
data_dir = Path('../..') / 'data_samples/json_files/may_sample'
json_file = f'{data_dir}/SAMPLE-coronavirus-tweet-id-2020-05-01-00.json'
# The file is read as JSON Lines (lines=True): one JSON record per line.
df = pd.read_json(json_file, lines=True)
# Keep only the rows whose 'lang' is 'en'. reset_index() renumbers the rows
# from 0 and keeps the previous row labels in a new 'index' column.
en_df = df.loc[df['lang'] == 'en'].reset_index()

## Find how many times a link was tweeted
This can be done for either the display_url or expanded_url. You will receive different results depending on which URL you use. We focused on the expanded_url to narrow down the exact article and claim being tweeted. This prototype works better with a larger dataset - this prototype uses a small sample.

In [4]:
#Count how many tweets link to each external URL, keyed by the expanded URL.
tweet_ext_URLs = {}

#Iterate the 'entities' column directly: this avoids building a full row
#Series for every lookup and does not rely on en_df having a 0..n-1 index.
for entities in en_df['entities']:
    urls = entities['urls']

    #Filter out tweets that don't have a URL
    if not urls:
        continue

    #Only the first URL entity of each tweet is counted.
    expanded_url = urls[0].get('expanded_url')

    #Skip entries without an expanded URL, and links that point back to
    #'https://twitter.com/'. The authors of this prototype treat those links
    #as retweets, which were not looked at here.
    if not expanded_url or expanded_url.startswith('https://twitter.com/'):
        continue

    #Start a new link at 1, or increase the counter of a link already seen.
    tweet_ext_URLs[expanded_url] = tweet_ext_URLs.get(expanded_url, 0) + 1
      

In [5]:
#Build a two-column table (link, number of tweets) from the tally, order it
#from the highest count to the lowest, and show the first 10 rows.
columns = ['urlext', 'count']
tweet_ext_df = (
    pd.DataFrame(tweet_ext_URLs.items(), columns=columns)
    .sort_values(by='count', ascending=False)
)
filtered_df = tweet_ext_df.iloc[:10]
filtered_df

Unnamed: 0,urlext,count
0,https://apnews.com/490aee062b36ab64c76c624f967...,1
1,https://bit.ly/3bTINlI,1
2,https://bit.ly/3d0UMxR,1
3,https://www.zerohedge.com/geopolitical/trump-w...,1
4,https://paper.li/e-1497917245?edition_id=5b66e...,1
5,https://techcrunch.com/2020/04/20/can-employer...,1
6,http://www.borderlandbeat.com/2020/04/wuhan-wa...,1
7,http://dlvr.it/RVnCp0,1
8,https://thehardtimes.net/culture/sesame-street...,1
9,https://inspirationbylorettaexclusive.com/prod...,1


In [6]:
#The table display shortens long links, so print every top link in full,
#one per line, to get the complete clickable URL.
for link in filtered_df['urlext'].tolist():
    print(link)

https://apnews.com/490aee062b36ab64c76c624f9674a89c
https://bit.ly/3bTINlI
https://bit.ly/3d0UMxR
https://www.zerohedge.com/geopolitical/trump-wants-us-troops-immediately-out-afghanistan-due-coronavirus
https://paper.li/e-1497917245?edition_id=5b66e820-8b40-11ea-8c96-0cc47a0d1605
https://techcrunch.com/2020/04/20/can-employers-mandate-covid-19-testing/
http://www.borderlandbeat.com/2020/04/wuhan-was-fentanyl-capital-then.html
http://dlvr.it/RVnCp0
https://thehardtimes.net/culture/sesame-street-airs-special-episode-to-explain-coronavirus-to-the-president/
https://inspirationbylorettaexclusive.com/products/womens-im-more-cropped-t-shirt
