In [1]:
import pandas as pd
import numpy as np
import os
from requests_oauthlib import OAuth1Session
import json
import constants

In [2]:
# Twitter API credentials, read from the local `constants` module imported above.
# NOTE: You must provide your own constants.py defining both API_KEY_SEC and API_KEY.
API_KEY_SEC, API_KEY = constants.API_KEY_SEC, constants.API_KEY

# Birdwatch Data Loader
This notebook contains the process of getting all US, English-language tweets from the Birdwatch program that were marked as misinformation. The data was obtained from the Birdwatch notes data on Twitter's website, combined with the Twitter API's tweet endpoint.

In [3]:
# Load the Birdwatch notes export; the file is tab-separated.
birdwatch = pd.read_csv("./Data/notes-00000.tsv", delimiter="\t")

In [4]:
# Keep only the notes classified as misinformed / potentially misleading,
# and only the three columns used later in the notebook.
bool_idx = birdwatch["classification"].eq("MISINFORMED_OR_POTENTIALLY_MISLEADING")
kept_columns = ["tweetId", "classification", "noteId"]
birdwatch = birdwatch.loc[bool_idx, kept_columns]
birdwatch

Unnamed: 0,tweetId,classification,noteId
0,1377030478167937024,MISINFORMED_OR_POTENTIALLY_MISLEADING,1537142913737428992
2,1537080831751102467,MISINFORMED_OR_POTENTIALLY_MISLEADING,1537147343715282945
3,1537196168953974784,MISINFORMED_OR_POTENTIALLY_MISLEADING,1537204430730211328
4,1540087463099736065,MISINFORMED_OR_POTENTIALLY_MISLEADING,1540422295029551104
5,1535062308426510337,MISINFORMED_OR_POTENTIALLY_MISLEADING,1535128588818653184
...,...,...,...
34245,1529939852028002329,MISINFORMED_OR_POTENTIALLY_MISLEADING,1530025194546413589
34246,1529939819878662177,MISINFORMED_OR_POTENTIALLY_MISLEADING,1530026164777009175
34247,1530481054015946754,MISINFORMED_OR_POTENTIALLY_MISLEADING,1530645389988319239
34248,1530480723953651712,MISINFORMED_OR_POTENTIALLY_MISLEADING,1530646245865291777


In [5]:
# Collect the tweet ID of every remaining (misinformed) note; these IDs are
# what we later send to the API to obtain the tweet text.
sample = birdwatch.loc[:, "tweetId"]

The Twitter API only allows looking up 100 tweets at a time, passed as a single comma-separated string. Because of this, the following loop turns our sample into a list of strings, each containing up to 100 comma-separated IDs.

In [6]:
# Build one comma-separated ID string per batch of (at most) 100 tweets.
# The batch count is rows // 100 + 1, so the final entry is an empty string
# whenever the number of rows is an exact multiple of 100.
batch_count = birdwatch.shape[0] // 100 + 1
search_arr = []
for batch_no in range(batch_count):
    start = batch_no * 100
    id_batch = sample[start:start + 100]
    search_arr.append(",".join(str(tweet_id) for tweet_id in id_batch))

This code is Twitter's sample authentication code for Python, so it can largely be ignored.

**Source: https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/main/Tweet-Lookup/get_tweets_with_user_context.py**

In [7]:
# PIN-based OAuth 1.0a flow: request token -> user authorization -> access token.
# The consumer credentials are the API key/secret loaded from constants.py above.
consumer_key = API_KEY
consumer_secret = API_KEY_SEC

# Step 1: obtain a temporary request token
request_token_url = "https://api.twitter.com/oauth/request_token"
oauth = OAuth1Session(consumer_key, client_secret=consumer_secret)

try:
    fetch_response = oauth.fetch_request_token(request_token_url)
except ValueError:
    print(
        "There may have been an issue with the consumer_key or consumer_secret you entered."
    )
    # Re-raise: without a request token nothing below can work, and carrying on
    # would only fail on the next line with a NameError for `fetch_response`.
    raise

resource_owner_key = fetch_response.get("oauth_token")
resource_owner_secret = fetch_response.get("oauth_token_secret")
print("Got OAuth token: %s" % resource_owner_key)

# Step 2: get authorization (the user opens the URL and pastes back the PIN)
base_authorization_url = "https://api.twitter.com/oauth/authorize"
authorization_url = oauth.authorization_url(base_authorization_url)
print("Please go here and authorize: %s" % authorization_url)
verifier = input("Paste the PIN here: ")

# Step 3: exchange the request token + PIN for the access token
access_token_url = "https://api.twitter.com/oauth/access_token"
oauth = OAuth1Session(
    consumer_key,
    client_secret=consumer_secret,
    resource_owner_key=resource_owner_key,
    resource_owner_secret=resource_owner_secret,
    verifier=verifier,
)
oauth_tokens = oauth.fetch_access_token(access_token_url)


access_token = oauth_tokens["oauth_token"]
access_token_secret = oauth_tokens["oauth_token_secret"]

# Final session, signed with the access token; used for the API requests below
oauth = OAuth1Session(
    consumer_key,
    client_secret=consumer_secret,
    resource_owner_key=access_token,
    resource_owner_secret=access_token_secret,
)

Got OAuth token: Yn1TmwAAAAABcJlTAAABgd9OGzU
Please go here and authorize: https://api.twitter.com/oauth/authorize?oauth_token=Yn1TmwAAAAABcJlTAAABgd9OGzU
Paste the PIN here: 0540094


This code calls the API once for each of the 100-tweet ID strings in our list. Each response is turned into a dataframe, giving us an array of dataframes that we can concatenate into one large dataframe. Note that not all tweets can be obtained, likely because the access rights on the tweet itself have changed.

We are also using Twitter's expansions option to obtain location data on tweets in order to make sure they are in the US.

In [8]:
# Look up every batch of tweet IDs and collect:
#   API_Tweets    - one DataFrame per batch that contains at least one geo-tagged tweet
#   API_Locations - the place objects returned through the geo.place_id expansion
API_Tweets = []
API_Locations = []

# Columns kept from each response. reindex() is used instead of [[...]] so that a
# field missing from an entire batch becomes a NaN column rather than a KeyError.
tweet_columns = ["id", "entities", "author_id", "text", "created_at", "lang", "geo"]

for id_batch in search_arr:
    # The batching step emits an empty trailing string when the row count is an
    # exact multiple of 100; there is nothing to look up for it.
    if not id_batch:
        continue
    params = {"ids": id_batch,"expansions": "geo.place_id","tweet.fields": "created_at,author_id,entities,lang","place.fields":"country"}
    response = oauth.get(
        "https://api.twitter.com/2/tweets", params=params
    )
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(response.status_code, response.text)
        )
    json_response = response.json()
    # Guard against a response without a "data" key (e.g. when none of the
    # batch's tweets could be returned).
    tweets = json_response.get("data", [])
    # Only keep batches in which at least one tweet carries geo information
    if any(tweet.get("geo") for tweet in tweets):
        API_Tweets.append(pd.DataFrame(tweets).reindex(columns=tweet_columns))
    # "includes" / "places" may be absent when no tweet in the batch has a place
    API_Locations += (json_response.get("includes") or {}).get("places", [])

In [9]:
# Build a lookup from place id to country name out of the returned place objects.
tweet_locations = pd.DataFrame(API_Locations).loc[:, ["country", "id"]]
place_id_filter = dict(zip(tweet_locations["id"], tweet_locations["country"]))

{'f9c0877820b7848a': 'United States',
 '1927193c57f35d51': 'United States',
 '01fbe706f872cb32': 'United States',
 '011add077f4d2da3': 'United States',
 '319ee7b36c9149da': 'United States',
 'dce44ec49eb788f5': 'United States',
 '5e02a0f0d91c76d2': 'Turkey',
 '084d0d0155787e9d': 'Ukraine',
 '09d38e8e348e39ba': 'United States',
 'd6819fe60643ebc1': 'United States',
 '01a9a39529b27f36': 'United States',
 'e4a0d228eb6be76b': 'United States',
 '7325f3442fd87621': 'United States',
 '2d83c71ce16cd187': 'United States',
 '24181cf5031a5ef6': 'United States',
 '5a110d312052166f': 'United States',
 'e5797011cad97adf': 'United States',
 'e564d30dc173d2a8': 'South Africa',
 '5b02fa2a078d954a': 'Canada',
 '480d6a1669a2f246': 'United States',
 '17a83a55221b0671': 'United States',
 '18810aa5b43e76c7': 'United States',
 '04cb31bae3b3af93': 'United States',
 'c3a6437e1b1a726d': 'Switzerland',
 '00c39537733fa112': 'United States',
 '317fcc4b21a604d5': 'India',
 '3b77caf94bfc81fe': 'United States',
 'e00

In [10]:
# Concat all of the returned tweet dataframes and keep only geo-tagged tweets
tweet_text = pd.concat(API_Tweets)
tweet_text = tweet_text.dropna(subset=["geo"])

# IDs need to be numeric for merging purposes. Use an explicit 64-bit integer:
# plain `int` can resolve to 32 bits on some platforms, which is too small for
# tweet/author IDs.
tweet_text["author_id"] = tweet_text["author_id"].astype("int64")
tweet_text["id"] = tweet_text["id"].astype("int64")

# Get the hashtags from the entities dictionaries and convert them into a list
# of hashtag strings (rows without an entities dict get an empty list)
tweet_text["hashtags"] = tweet_text["entities"].apply(
    lambda ent: [tag["tag"] for tag in ent.get("hashtags", [])] if isinstance(ent, dict) else []
)

# Pull the place id straight out of each geo dictionary. Reading the key
# explicitly (rather than using whatever keys the first row happens to have)
# keeps this working when a geo dict has no place_id.
place_ids = tweet_text["geo"].apply(
    lambda geo: geo.get("place_id") if isinstance(geo, dict) else None
)

# Translate place ids to countries; ids with no known place become NaN
# instead of raising a KeyError
tweet_text["country"] = place_ids.map(place_id_filter)

# Convert the created_at value to datetime
tweet_text["created_at"] = pd.to_datetime(tweet_text["created_at"])

# Drop the columns with dictionaries, then de-duplicate tweets and renumber rows
tweet_text = tweet_text.drop(columns=["entities", "geo"])
tweet_text = tweet_text.drop_duplicates(subset="id").reset_index(drop=True)

tweet_text.head()

Unnamed: 0,id,author_id,text,created_at,lang,hashtags,country
0,1521300212412678144,2219695045,I haven’t seen Democrats this mad since the pa...,2022-05-03 01:26:49+00:00,en,"[RoeOverturned, SCOTUS]",United States
1,1521842639464648708,2234760798,Stephen Colbert: “If these folks believe Roe w...,2022-05-04 13:22:14+00:00,en,[],United States
2,1539244666385727489,9864482,"@AnupamChander Anupam, the fact that you’re so...",2022-06-21 13:51:40+00:00,en,[],United States
3,1356597689480335361,21334179,To everyone asking why this matters: If Psaki ...,2021-02-02 13:37:48+00:00,en,[],United States
4,1448979628882804746,59482886,And the big question? Who is this Chinese ma...,2021-10-15 11:50:38+00:00,en,[],United States


In [11]:
# Attach the tweet details to each Birdwatch note, then keep only the rows whose
# tweet is in English and located in the United States.
birdwatch_tweets = pd.merge(birdwatch, tweet_text, left_on="tweetId", right_on="id")
is_english = birdwatch_tweets["lang"] == "en"
is_in_us = birdwatch_tweets["country"] == "United States"
birdwatch_tweets = birdwatch_tweets[is_english & is_in_us]
birdwatch_tweets.head()

Unnamed: 0,tweetId,classification,noteId,id,author_id,text,created_at,lang,hashtags,country
0,1521300212412678144,MISINFORMED_OR_POTENTIALLY_MISLEADING,1521630401508220928,1521300212412678144,2219695045,I haven’t seen Democrats this mad since the pa...,2022-05-03 01:26:49+00:00,en,"[RoeOverturned, SCOTUS]",United States
1,1521842639464648708,MISINFORMED_OR_POTENTIALLY_MISLEADING,1521946220050018305,1521842639464648708,2234760798,Stephen Colbert: “If these folks believe Roe w...,2022-05-04 13:22:14+00:00,en,[],United States
2,1521842639464648708,MISINFORMED_OR_POTENTIALLY_MISLEADING,1521943616905695235,1521842639464648708,2234760798,Stephen Colbert: “If these folks believe Roe w...,2022-05-04 13:22:14+00:00,en,[],United States
3,1521842639464648708,MISINFORMED_OR_POTENTIALLY_MISLEADING,1521912043477078019,1521842639464648708,2234760798,Stephen Colbert: “If these folks believe Roe w...,2022-05-04 13:22:14+00:00,en,[],United States
4,1539244666385727489,MISINFORMED_OR_POTENTIALLY_MISLEADING,1539458360386506752,1539244666385727489,9864482,"@AnupamChander Anupam, the fact that you’re so...",2022-06-21 13:51:40+00:00,en,[],United States


In [18]:
# Narrow the merged frame down to the columns used for the export.
export_columns = ["noteId", "id", "text", "hashtags", "classification"]
csv_file = birdwatch_tweets[export_columns]
csv_file.head()

Unnamed: 0,noteId,id,text,hashtags,classification
0,1521630401508220928,1521300212412678144,I haven’t seen Democrats this mad since the pa...,"[RoeOverturned, SCOTUS]",MISINFORMED_OR_POTENTIALLY_MISLEADING
1,1521946220050018305,1521842639464648708,Stephen Colbert: “If these folks believe Roe w...,[],MISINFORMED_OR_POTENTIALLY_MISLEADING
2,1521943616905695235,1521842639464648708,Stephen Colbert: “If these folks believe Roe w...,[],MISINFORMED_OR_POTENTIALLY_MISLEADING
3,1521912043477078019,1521842639464648708,Stephen Colbert: “If these folks believe Roe w...,[],MISINFORMED_OR_POTENTIALLY_MISLEADING
4,1539458360386506752,1539244666385727489,"@AnupamChander Anupam, the fact that you’re so...",[],MISINFORMED_OR_POTENTIALLY_MISLEADING


In [23]:
# Save one row per distinct (id, text) pair; the DataFrame index is not written.
unique_tweets = csv_file[["id", "text"]].drop_duplicates()
unique_tweets.to_csv("./Data/unique_tweets.csv", index=False)

In [27]:
# Read the exported CSV back in and display it.
pd.read_csv(filepath_or_buffer="./Data/unique_tweets.csv")

Unnamed: 0,id,text
0,1521300212412678144,I haven’t seen Democrats this mad since the pa...
1,1521842639464648708,Stephen Colbert: “If these folks believe Roe w...
2,1539244666385727489,"@AnupamChander Anupam, the fact that you’re so..."
3,1356597689480335361,To everyone asking why this matters: If Psaki ...
4,1448979628882804746,And the big question? Who is this Chinese ma...
...,...,...
198,1475562179256471552,If you’re the President of the United States o...
199,1529427654839046144,I have news for the embarrassment that claims ...
200,1532710109201190912,More from Oak Park River Forest HS and its rac...
201,1539673370790436866,@greg_price11 Don’t blame lol companies. Biden...
