# v2 API

In [None]:
pip install twarc

In [None]:
pip install pandas

In [None]:
pip install matplotlib

In [None]:
from twarc import Twarc2
from datetime import datetime, timezone
import pandas as pd
import matplotlib.pyplot as plt

## Authentication

In [None]:
# NOTE: these are my personal credentials. For this code to work on your
# computer, you will need a file named "API_credentials_V2.txt" in the same
# directory as this script, with one KEY=VALUE pair per line:
# access_token=VVV
# access_token_secret=WWW
# consumer_key=XXX
# consumer_secret=YYY
# bearer_token=ZZZ

# For the following it's sufficient if you have just a bearer_token: keys that
# are missing from the file are set to None instead of raising a KeyError.

credentials = {}
with open('API_credentials_V2.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # split on the first "=" only, because a token may itself contain "="
        key, separator, value = line.strip().partition('=')
        # skip blank lines and lines that are not KEY=VALUE pairs
        if not separator:
            continue
        credentials[key.strip()] = value.strip()

access_token = credentials.get('access_token')
access_token_secret = credentials.get('access_token_secret')
consumer_key = credentials.get('consumer_key')
consumer_secret = credentials.get('consumer_secret')
bearer_token = credentials.get('bearer_token')

In [None]:
# alternatively, you can also paste the info from the app you just created here
# see https://developer.twitter.com/en/portal/projects-and-apps
# NOTE: this cell rebinds the same five variables that the previous cell reads
# from the credentials file, so run only one of the two cells. Do not commit
# real credentials pasted here to version control.
access_token = 'VVV'
access_token_secret = 'WWW'
consumer_key = 'XXX'
consumer_secret = 'YYY'
bearer_token = 'ZZZ'

In [None]:
# user authentication: managing tweets
# The client is built from the consumer key/secret and the access token/secret.
# NOTE: the app-authentication cell below assigns to the same name
# `twarc_client`, so whichever of the two cells runs last decides which client
# the rest of the notebook uses.
twarc_client = Twarc2(
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
    access_token=access_token,
    access_token_secret=access_token_secret
)

In [None]:
# app authentication: getting data
# Only the bearer token is needed here. This rebinds `twarc_client`, replacing
# a client created with user authentication if that cell was run before.
twarc_client = Twarc2(bearer_token=bearer_token)

Different API [rate limits](https://developer.twitter.com/en/docs/twitter-api/rate-limits) apply to user and app authentication.

## Search

In [None]:
# hashtag that every tweet we are looking for has to contain
search_string = '#greatresignation'

# time window of the search: one day, given in UTC
start = datetime(2022, 3, 4, tzinfo=timezone.utc)
end = datetime(2022, 3, 5, tzinfo=timezone.utc)

# query Twitter for tweets matching the search string inside the time window
# and keep everything the search yields, in order, in the list `tweets`
tweets = []
tweets.extend(
    twarc_client.search_all(search_string, start_time=start, end_time=end)
)

In [None]:
# pagination
tweets

In [None]:
# tweets are returned in batches
len(tweets[0]["data"])

In [None]:
# peek into the data
tweets[0]['data'][1]['text']

In [None]:
# number of returned batches
len(tweets)

[Documentation](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) on how to build more sophisticated queries. 

## Users

In [None]:
# accounts whose profiles we want to look up
user_names = ['liveresignation']

# usernames=True presumably tells twarc that the entries are user names
# rather than numeric user IDs - see the twarc documentation
users = []
users.extend(twarc_client.user_lookup(user_names, usernames=True))

In [None]:
users[0]['data'][0].keys()

In [None]:
users[0]['data'][0]['description']

In [None]:
users[0]['data'][0]['public_metrics']['followers_count']

## Data fields

In [None]:
# The API returns JSON objects which are parsed as dictionaries in Python.
# Dictionaries contain pairs of (key, value), where "key" is the name of a 
# "data field", such as "id" for the Tweet ID, and "value" contains the value
# of the specific data field
tweets[0]["data"][0].keys()

In [None]:
tweets[0]["data"][0]["id"]

In [None]:
tweets[0]["data"][0]["text"]

In [None]:
# Different API endpoints return different JSON objects, depending on whether
# they return Tweet or User objects
users[0]["data"][0].keys()

In [None]:
users[0]["data"][0]["id"]

In [None]:
users[0]["data"][0]["username"]

## Counts

In [None]:
# query and time window for the daily counts; the bounds are timezone-aware
# UTC datetimes, like in the other cells of this notebook
search_string = '#greatresignation'
start = datetime(2021, 1, 1, tzinfo=timezone.utc)
end = datetime(2022, 3, 5, tzinfo=timezone.utc)

# every page returned by the counts endpoint carries a list of records under
# "data"; flatten the records of all pages into one list
day_count = []
for c in twarc_client.counts_all(
        search_string,
        start_time=start,
        end_time=end,
        granularity='day'):
    day_count.extend(c['data'])

In [None]:
day_count[0:3]

In [None]:
# Build the frame in one step from the list of per-day dictionaries.
# DataFrame.append() was removed in pandas 2.0, and appending row by row
# copied the whole frame on every iteration.
counts = pd.DataFrame(day_count)

counts.head(3)

In [None]:
# order the rows by their "start" value, then turn both timestamp columns
# into proper datetime values
counts = counts.sort_values(by='start')
for column in ('start', 'end'):
    counts[column] = pd.to_datetime(counts[column])

counts.head(3)

In [None]:
# line plot of the tweet count per day over the whole time window
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(counts['start'], counts['tweet_count'])

# fixed x-axis tick positions; `ticks` is reused by the comparison plot below
ticks = ['2021-01-01', '2021-04-01', '2021-07-01',
         '2021-10-01', '2022-01-01']
ax.set_xticks([pd.to_datetime(tick) for tick in ticks])
ax.set_ylabel('tweet count', fontsize=16)
# the trailing semicolon keeps Jupyter from echoing the return value
ax.set_title(f'tweets containing {search_string}', fontsize=20);

In [None]:
def get_counts(client, search_string, start, end):
    '''
    Gets the daily tweet counts between a start time and an end
    time given a search string, using Twarc to access the Twitter
    v2 API.

    Parameters
    ----------
    client :
        Object exposing ``counts_all(query, start_time=..., end_time=...,
        granularity=...)``, e.g. a ``Twarc2`` instance.
    search_string : str
        Query whose matching tweets are counted.
    start, end : datetime
        Bounds of the time window; passed on to ``counts_all`` unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by the API, with the "start" and "end"
        columns parsed to datetimes, sorted by "start" and indexed from 0.
        If the API returns no records, an empty frame with the columns
        "start", "end" and "tweet_count" is returned.
    '''
    print(f"fetching counts for {search_string}")

    # Gather the records of all pages first and build the frame once:
    # DataFrame.append() was removed in pandas 2.0 and copied the whole
    # frame on every call.
    records = []
    for page in client.counts_all(
            search_string,
            start_time=start,
            end_time=end,
            granularity='day'):
        records.extend(page['data'])

    if not records:
        # nothing was returned: hand back an empty frame that still has the
        # columns the plotting code reads, instead of failing on "start"
        return pd.DataFrame(columns=['start', 'end', 'tweet_count'])

    counts = pd.DataFrame(records)
    counts['start'] = pd.to_datetime(counts['start'])
    counts['end'] = pd.to_datetime(counts['end'])

    return counts.sort_values(by='start').reset_index(drop=True)

In [None]:
# common time window (UTC) for all three hashtags
start = datetime(2021, 1, 1, tzinfo=timezone.utc)
end = datetime(2022, 3, 1, tzinfo=timezone.utc)

# fetch the daily counts for each hashtag, one request series after the other
counts_work, counts_mentalhealth, counts_greatresignation = (
    get_counts(twarc_client, hashtag, start, end)
    for hashtag in ('#work', '#mentalhealth', '#greatresignation')
)

In [None]:
# compare the daily counts of the three hashtags in a single plot
fig, ax = plt.subplots(figsize=(10, 4))

for df, label in zip([counts_work, counts_mentalhealth, counts_greatresignation],
                     ['#work', '#mentalhealth', '#greatresignation']):
    ax.plot(df['start'], df['tweet_count'], label=label)

# `ticks` is the list of tick dates defined in the first plot cell above
ax.set_xticks([pd.to_datetime(tick) for tick in ticks])
ax.set_ylabel('daily tweet count', fontsize=16)
# optional: uncomment the next line for a logarithmic y-axis
#ax.set_yscale('log')
# the trailing semicolon keeps Jupyter from echoing the returned Legend object
ax.legend(fontsize=16);

## Timeline

In [None]:
# time window (UTC) for which we want the account's tweets
start = datetime(2022, 3, 1, tzinfo=timezone.utc)
end = datetime(2022, 3, 5, tzinfo=timezone.utc)

# the "timeline" endpoint (rather than the "search" endpoint) retrieves the
# Tweets of one given user, who is identified by their user name
tweets = []
tweets.extend(
    twarc_client.timeline('liveresignation', start_time=start, end_time=end)
)

In [None]:
tweets[0]['data'][0]['text']

## Followers

In [None]:
# the "followers" endpoint returns the profiles of the accounts that follow
# a given account; keep everything it yields in the list `followers`
followers = []
followers.extend(twarc_client.followers('liveresignation'))

In [None]:
followers[0]["data"][0]["username"]

## Filtered stream

In [None]:
# with the filtered stream endpoint we can fetch tweets matching given search
# rules as they are created in real time
twarc_client.add_stream_rules([{"value":"work"}])

In [None]:
twarc_client.get_stream_rules()

In [None]:
# delete_stream_rule_ids() needs the IDs of the rules to delete, so look up
# the currently registered rules first and remove all of them
active_rules = twarc_client.get_stream_rules().get('data', [])
rule_ids = [rule['id'] for rule in active_rules]
if rule_ids:
    twarc_client.delete_stream_rule_ids(rule_ids)

In [None]:
tweets = []

# read tweets from the stream as they arrive and stop once 10 of them have
# been received (no outer loop is needed: leaving the for loop ends the cell)
for tweet in twarc_client.stream():
    # append new tweets to the list of tweets
    tweets.append(tweet)

    if len(tweets) >= 10:
        break

In [None]:
tweets[0]["data"]["text"]

## API limitations

**V2 API & developer access**
* Only tweets from the last 7 days are accessible -> look into streaming tweets if you want more.
* 500k tweets / month

**V2 API & academic access**
* Full archival search
* 10 million tweets / month

**Rate limits**
* Usually not a problem when downloading tweets with `search`, `timeline`, `count` etc.
* Can be a huge problem when scraping follower networks (15 requests per 15 minutes with 100 followers each).

**Quota**
* 500k tweets is very little
* 10 million tweets is also not that much -> helping each other out with API quota

## Further reading

* Twarc [documentation](https://twarc-project.readthedocs.io/en/latest/api/client/#twarc.client)
* Twitter API [code samples](https://github.com/twitterdev/Twitter-API-v2-sample-code)
* Access level [documentation](https://developer.twitter.com/en/docs/twitter-api/getting-started/about-twitter-api)

Twarc alternatives
* [tweepy](https://www.tweepy.org/) (Python)
* [twitteR](https://www.rdocumentation.org/packages/twitteR/versions/1.1.9) (R)
* Using twarc from the command line [https://twarc-project.readthedocs.io/en/latest/](https://twarc-project.readthedocs.io/en/latest/).