In [1]:
import ast
import collections
import copy
import html
import os
import time
from datetime import datetime, timezone
from time import sleep

import pandas as pd
import requests
from dateutil import parser

In [2]:
class HTTPError(Exception):
    """Signals a non-200 HTTP reply; `request` raises it with the response object as its only argument."""


def make_headers(bearer_token):
    """Return the request headers dict carrying `bearer_token` as a Bearer authorization."""
    return {'Authorization': 'Bearer {}'.format(bearer_token)}


def request(url, headers, params):
    """Issue a GET to `url` (2-second timeout) and return the response.

    Any status other than 200 is printed and then raised as
    ``HTTPError(response)`` so callers can inspect the failed response.
    """
    response = requests.get(url, headers=headers, params=params, timeout=2)
    if response.status_code == 200:
        return response
    # Anything else is a failure: report it, then hand the response to the caller.
    print(f"Request returned an error: {response.status_code} {response.text}")
    raise HTTPError(response)


def get_users(user_names, headers):
    """Look up user records for the given handles via the users-by-username endpoint.

    `user_names` is an iterable of handle strings; `headers` is the
    authorization header dict. Returns the value stored under 'data' in the
    JSON reply (a KeyError propagates if that key is absent).
    """
    wanted_fields = ('name', 'id', 'created_at', 'location', 'public_metrics')
    query = {
        'usernames': ','.join(user_names),
        'user.fields': ','.join(wanted_fields),
    }
    payload = request("https://api.twitter.com/2/users/by", headers, query).json()
    return payload['data']


def flatten(ls):
    """Normalize tweet dicts in place so they load cleanly into a DataFrame.

    For every dict in `ls`:
      * 'text' has its HTML entities unescaped;
      * 'created_at', when it is still a string, is parsed into a datetime;
      * the entries of the nested 'public_metrics' dict are promoted to
        top-level keys and 'public_metrics' itself is removed.

    The dicts are modified in place and the same list object is returned.
    Unlike a plain parse-and-delete, a dict that was already flattened (or
    that never had 'created_at' / 'public_metrics') is tolerated, so
    re-running a cell that calls this no longer raises.

    Parameters
    ----------
    ls : list of dict
        Tweet objects; each must have a 'text' key.

    Returns
    -------
    list of dict
        `ls` itself.
    """
    for d in ls:
        d['text'] = html.unescape(d['text'])
        created_at = d.get('created_at')
        # Only raw strings need parsing; after a first pass the value is
        # already a datetime and must be left alone.
        if isinstance(created_at, str):
            d['created_at'] = parser.parse(created_at)
        # pop() with a default: the key is gone once a dict has been
        # flattened, and absent when the field was never requested.
        d.update(d.pop('public_metrics', {}))
    return ls


def wait_if_exceeded(response, wait=False):
    """Pause until the rate-limit window resets when the quota is nearly spent.

    Sleeps when fewer than 100 requests remain according to the
    'x-rate-limit-remaining' header, or unconditionally when `wait` is true
    (used after an HTTP 429). The sleep lasts until the Unix timestamp in
    'x-rate-limit-reset'; if that moment is already in the past the sleep
    is zero seconds rather than a negative value.

    Parameters
    ----------
    response : object with a ``headers`` mapping
        Must contain 'x-rate-limit-remaining', and 'x-rate-limit-reset'
        whenever a wait is actually performed.
    wait : bool, default False
        Force the wait regardless of the remaining quota.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a required header is missing.
    ValueError
        If a header value is not an integer string.
    """
    if int(response.headers['x-rate-limit-remaining']) >= 100 and not wait:
        return
    reset_unix_timestamp = int(response.headers['x-rate-limit-reset'])
    reset_timestamp = datetime.fromtimestamp(reset_unix_timestamp)
    # The difference is a timedelta, which can be neither formatted with
    # ':.2f' nor passed to sleep(); convert to float seconds and never go
    # below zero (sleep rejects negative values).
    remaining = max(0.0, (reset_timestamp - datetime.now()).total_seconds())
    print(f"Waiting for {remaining:.2f} seconds...")
    sleep(remaining)
    return


def get_tweets_timeline(user_names, start_time, end_time, bearer_token, max_results=5):
    """Collect the tweets (retweets and replies excluded) of the given accounts.

    Resolves `user_names` to user records with ``get_users``, then pages
    through each user's ``/2/users/{id}/tweets`` timeline, following the
    ``next_token`` of every page until none is returned.

    Collection is best-effort: a failed request, an unexpected error, or a
    keyboard interrupt ends the run and whatever was gathered so far is
    returned. An HTTP 429 is not treated as a failure; the function waits
    for the rate-limit window to reset and retries the same page.

    Parameters
    ----------
    user_names : list of str
        Account handles to look up.
    start_time, end_time : str
        Sent unchanged as the 'start_time' / 'end_time' query parameters
        (the notebook passes values such as '2021-01-01T00:00:00Z').
    bearer_token : str
        Token used to build the Authorization header.
    max_results : int, default 5
        Sent as the 'max_results' query parameter of every page request.

    Returns
    -------
    tuple
        ``(tweets, users)`` where `tweets` is a list of tweet dicts across
        all users and `users` is the result of the user lookup, or
        ``(None, None)`` if the user lookup itself fails.
    """
    headers = make_headers(bearer_token)
    try:
        users = get_users(user_names, headers)
    except Exception as e:
        print(e)
        return None, None

    tweets = []
    tweet_fields = ['lang', 'author_id', 'created_at', 'geo', 'public_metrics', 'source', 'context_annotations']
    request_params = {
        'tweet.fields': ','.join(tweet_fields),
        'max_results': str(max_results),
        'start_time': start_time,
        'end_time': end_time,
        'exclude': 'retweets,replies'
    }

    try:
        for user in users:
            url = f"https://api.twitter.com/2/users/{user['id']}/tweets"
            # Fresh copy per user: a pagination token belongs to one
            # timeline and must never leak into another user's first request.
            params = dict(request_params)
            while True:
                try:
                    response = request(url, headers, params)
                except HTTPError as e:
                    r, = e.args
                    if r.status_code == 429:
                        # Rate limited: wait for the reset, then retry this
                        # same page instead of reading a missing/stale response.
                        wait_if_exceeded(r, wait=True)
                        continue
                    # request() has already printed the status and body.
                    return tweets, users
                except Exception as e:
                    print("Something went wrong when requesting.")
                    print(e)
                    return tweets, users

                results = response.json()
                # A page may carry no 'data' at all (no matching tweets).
                data = results.get('data')
                if data:
                    tweets.extend(data)

                next_page_token = results.get('meta', {}).get('next_token')
                if not next_page_token:
                    break
                params['pagination_token'] = next_page_token
                wait_if_exceeded(response)
    except KeyboardInterrupt:
        # Lets a long rate-limit wait be interrupted without losing data.
        print("Interrupted; returning the tweets collected so far.")
    except Exception as e:
        # Keep the partial-results behaviour, but report the error rather
        # than discarding it silently.
        print("Something went wrong while collecting tweets.")
        print(repr(e))
    return tweets, users

In [3]:
# Read the token from the TWITTER_BEARER_TOKEN environment variable so the
# credential never has to be pasted into (and saved with) the notebook.
# Without the variable the original placeholder is used, which the API rejects.
bearer_token = os.environ.get(
    "TWITTER_BEARER_TOKEN",
    "apply for a Twitter dev account if you want to run this notebook",
)

In [4]:
# Authorization header reused by the ad-hoc requests in the following cells.
tok = make_headers(bearer_token)

# Tweets to fetch in one lookup call, and the extra fields to return for each.
tweet_ids = [
    '1263150595717730305',
    '1262485275348885504',
    '440322224407314432',
    '1212092628029698048',
]
tweet_fields = [
    'lang',
    'author_id',
    'created_at',
    'geo',
    'public_metrics',
    'source',
    'context_annotations',
]

# Both lists go over the wire as comma-separated strings.
fields = {'tweet.fields': ','.join(tweet_fields), 'ids': ','.join(tweet_ids)}

In [5]:
# Tweet lookup for the IDs in `fields`. Going through request() applies a
# timeout and raises HTTPError on a non-200 reply, instead of hanging
# indefinitely or failing later with a KeyError on 'data'.
test_req = request("https://api.twitter.com/2/tweets", tok, fields)
print(test_req)
test_res = test_req.json()

# 'data' is looked up with a default so a reply without it shows nothing
# rather than raising.
for tweet in test_res.get('data', []):
    display(tweet)
    print()

<Response [200]>


{'text': 'Do you 👀our new Tweet settings?\n\nWe want to know how and why you’d use a feature like this in the API. Get the details and let us know what you think👇\nhttps://t.co/RtMhhfAcIB https://t.co/8wxeZ9fJER',
 'id': '1263150595717730305',
 'source': 'Twitter Web App',
 'author_id': '2244994945',
 'public_metrics': {'retweet_count': 10,
  'reply_count': 13,
  'like_count': 51,
  'quote_count': 7},
 'lang': 'en',
 'created_at': '2020-05-20T16:52:24.000Z'}




{'text': 'Starting today, we’re rolling out a new look for embedded Tweets.\n\nCheck it out and share your feedback in the forum post ⬇️\nhttps://t.co/gkMD0w7mFs',
 'id': '1262485275348885504',
 'source': 'Twitter Web App',
 'author_id': '2244994945',
 'public_metrics': {'retweet_count': 1442,
  'reply_count': 7,
  'like_count': 171643,
  'quote_count': 12},
 'lang': 'en',
 'created_at': '2020-05-18T20:48:40.000Z'}




{'text': "If only Bradley's arm was longer. Best photo ever. #oscars http://t.co/C9U5NOtGap",
 'id': '440322224407314432',
 'source': 'Twitter for Android',
 'author_id': '15846407',
 'public_metrics': {'retweet_count': 2984749,
  'reply_count': 205575,
  'like_count': 2162224,
  'quote_count': 16965},
 'lang': 'en',
 'created_at': '2014-03-03T03:06:13.000Z'}




{'text': 'We believe the best future version of our API will come from building it with YOU. Here’s to another great year with everyone who builds on the Twitter platform. We can’t wait to continue working with you in the new year. https://t.co/yvxdK6aOo2',
 'id': '1212092628029698048',
 'context_annotations': [{'domain': {'id': '119',
    'name': 'Holiday',
    'description': 'Holidays like Christmas or Halloween'},
   'entity': {'id': '1186637514896920576', 'name': ' New Years Eve'}},
  {'domain': {'id': '119',
    'name': 'Holiday',
    'description': 'Holidays like Christmas or Halloween'},
   'entity': {'id': '1206982436287963136',
    'name': 'Happy New Year: It’s finally 2020 everywhere!',
    'description': 'Catch fireworks and other celebrations as people across the globe enter the new year.\nPhoto via @GettyImages '}},
  {'domain': {'id': '46',
    'name': 'Brand Category',
    'description': 'Categories within Brand Verticals that narrow down the scope of Brands'},
   'entit




In [6]:
# Accounts to look up by handle, and the extra profile fields to request.
user_names = ['nokia', 'pepsi', 'discord', 'Microsoft']
user_fields = ['id', 'created_at', 'location', 'public_metrics']
# Query parameters for the users-by-username lookup. This rebinds `fields`,
# which held the tweet-lookup parameters until now.
fields = {
    'usernames': ','.join(user_names),
    'user.fields': ','.join(user_fields)
}

In [7]:
# User lookup via request(): applies a timeout and raises HTTPError on a
# non-200 reply, so a bad token fails here rather than in a later cell.
user_req = request("https://api.twitter.com/2/users/by", tok, fields)
print(user_req)
user_res = user_req.json()

<Response [200]>


In [8]:
# Show each returned user record, with a blank line between them.
for user in user_res['data']:
    display(user)
    print()

{'name': 'Nokia',
 'location': 'Espoo, Finland',
 'public_metrics': {'followers_count': 2131192,
  'following_count': 151291,
  'tweet_count': 45806,
  'listed_count': 7936},
 'created_at': '2009-03-16T17:15:33.000Z',
 'id': '24727891',
 'username': 'nokia'}




{'name': 'Pepsi',
 'public_metrics': {'followers_count': 2999797,
  'following_count': 38366,
  'tweet_count': 69768,
  'listed_count': 6008},
 'created_at': '2008-12-15T16:26:23.000Z',
 'id': '18139619',
 'username': 'pepsi'}




{'name': 'Discord',
 'location': 'San Francisco, CA',
 'public_metrics': {'followers_count': 3335277,
  'following_count': 1091,
  'tweet_count': 462397,
  'listed_count': 3233},
 'created_at': '2015-03-07T01:00:18.000Z',
 'id': '3065618342',
 'username': 'discord'}




{'name': 'Microsoft',
 'location': 'Redmond, WA',
 'public_metrics': {'followers_count': 9684171,
  'following_count': 2534,
  'tweet_count': 24175,
  'listed_count': 25285},
 'created_at': '2009-09-14T22:35:42.000Z',
 'id': '74286565',
 'username': 'Microsoft'}




In [9]:
# Parameters for a single user-timeline request: tweets between
# 2021-01-01 and 2021-01-10, with retweets and replies excluded.
# 'expansions' and 'user.fields' additionally ask for the author's user
# object, which comes back under 'includes' in the reply.
tweet_fields = ['lang', 'author_id', 'created_at', 'geo', 'public_metrics', 'source', 'context_annotations']
timeline_fields = {
    'tweet.fields':','.join(tweet_fields),
    'max_results': '5',
    'start_time': '2021-01-01T00:00:00Z',
    'end_time': '2021-01-10T00:00:00Z',
    'expansions': 'author_id',
    'user.fields': 'name,public_metrics',
    'exclude': 'retweets,replies'
}

In [10]:
# Timeline of user 3065618342 via request(): applies a timeout and raises
# HTTPError on a non-200 reply instead of continuing with an error body.
timeline_req = request(
    'https://api.twitter.com/2/users/3065618342/tweets',
    make_headers(bearer_token),
    timeline_fields,
)
print(timeline_req)
timeline_res = timeline_req.json()

<Response [200]>


In [11]:
# Inspect the parsed JSON body of the timeline response.
timeline_res

{'data': [{'public_metrics': {'retweet_count': 2340,
    'reply_count': 1193,
    'like_count': 58422,
    'quote_count': 187},
   'source': 'Agorapulse app',
   'author_id': '3065618342',
   'text': 'the gangs all here https://t.co/HA7Qt3vHm4',
   'id': '1347622377056776197',
   'lang': 'en',
   'created_at': '2021-01-08T19:13:06.000Z',
   'context_annotations': [{'domain': {'id': '47',
      'name': 'Brand',
      'description': 'Brands and Companies'},
     'entity': {'id': '1168499224297656321', 'name': 'Discord'}}]},
  {'public_metrics': {'retweet_count': 353,
    'reply_count': 289,
    'like_count': 5669,
    'quote_count': 31},
   'source': 'Agorapulse app',
   'author_id': '3065618342',
   'text': 'tip of the day: click on text channel and press escape to mark individual text channel as read \n\nbonus tip: shift + esc to mark ALL channels as read https://t.co/FMhs4r9Y19',
   'id': '1346894357320921092',
   'lang': 'en',
   'created_at': '2021-01-06T19:00:13.000Z',
   'context_

In [12]:
# 'unicode' is not a codec name Python knows; the response's content-type
# header declares charset=utf-8, so set that explicitly.
timeline_req.encoding = 'utf-8'

In [13]:
# Requests left in the current rate-limit window, as reported by the API (a string).
timeline_req.headers['x-rate-limit-remaining']

'1498'

In [14]:
# All response headers; the x-rate-limit-* entries are the ones wait_if_exceeded reads.
timeline_req.headers

{'date': 'Sat, 18 Sep 2021 18:42:32 UTC', 'server': 'tsa_o', 'set-cookie': 'personalization_id="v1_2/h4mk++t6ox9UhSkjC7cw=="; Max-Age=63072000; Expires=Mon, 18 Sep 2023 18:42:32 GMT; Path=/; Domain=.twitter.com; Secure; SameSite=None, guest_id=v1%3A163199055199735969; Max-Age=63072000; Expires=Mon, 18 Sep 2023 18:42:32 GMT; Path=/; Domain=.twitter.com; Secure; SameSite=None', 'api-version': '2.24', 'content-type': 'application/json; charset=utf-8', 'cache-control': 'no-cache, no-store, max-age=0', 'content-length': '775', 'x-access-level': 'read', 'x-frame-options': 'SAMEORIGIN', 'content-encoding': 'gzip', 'x-xss-protection': '0', 'x-rate-limit-limit': '1500', 'x-rate-limit-reset': '1631991376', 'content-disposition': 'attachment; filename=json.json', 'x-content-type-options': 'nosniff', 'x-rate-limit-remaining': '1498', 'strict-transport-security': 'max-age=631138519', 'x-connection-hash': '9aa998527f40eaba6e317183f93d613a0b538db5a67a288a8136c7dd35002a27'}

In [15]:
# Time left until the rate-limit window resets. The header is a Unix
# timestamp; fromtimestamp() without a tz yields naive local time, which
# matches the naive local datetime.now() it is subtracted from.
remaining = datetime.fromtimestamp(int(timeline_req.headers['x-rate-limit-reset'])) - datetime.now()

In [16]:
# The same interval as a float number of seconds.
remaining.total_seconds()

824.376159

In [17]:
# Reset moment as a naive datetime expressed in UTC.
# NOTE(review): datetime.utcfromtimestamp is deprecated as of Python 3.12;
# datetime.fromtimestamp(ts, tz=timezone.utc) is the timezone-aware replacement.
datetime.utcfromtimestamp(int(timeline_req.headers['x-rate-limit-reset']))

datetime.datetime(2021, 9, 18, 18, 56, 16)

In [18]:
# NOTE(review): exact repeat of the previous cell (same expression, same output).
datetime.utcfromtimestamp(int(timeline_req.headers['x-rate-limit-reset']))

datetime.datetime(2021, 9, 18, 18, 56, 16)

In [19]:
# Show every tweet of the timeline reply, one per block.
for item in timeline_res['data']:
    display(item)
    print()

# 'includes' holds the expanded author object requested via 'expansions';
# 'meta' holds the oldest/newest tweet IDs and the result count.
print(timeline_res['includes'])
print(timeline_res['meta'])

{'public_metrics': {'retweet_count': 2340,
  'reply_count': 1193,
  'like_count': 58422,
  'quote_count': 187},
 'source': 'Agorapulse app',
 'author_id': '3065618342',
 'text': 'the gangs all here https://t.co/HA7Qt3vHm4',
 'id': '1347622377056776197',
 'lang': 'en',
 'created_at': '2021-01-08T19:13:06.000Z',
 'context_annotations': [{'domain': {'id': '47',
    'name': 'Brand',
    'description': 'Brands and Companies'},
   'entity': {'id': '1168499224297656321', 'name': 'Discord'}}]}




{'public_metrics': {'retweet_count': 353,
  'reply_count': 289,
  'like_count': 5669,
  'quote_count': 31},
 'source': 'Agorapulse app',
 'author_id': '3065618342',
 'text': 'tip of the day: click on text channel and press escape to mark individual text channel as read \n\nbonus tip: shift + esc to mark ALL channels as read https://t.co/FMhs4r9Y19',
 'id': '1346894357320921092',
 'lang': 'en',
 'created_at': '2021-01-06T19:00:13.000Z',
 'context_annotations': [{'domain': {'id': '47',
    'name': 'Brand',
    'description': 'Brands and Companies'},
   'entity': {'id': '1168499224297656321', 'name': 'Discord'}}]}




{'public_metrics': {'retweet_count': 8349,
  'reply_count': 4608,
  'like_count': 133332,
  'quote_count': 2169},
 'source': 'Agorapulse app',
 'author_id': '3065618342',
 'text': 'maybe this is the year some of you finally learn how to use push to talk',
 'id': '1346176089169809410',
 'lang': 'en',
 'created_at': '2021-01-04T19:26:05.000Z',
 'context_annotations': [{'domain': {'id': '47',
    'name': 'Brand',
    'description': 'Brands and Companies'},
   'entity': {'id': '1168499224297656321', 'name': 'Discord'}}]}




{'public_metrics': {'retweet_count': 614,
  'reply_count': 612,
  'like_count': 24715,
  'quote_count': 52},
 'source': 'Twitter Web App',
 'author_id': '3065618342',
 'text': "some1 from our art team made this and i'm just posting it https://t.co/ubkQ6SX8TO",
 'id': '1345446715168444417',
 'lang': 'en',
 'created_at': '2021-01-02T19:07:48.000Z',
 'context_annotations': [{'domain': {'id': '47',
    'name': 'Brand',
    'description': 'Brands and Companies'},
   'entity': {'id': '1168499224297656321', 'name': 'Discord'}}]}


{'users': [{'public_metrics': {'followers_count': 3335277, 'following_count': 1091, 'tweet_count': 462397, 'listed_count': 3233}, 'id': '3065618342', 'name': 'Discord', 'username': 'discord'}]}
{'oldest_id': '1345446715168444417', 'newest_id': '1347622377056776197', 'result_count': 4}


In [20]:
# Raw response body as bytes, before any JSON decoding.
timeline_req.content

b'{"data":[{"public_metrics":{"retweet_count":2340,"reply_count":1193,"like_count":58422,"quote_count":187},"source":"Agorapulse app","author_id":"3065618342","text":"the gangs all here https://t.co/HA7Qt3vHm4","id":"1347622377056776197","lang":"en","created_at":"2021-01-08T19:13:06.000Z","context_annotations":[{"domain":{"id":"47","name":"Brand","description":"Brands and Companies"},"entity":{"id":"1168499224297656321","name":"Discord"}}]},{"public_metrics":{"retweet_count":353,"reply_count":289,"like_count":5669,"quote_count":31},"source":"Agorapulse app","author_id":"3065618342","text":"tip of the day: click on text channel and press escape to mark individual text channel as read \\n\\nbonus tip: shift + esc to mark ALL channels as read https://t.co/FMhs4r9Y19","id":"1346894357320921092","lang":"en","created_at":"2021-01-06T19:00:13.000Z","context_annotations":[{"domain":{"id":"47","name":"Brand","description":"Brands and Companies"},"entity":{"id":"1168499224297656321","name":"Disc

In [21]:
# Fetch Discord's tweets for the first nine days of 2021 through the
# paginating helper. Both results are None if the user lookup fails.
user_names = ['discord']
tweets, users = get_tweets_timeline(user_names, '2021-01-01T00:00:00Z', '2021-01-10T00:00:00Z', bearer_token, max_results=100)

In [22]:
# Keep an untouched deep copy: flatten() modifies the tweet dicts in place,
# including removing their nested 'public_metrics'.
tweets_orig = copy.deepcopy(tweets)
tweets_orig

[{'created_at': '2021-01-08T19:13:06.000Z',
  'source': 'Agorapulse app',
  'public_metrics': {'retweet_count': 2340,
   'reply_count': 1193,
   'like_count': 58422,
   'quote_count': 187},
  'text': 'the gangs all here https://t.co/HA7Qt3vHm4',
  'context_annotations': [{'domain': {'id': '47',
     'name': 'Brand',
     'description': 'Brands and Companies'},
    'entity': {'id': '1168499224297656321', 'name': 'Discord'}}],
  'lang': 'en',
  'author_id': '3065618342',
  'id': '1347622377056776197'},
 {'created_at': '2021-01-06T19:00:13.000Z',
  'source': 'Agorapulse app',
  'public_metrics': {'retweet_count': 353,
   'reply_count': 289,
   'like_count': 5669,
   'quote_count': 31},
  'text': 'tip of the day: click on text channel and press escape to mark individual text channel as read \n\nbonus tip: shift + esc to mark ALL channels as read https://t.co/FMhs4r9Y19',
  'context_annotations': [{'domain': {'id': '47',
     'name': 'Brand',
     'description': 'Brands and Companies'},
   

In [23]:
# Number of tweets collected.
print(len(tweets))

4


In [24]:
# Build the frame from the flattened tweets. flatten() rewrites the dicts in
# `tweets` in place, so after this cell `tweets` no longer holds the raw API
# objects (those are preserved in `tweets_orig`).
df = pd.DataFrame(flatten(tweets))
df

Unnamed: 0,created_at,source,text,context_annotations,lang,author_id,id,retweet_count,reply_count,like_count,quote_count
0,2021-01-08 19:13:06+00:00,Agorapulse app,the gangs all here https://t.co/HA7Qt3vHm4,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",en,3065618342,1347622377056776197,2340,1193,58422,187
1,2021-01-06 19:00:13+00:00,Agorapulse app,tip of the day: click on text channel and pres...,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",en,3065618342,1346894357320921092,353,289,5669,31
2,2021-01-04 19:26:05+00:00,Agorapulse app,maybe this is the year some of you finally lea...,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",en,3065618342,1346176089169809410,8349,4608,133332,2169
3,2021-01-02 19:07:48+00:00,Twitter Web App,some1 from our art team made this and i'm just...,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",en,3065618342,1345446715168444417,614,612,24715,52


In [25]:
# Column dtypes and non-null counts of the flattened frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype                  
---  ------               --------------  -----                  
 0   created_at           4 non-null      datetime64[ns, tzutc()]
 1   source               4 non-null      object                 
 2   text                 4 non-null      object                 
 3   context_annotations  4 non-null      object                 
 4   lang                 4 non-null      object                 
 5   author_id            4 non-null      object                 
 6   id                   4 non-null      object                 
 7   retweet_count        4 non-null      int64                  
 8   reply_count          4 non-null      int64                  
 9   like_count           4 non-null      int64                  
 10  quote_count          4 non-null      int64                  
dtypes: datetime64[ns, tzutc()](1), int64

In [26]:
# df.to_csv("discord.csv", index=False)

In [27]:
# df = pd.read_csv("discord.csv")
# df['context_annotations'] = df['context_annotations'].apply(lambda x: ast.literal_eval(x))