In [1]:
# requests to the API
import requests
import os
import json
import pandas as pd
import csv
import datetime
import dateutil.parser
import unicodedata
import time

In [2]:
#setting the BEARER TOKEN as an input variable
# The token is prompted for rather than written into the cell, so the secret is
# never saved with the notebook. A TOKEN that is already present in the
# environment (e.g. exported before launching Jupyter) is left untouched.
if not os.environ.get('TOKEN'):
    from getpass import getpass
    os.environ['TOKEN'] = getpass('Twitter API bearer token: ')

In [3]:
#Function to retrieve the token from the environment
def auth():
    """Return the bearer token stored in the ``TOKEN`` environment variable.

    Returns:
        The value of ``TOKEN``, or ``None`` when the variable is not set.
    """
    return os.environ.get('TOKEN')

In [4]:
#Function to build the HTTP headers that authorize a request
def create_headers(bearer_token):
    """Wrap a bearer token in the ``Authorization`` header dictionary.

    Args:
        bearer_token: Token to send; it is formatted into the header value
            as-is.

    Returns:
        A dict with the single key ``"Authorization"``.
    """
    return {"Authorization": "Bearer {}".format(bearer_token)}

In [5]:
def create_url(keyword, start_date, end_date, max_results):
    """Assemble the search endpoint and the query parameters for one request.

    Args:
        keyword: Search query string, sent as ``query``.
        start_date: Timestamp string sent as ``start_time``.
        end_date: Timestamp string sent as ``end_time``.
        max_results: Page size sent as ``max_results``.

    Returns:
        A ``(search_url, query_params)`` tuple. ``query_params['next_token']``
        is an empty-dict placeholder; ``connect_to_endpoint`` sets the real
        value for each call.
    """
    # Full-archive search endpoint
    endpoint = "https://api.twitter.com/2/tweets/search/all"

    # Values supplied by the caller
    caller_params = {
        'query': keyword,
        'start_time': start_date,
        'end_time': end_date,
        'max_results': max_results,
    }

    # Fixed selection of expansions and fields requested with every search
    fixed_params = {
        'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
        'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities',
        'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
        'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
        'next_token': {},
    }

    # Merge in the same key order as before: caller values first, then fixed ones
    return (endpoint, {**caller_params, **fixed_params})

In [6]:
def connect_to_endpoint(url, headers, params, next_token = None, timeout = 30):
    """Send one GET request to the search endpoint and return the parsed JSON.

    Args:
        url: Endpoint URL (first element of the tuple from ``create_url``).
        headers: Request headers, e.g. the dict from ``create_headers``.
        params: Query parameters (second element of the tuple from
            ``create_url``). The dict is copied, not modified.
        next_token: Pagination token from the previous response's ``meta``,
            or ``None`` for the first page.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang a long collection run indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: With ``(status_code, response_text)`` as arguments when the
            response status is not 200.
    """
    # Work on a copy so the caller's params dict keeps its original contents
    query = dict(params)
    query['next_token'] = next_token
    response = requests.get(url, headers = headers, params = query, timeout = timeout)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [7]:
#Inputs for the request
bearer_token = auth()
headers = create_headers(bearer_token)
# Every term inside the parentheses is an alternative joined by OR. A bare
# space between two terms means AND in the v2 query syntax, so "murder" and
# "police" need an explicit OR between them to match on their own.
keyword = "(abduct OR abuse OR arrested OR arson OR assault OR burglary OR charged OR convicted OR crime OR cybercrime OR fraud OR homicide OR investigation OR jailed OR kidnap OR knifecrime OR manslaughter OR molestation OR murder OR police OR rape OR robbery OR seized OR sentenced OR shooting OR smuggle OR smuggling OR stabbing OR theft OR trafficking OR vandalism) place_country:GB has:geo -is:nullcast -is:retweet lang:en"
start_time = "2021-01-01T00:00:00.000Z"
# NOTE: end_time is exclusive, so this sample request returns nothing posted on
# 31 December itself.
end_time = "2021-12-31T00:00:00.000Z"
max_results = 500

In [8]:
# Build the (endpoint, params) pair for the sample window, then fetch its first page
url = create_url(keyword, start_time, end_time, max_results)
json_response = connect_to_endpoint(url = url[0], headers = headers, params = url[1])

Endpoint Response Code: 200


In [9]:
# Show only the first tweet and the paging metadata. The full response can hold
# up to max_results tweets of user-written text, which floods the cell output
# and is saved into the notebook file along with the code.
preview = {
    "data": json_response.get("data", [])[:1],
    "meta": json_response.get("meta", {}),
}
print(json.dumps(preview, indent=4))

{
    "data": [
        {
            "id": "1476703915957698564",
            "author_id": "325597286",
            "created_at": "2021-12-30T23:57:04.000Z",
            "conversation_id": "1476703915957698564",
            "entities": {
                "urls": [
                    {
                        "start": 88,
                        "end": 111,
                        "url": "https://t.co/824Pkc2a17",
                        "expanded_url": "https://twitter.com/Man_of_Rohan_/status/1476703915957698564/photo/1",
                        "display_url": "pic.twitter.com/824Pkc2a17",
                        "media_key": "3_1476703913906692101"
                    }
                ]
            },
            "text": "Tell me you're guilty of sexual abuse without telling me you're guilty of sexual abuse. https://t.co/824Pkc2a17",
            "source": "Twitter for Android",
            "lang": "en",
            "public_metrics": {
                "retweet_count": 2,
           

In [10]:
# Create file
# The header row is written only when TWEETS.csv is missing or empty, so
# re-running this cell does not append a second header after collected rows.
csv_path = "TWEETS.csv"
if not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0:
    # The with-block closes the file even if writing the header fails
    with open(csv_path, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        #Create headers for the data you want to save, in this example, we only want save these columns in our dataset
        csvWriter.writerow(['author id', 'created_at', 'geo','id','language','source','tweet', 'location', 'bbox'])

In [11]:
def append_to_csv(json_response, fileName):
    """Append one CSV row per tweet found in a search response page.

    Each row holds, in order: author id, creation time, place id, tweet id,
    language, source, tweet text, place full name and place bounding box.
    A single space is written for any of the optional values that the response
    does not provide, matching the placeholder already used for a missing
    place id.

    Args:
        json_response: Decoded response body. Tweets are read from ``data``
            and place details from ``includes.places``; either may be absent.
        fileName: Path of the CSV file to append to (created if missing).

    Returns:
        None. The number of rows written is printed.
    """
    # Index the expanded place objects by id once, instead of rescanning the
    # whole list for every tweet. Later duplicates overwrite earlier ones.
    places_by_id = {
        place['id']: place
        for place in json_response.get('includes', {}).get('places', [])
    }

    #A counter variable
    counter = 0

    #Open OR create the target CSV file; the with-block closes it even on error
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        #Loop through each tweet
        for tweet in json_response.get('data', []):

            # Some of the keys might not exist for some tweets, so every
            # optional value gets an explicit placeholder.

            # 1. Author ID
            author_id = tweet['author_id']

            # 2. Time created
            created_at = dateutil.parser.parse(tweet['created_at'])

            # 3. Geolocation - place id (absent when the tweet has no 'geo'
            #    object or the object carries no place)
            place_id = tweet.get('geo', {}).get('place_id')
            geo = place_id if place_id is not None else " "

            # 4. Tweet ID
            tweet_id = tweet['id']

            # 5. Language
            lang = tweet.get('lang', " ")

            # 6. Source
            source = tweet.get('source', " ")

            # 7. Tweet text
            text = tweet['text']

            # 8. Location name and bounding box. Looked up fresh for every
            #    tweet so an unmatched tweet never inherits the previous
            #    tweet's location.
            place = places_by_id.get(place_id, {})
            location = place.get('full_name', " ")
            bbox = place.get('geo', {}).get('bbox', " ")

            # Assemble all data in a list
            res = [author_id, created_at, geo, tweet_id, lang, source, text, location, bbox]

            # Append the result to the CSV file
            csvWriter.writerow(res)
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)

In [12]:
#Inputs for tweets
bearer_token = auth()
headers = create_headers(bearer_token)
# Every term inside the parentheses is an alternative joined by OR. A bare
# space between two terms means AND in the v2 query syntax, so "murder" and
# "police" need an explicit OR between them to match on their own.
keyword = "(abduct OR abuse OR arrested OR arson OR assault OR burglary OR charged OR convicted OR crime OR cybercrime OR fraud OR homicide OR investigation OR jailed OR kidnap OR knifecrime OR manslaughter OR molestation OR murder OR police OR rape OR robbery OR seized OR sentenced OR shooting OR smuggle OR smuggling OR stabbing OR theft OR trafficking OR vandalism) place_country:GB has:geo -is:nullcast -is:retweet lang:en"

# One collection window per calendar month of 2021
start_list =    ['2021-01-01T00:00:00.000Z',
                 '2021-02-01T00:00:00.000Z',
                 '2021-03-01T00:00:00.000Z',
                 '2021-04-01T00:00:00.000Z',
                 '2021-05-01T00:00:00.000Z',
                 '2021-06-01T00:00:00.000Z',
                 '2021-07-01T00:00:00.000Z',
                 '2021-08-01T00:00:00.000Z',
                 '2021-09-01T00:00:00.000Z',
                 '2021-10-01T00:00:00.000Z',
                 '2021-11-01T00:00:00.000Z',
                 '2021-12-01T00:00:00.000Z']

# end_time is exclusive, so each window must end at the start of the following
# month. Ending at 00:00 on the last day of the month would silently drop that
# whole day from every window.
end_list = start_list[1:] + ['2022-01-01T00:00:00.000Z']

max_results = 500   # Tweets requested per page
max_count = 50000   # Max tweets per time period

#Total number of tweets we collected from the loop
total_tweets = 0

for period_start, period_end in zip(start_list, end_list):

    count = 0          # Counting tweets per time period
    next_token = None  # No token on the first page of a period

    # Keep requesting pages until the period is exhausted or max_count is reached
    while count < max_count:

        url = create_url(keyword, period_start, period_end, max_results)
        json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
        result_count = json_response['meta']['result_count']

        # Save the page only when it actually contains tweets
        if result_count is not None and result_count > 0:
            append_to_csv(json_response, "TWEETS.csv")
            count += result_count
            total_tweets += result_count

        # A missing next_token marks the final page of this time period
        next_token = json_response['meta'].get('next_token')

        # Pause between requests to stay clear of the endpoint's rate limit
        time.sleep(5)

        if next_token is None:
            break

print("Total number of tweets collected: ", total_tweets)


Endpoint Response Code: 200
# of Tweets added from this response:  480
Endpoint Response Code: 200
# of Tweets added from this response:  487
Endpoint Response Code: 200
# of Tweets added from this response:  489
Endpoint Response Code: 200
# of Tweets added from this response:  484
Endpoint Response Code: 200
# of Tweets added from this response:  484
Endpoint Response Code: 200
# of Tweets added from this response:  484
Endpoint Response Code: 200
# of Tweets added from this response:  488
Endpoint Response Code: 200
# of Tweets added from this response:  489
Endpoint Response Code: 200
# of Tweets added from this response:  481
Endpoint Response Code: 200
# of Tweets added from this response:  482
Endpoint Response Code: 200
# of Tweets added from this response:  488
Endpoint Response Code: 200
# of Tweets added from this response:  473
Endpoint Response Code: 200
# of Tweets added from this response:  480
Endpoint Response Code: 200
# of Tweets added from this response:  482
Endpoi

Endpoint Response Code: 200
# of Tweets added from this response:  486
Endpoint Response Code: 200
# of Tweets added from this response:  474
Endpoint Response Code: 200
# of Tweets added from this response:  482
Endpoint Response Code: 200
# of Tweets added from this response:  486
Endpoint Response Code: 200
# of Tweets added from this response:  488
Endpoint Response Code: 200
# of Tweets added from this response:  480
Endpoint Response Code: 200
# of Tweets added from this response:  490
Endpoint Response Code: 200
# of Tweets added from this response:  490
Endpoint Response Code: 200
# of Tweets added from this response:  489
Endpoint Response Code: 200
# of Tweets added from this response:  492
Endpoint Response Code: 200
# of Tweets added from this response:  489
Endpoint Response Code: 200
# of Tweets added from this response:  483
Endpoint Response Code: 200
# of Tweets added from this response:  483
Endpoint Response Code: 200
# of Tweets added from this response:  485
Endpoi

Endpoint Response Code: 200
# of Tweets added from this response:  487
Endpoint Response Code: 200
# of Tweets added from this response:  488
Endpoint Response Code: 200
# of Tweets added from this response:  481
Endpoint Response Code: 200
# of Tweets added from this response:  489
Endpoint Response Code: 200
# of Tweets added from this response:  482
Endpoint Response Code: 200
# of Tweets added from this response:  484
Endpoint Response Code: 200
# of Tweets added from this response:  479
Endpoint Response Code: 200
# of Tweets added from this response:  491
Endpoint Response Code: 200
# of Tweets added from this response:  486
Endpoint Response Code: 200
# of Tweets added from this response:  486
Endpoint Response Code: 200
# of Tweets added from this response:  478
Endpoint Response Code: 200
# of Tweets added from this response:  486
Endpoint Response Code: 200
# of Tweets added from this response:  481
Endpoint Response Code: 200
# of Tweets added from this response:  475
Endpoi

<h2>References</h2>

https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a

https://muhark.github.io/python/scraping/tutorial/2021/03/25/getting-started-with-academic-twitter.html

backup - https://towardsdatascience.com/searching-for-tweets-with-python-f659144b225f

bounding box - https://boundingbox.klokantech.com/