# Get old tweets using the academic research endpoint through Twitter's API

Collect a large batch of tweets through Twitter's API: all \#china tweets posted during 2020 and 2021, collected quarter by quarter (excluding retweets, quotes, and replies).

_Author: Jinghua Xu_

_reference: [https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a](https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a)_

_[Twitter developer portal](https://developer.twitter.com/en/portal/dashboard)_

In [10]:
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses we receive from the API
import json
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata
#To add wait time between requests
import time

In [11]:
# A bearer token is a secret and must never be stored in the notebook itself.
# NOTE: a token was previously committed on this line; treat it as leaked and
# regenerate it in the Twitter developer portal.
# The token is taken from the TOKEN environment variable; when that variable is
# unset or empty, it is requested interactively without being echoed.
if not os.environ.get('TOKEN'):
    import getpass
    os.environ['TOKEN'] = getpass.getpass('Twitter API bearer token: ')

In [12]:
def auth():
    """Return the bearer token stored in the ``TOKEN`` environment variable.

    The result is ``None`` when the variable is not set.
    """
    token = os.environ.get('TOKEN')
    return token

In [13]:
def create_headers(bearer_token):
    """Build the request headers carrying the bearer token.

    Args:
        bearer_token: Token placed after the ``Bearer`` scheme.

    Returns:
        A dict with a single ``Authorization`` entry.
    """
    authorization = f"Bearer {bearer_token}"
    return {"Authorization": authorization}

In [14]:

def create_url(keyword, start_date, end_date, max_results=500):
    """Build the endpoint URL and query parameters for a tweet search request.

    Args:
        keyword: Search query string, sent as the ``query`` parameter.
        start_date: Lower time bound, sent as ``start_time``.
        end_date: Upper time bound, sent as ``end_time``.
        max_results: Page size, sent as ``max_results``. Defaults to 500,
            the value that used to be hard-coded here.

    Returns:
        A ``(search_url, query_params)`` tuple. ``query_params['next_token']``
        is a ``None`` placeholder to be filled in with the pagination token
        of the page being requested.
    """
    search_url = "https://api.twitter.com/2/tweets/search/all" #Change to the endpoint you want to collect data from

    #change params based on the endpoint you are using
    query_params = {'query': keyword,
                    'start_time': start_date,
                    'end_time': end_date,
                    'max_results': max_results,
                    'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
                    'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                    'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                    'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                    # None (rather than an empty dict) marks "no token yet";
                    # requests leaves None-valued parameters out of the URL.
                    'next_token': None}
    return (search_url, query_params)

In [15]:
def connect_to_endpoint(url, headers, params, next_token = None, timeout = 30):
    """Send one GET request to the endpoint and return the decoded JSON body.

    Args:
        url: Endpoint URL (first element returned by ``create_url``).
        headers: HTTP headers, including the authorization header.
        params: Query parameters (second element returned by ``create_url``).
            The dict is not modified.
        next_token: Pagination token of the page to fetch, or ``None`` for
            the first page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The response body parsed from JSON.

    Raises:
        Exception: When the status code is not 200; carries the status code
            and the response text.
    """
    # Work on a copy so the caller's dict is not changed as a side effect.
    query = dict(params)
    query['next_token'] = next_token
    response = requests.request("GET", url, headers = headers, params = query, timeout = timeout)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [16]:
# Search inputs: the query string and the time window that are handed to
# create_url as the query, start_time and end_time parameters.
keyword = "#china lang:en -is:retweet -is:reply -is:quote"
start_time, end_time = "2019-01-01T00:00:00.000Z", "2020-01-01T00:00:00.000Z"

# Authentication inputs: the bearer token and the headers built from it.
bearer_token = auth()
headers = create_headers(bearer_token)

In [17]:
def append_to_csv(json_response, fileName):
    """Append one CSV row per tweet of an API response page to ``fileName``.

    Args:
        json_response: Decoded response dict. Tweets are read from its
            ``'data'`` list; a response without that key (or with an empty
            value) is treated as containing no tweets.
        fileName: Path of the CSV file. It is opened in append mode, so it
            is created when it does not exist yet.

    Each row holds, in order: author id, parsed creation time, tweet id,
    language, like count, quote count, reply count, retweet count, source
    and tweet text. ``'unk'`` is written as the source when a tweet has no
    ``'source'`` key.

    Prints the number of rows written; returns nothing.
    """
    counter = 0

    # The context manager closes the file even if a malformed tweet raises
    # part-way through the page.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        for tweet in json_response.get('data') or []:
            metrics = tweet['public_metrics']

            # Column order must match the header row of the output file
            csvWriter.writerow([
                tweet['author_id'],
                dateutil.parser.parse(tweet['created_at']),
                tweet['id'],
                tweet['lang'],
                metrics['like_count'],
                metrics['quote_count'],
                metrics['reply_count'],
                metrics['retweet_count'],
                tweet.get('source', 'unk'),   # not every tweet carries a source
                tweet['text'],
            ])
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)

In [18]:
# Collect every page of results for the configured query and time window and
# append the tweets to a CSV file.

# Output file name
fn = "2019.csv"

# Columns saved for each tweet, in the order append_to_csv writes them
CSV_HEADER = ['author id', 'created_at', 'id', 'lang', 'like_count',
              'quote_count', 'reply_count', 'retweet_count', 'source', 'tweet']

# Seconds to wait between two page requests
PAUSE_SECONDS = 5

# Write the header row only when the file is new or empty, so re-running this
# cell does not put a second header line in the middle of the data.
# Note: the file is opened in append mode and the loop below always starts at
# the first page, so delete the file before re-running to avoid duplicate rows.
if not os.path.exists(fn) or os.path.getsize(fn) == 0:
    with open(fn, "a", newline="", encoding='utf-8') as csvFile:
        csv.writer(csvFile).writerow(CSV_HEADER)

# Total number of tweets collected by the loop
total_tweets = 0
# Pagination token of the page to request; None requests the first page
next_token = None

# The URL and the base parameters are the same for every page
search_url, query_params = create_url(keyword, start_time, end_time)

while True:
    print("-------------------")
    print("Token: ", next_token)
    json_response = connect_to_endpoint(search_url, headers, query_params, next_token)
    meta = json_response['meta']
    result_count = meta.get('result_count') or 0

    # The token is absent on the last page
    next_token = meta.get('next_token')
    if next_token is not None:
        print("Next Token: ", next_token)

    if result_count > 0:
        print("Start Date: ", start_time)
        append_to_csv(json_response, fn)
        total_tweets += result_count
        print("Total # of Tweets added: ", total_tweets)
        print("-------------------")

    # No next token: this was the final page
    if next_token is None:
        break

    # Single pause per page to space out the requests
    time.sleep(PAUSE_SECONDS)

print("Total number of results: ", total_tweets)

-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fo6yevg082xkgb2tunix9exhh23ukd
Start Date:  2019-01-01T00:00:00.000Z
# of Tweets added from this response:  477
Total # of Tweets added:  477
-------------------
-------------------
Token:  b26v89c19zqg8o3fo6yevg082xkgb2tunix9exhh23ukd
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fo6yegtlf79eokg0lfdaxdnylxh08t
Start Date:  2019-01-01T00:00:00.000Z
# of Tweets added from this response:  486
Total # of Tweets added:  963
-------------------
-------------------
Token:  b26v89c19zqg8o3fo6yegtlf79eokg0lfdaxdnylxh08t
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fo6yegrhadoogry2oepuvnzo87ohvh
Start Date:  2019-01-01T00:00:00.000Z
# of Tweets added from this response:  477
Total # of Tweets added:  1440
-------------------
-------------------
Token:  b26v89c19zqg8o3fo6yegrhadoogry2oepuvnzo87ohvh
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fo6yegpck6uiqez4cqizta43bpeef1
Sta

Exception: ignored

## Data Overview

In [19]:
# Load the CSV file named by fn into a DataFrame and display it
df = pd.read_csv(filepath_or_buffer=fn)
df

Unnamed: 0,author id,created_at,id,lang,like_count,quote_count,reply_count,retweet_count,source,tweet
0,3246529033,2019-12-31 23:59:58+00:00,1212161506277646336,en,0,0,0,0,unk,With #islamist terrorists one of our biggest t...
1,1116168151048146944,2019-12-31 23:59:36+00:00,1212161412048605191,en,0,0,0,0,unk,A member of the Early Rain Church in #China fa...
2,973925072841867264,2019-12-31 23:59:29+00:00,1212161385729269760,en,3,0,0,0,unk,h a p p i n e s s 🕊\n.\n.\n.\n#China @china ht...
3,29097819,2019-12-31 23:55:02+00:00,1212160265535528961,en,52,12,8,72,unk,A shadowy new alliance led by pro-#China #Comm...
4,968210551582199808,2019-12-31 23:53:35+00:00,1212159897619423232,en,0,0,0,0,unk,".@PayPal completes #GoPay acquisition, allowin..."
...,...,...,...,...,...,...,...,...,...,...
24623,58836985,2019-12-18 13:09:47+00:00,1207286839708663808,en,0,0,0,0,unk,The latest The Plass Appliance Home Daily! htt...
24624,1041452573323157504,2019-12-18 13:09:38+00:00,1207286800177410050,en,0,0,0,0,unk,Chinese Crypto Fitness App Reportedly Under In...
24625,917672895987552259,2019-12-18 13:07:19+00:00,1207286216699338752,en,0,0,1,1,unk,🇨🇳 #ChinaTrends #4: our fourth quarterly publi...
24626,960129987998244865,2019-12-18 13:07:01+00:00,1207286142288068609,en,0,0,0,0,unk,World markets were lifted in the first two day...


In [None]:
# Number of rows in the loaded DataFrame
df.shape[0]

In [None]:
# from google.colab import files
# files.download(fn)

In [None]:
# # connect to drive
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# # save to g drive
# !cp -r '/content/2022_Q4.csv' '/content/drive/MyDrive/#CHINA HATE'

In [None]:
# # save to g drive
# ! cp -r '/content/2019.csv' '/content/drive/MyDrive/CHINA_HATE'