# Setup

In [None]:
import os
from pathlib import Path
import sys

# --- REPO ROOT ON sys.path (so `from src.*` works locally) ---
# The repository root is taken to be two levels above the working directory.
# When the working directory is too shallow for that (e.g. "/" or "/tmp"),
# fall back to the working directory itself instead of raising IndexError.
_CWD = Path(os.getcwd()).resolve()
_REPO_ROOT = str(_CWD.parents[1]) if len(_CWD.parents) > 1 else str(_CWD)
if _REPO_ROOT not in sys.path:
    sys.path.insert(0, _REPO_ROOT)


# --- ENVIRONMENT SWITCH ---
# Set to True if running on local machine with Google Drive Desktop mounted
# Set to False if running in Google Colab cloud
RUNNING_LOCALLY = True

if RUNNING_LOCALLY:
    # Standard macOS path for Google Drive Desktop. Machines that mount the
    # drive elsewhere can point AI_PUBLIC_TRUST_BASE_PATH at the project
    # folder instead of editing this cell.
    BASE_PATH = Path(
        os.environ.get('AI_PUBLIC_TRUST_BASE_PATH')
        or '/Volumes/GoogleDrive/MyDrive/AI Public Trust'
    )
else:
    # Google Colab cloud path
    from google.colab import drive
    drive.mount('/content/drive')
    BASE_PATH = Path('/content/drive/MyDrive/AI Public Trust')

# Pre-compute critical paths used across notebooks
# NOTE: a later cell of this notebook rebinds `twits_folder` to a plain string
# built from the working directory; the value set here only applies until then.
twits_folder = BASE_PATH / 'Raw Data/Twits/'
test_folder = BASE_PATH / 'Raw Data/'
datasets_folder = BASE_PATH / 'Data Sets'
cleanedds_folder = BASE_PATH / 'Data Sets/Cleaned Data'
networks_folder = BASE_PATH / 'Data Sets/Networks/'
literature_folder = BASE_PATH / 'Literature/'
topic_models_folder = BASE_PATH / 'Models/Topic Modeling/'


In [None]:
import tweepy
#from tweepy import Stream
from datetime import timedelta
import json
import time
import datetime
import os
from datetime import timedelta
import tqdm
import pickle

# Data lives in a "Data" folder that sits next to the notebook's working
# directory; both folder names are kept as strings with a trailing slash so
# file names can simply be appended to them.
path = os.getcwd()
parent = os.path.dirname(path)  # parent directory
print("Parent directory", parent)
source_folder = f"{parent}/Data/"
twits_folder = f"{source_folder}Twits/"

Parent directory /Users/ignacioojea/Documents/Research/AI Public Trust Twitter


In [None]:
# The bearer token is read from the environment so that credentials never end
# up in the notebook (or in version control).  Export it before starting
# Jupyter, e.g.  `export TWITTER_BEARER_TOKEN=...`.
# SECURITY: tokens that were previously hard-coded in this cell must be
# considered leaked and should be revoked/regenerated in the developer portal.
BearerToken = os.environ.get('TWITTER_BEARER_TOKEN', '')
if not BearerToken:
    print('WARNING: TWITTER_BEARER_TOKEN is not set; Twitter API requests will fail to authenticate.')

### Query notes

- Tweet field list: https://developer.twitter.com/en/docs/twitter-api/fields
- Query logic: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
- Time converter: https://www.timestamp-converter.com/
- AI TIMELINE: https://lifearchitect.ai/timeline/
- https://docs.tweepy.org/en/stable/examples.html
- https://docs.tweepy.org/en/stable/expansions_and_fields.html

### Helpful data:
- https://twittercommunity.com/t/client-search-all-tweets-returns-only-default-fields/179008/4
- https://twittercommunity.com/top?period=weekly

### Más notas al final

In [None]:
# Search terms; the query matches English tweets containing any one of them.
_search_terms = [
    'ChatGPT', 'Chat-GPT', 'GPT', 'GPT-3', 'GPT3', 'GPT-4', 'GPT4', 'BARD',
    '(Bing AI)', 'LLMs', 'LLM', 'AI', 'AGI', '(artificial intelligence)',
    '(large language models)', 'LaMDA', 'PaLM', 'Med-PaLM', 'BERT', 'LLaMA',
]
query = '(' + ' OR '.join(_search_terms) + ') lang:en'

# Try-out window: ten minutes starting at 2022-11-15 00:00.
step_size = 600  # seconds
start_time = datetime.datetime(2022, 11, 15)
step_time = start_time + timedelta(seconds=step_size)

# Value passed as `limit` to the paginator.
limit = 2000

## Trying it out

In [None]:
# Switch for the exploratory cells that follow: each of them is wrapped in
# `if tryit:` and is skipped when this is False.
tryit = True

In [None]:
if tryit:
    # Client with bearer token; pages are returned as plain dicts.
    client = tweepy.Client(
        bearer_token=BearerToken,
        return_type=dict,
        wait_on_rate_limit=False,
    )

    # Tweet object reference:
    # https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/tweet
    # non_public_metrics, organic_metrics and promoted_metrics are
    # deliberately left out of the requested tweet fields.
    search_options = dict(
        query=query,
        tweet_fields=[
            'id', 'entities', 'created_at',
            'author_id', 'text',
            'public_metrics',
            'possibly_sensitive',
            'conversation_id',
            'referenced_tweets',
            'lang',
        ],
        user_fields=[
            'id', 'name', 'username', 'created_at', 'public_metrics',
            'verified', 'description', 'entities', 'location',
        ],
        expansions=[
            'author_id',
            'referenced_tweets.id',
            'referenced_tweets.id.author_id',
        ],
        start_time=start_time,
        end_time=step_time,
        max_results=500,
        limit=limit,
    )
    paginator = tweepy.Paginator(client.search_all_tweets, **search_options)

https://github.com/tweepy/tweepy/pull/1861
https://github.com/tweepy/tweepy/issues/1843

In [None]:
if tryit:
    print(f'The paginator object is of type: {type(paginator)!s}')

    print('---------------')
    print('Lets briefly study the objects we have at hand:')

    # Only the first page is inspected; nothing further is requested.
    first_page = next(iter(paginator), None)
    if first_page is not None:
        page = first_page
        print('--------------- page object type ------------')
        print(type(page))
        rare_object = page  # kept around for ad-hoc inspection

        # The tweets themselves
        print('--------------- exploring the data ------------')
        data = page['data']
        print(type(data))
        print(len(data))
        first_tweet = data[0]
        print(type(first_tweet))
        print(first_tweet['text'])
        print(first_tweet['created_at'])

        # The expanded objects requested through `expansions`
        print('--------------- exploring the extensions ------------')
        includes = page['includes']
        print(type(includes))
        print(includes.keys())
        print(type(includes['tweets']))
        print(len(includes['tweets']))
        print(len(includes['users']))

        # Paging metadata
        print('--------------- exploring the meta ------------')
        meta = page['meta']
        print(meta)


The paginator object is of type: <class 'tweepy.pagination.Paginator'>
---------------
Lets briefly study the objects we have at hand:


Forbidden: 403 Forbidden
When authenticating requests to the Twitter API v2 endpoints, you must use keys and tokens from a Twitter developer App that is attached to a Project. You can create a project via the developer portal.

In [None]:
if tryit:
    file_name = 'testing.json'

    # Download every page BEFORE touching the output file: if the API call
    # fails, an existing testing.json is not truncated and no file handle is
    # left open.
    dict_list = list(paginator)

    # The context manager closes the file even if serialisation fails.
    with open(source_folder+file_name, "w", encoding='utf-8') as out_file:
        json.dump(dict_list, out_file, indent = 1)

In [None]:
if tryit:
    # Read back the pages saved by the previous cell; the context manager
    # closes the file even if the JSON turns out to be malformed.
    with open(source_folder+"testing.json", 'r', encoding='utf-8') as f:
        test = json.load(f)

    # Number of tweets in the first page; 0 when nothing was saved or the
    # first page carries no 'data' entry, instead of crashing.
    first_page_size = len(test[0].get('data', [])) if test else 0

    print('Remember we set the requests limit to '+str(limit)+'.')
    print('While the total requests found were of '+str(len(test))+'.')
    print('Also we set the max per request at 500, and the data has more or less '+str(first_page_size)+' twits per request'+'.')

In [None]:
if tryit:
    # For every tweet of the first page that references another tweet, print
    # the reference type and whether the referenced id equals the id found at
    # the same running position in includes['tweets'].
    included_tweets = test[0].get('includes', {}).get('tweets', [])
    index = 0
    for twit in test[0]['data']:
        try:
            reference = twit['referenced_tweets'][0]
            print(reference['type'])
            print(included_tweets[index]['id'] == reference['id'])
            index += 1
        except (KeyError, IndexError):
            # Tweet without references, or no included tweet left to compare
            # against.  Only these lookup failures are skipped; anything else
            # (including KeyboardInterrupt) is no longer silently swallowed.
            continue

In [None]:
if tryit:
    # Print the text of the first few saved tweets.
    # The previous version only broke out of the inner loop, so it went on to
    # print one extra tweet for every remaining page; both loops stop now.
    max_preview = 6
    count = 0
    for saved_page in test:
        # Pages without a 'data' entry are skipped.
        for tweet in saved_page.get('data', []):
            print(tweet['text'])
            print('-----')
            count += 1
            if count >= max_preview:
                break
        if count >= max_preview:
            break

# Now the Actual Job

## Time windows

Following this timeline: https://lifearchitect.ai/timeline/

ChatGPT was released November 30th 2022. We start the scraping on November 15.

## Define the proper function

In [None]:
# Keywords for the collection run; a tweet matches when it is in English and
# contains at least one of them.
_keywords = (
    'ChatGPT', 'Chat-GPT', 'GPT', 'GPT-3', 'GPT3', 'GPT-4', 'GPT4',
    'BARD', '(Bing AI)', 'LLMs', 'LLM', 'AI', 'AGI',
    '(artificial intelligence)', '(large language models)',
    'LaMDA', 'PaLM', 'Med-PaLM', 'BERT', 'LLaMA',
)
query = '({}) lang:en'.format(' OR '.join(_keywords))

# Value passed as `limit` when calling get_tweets.
limit = 2000

In [None]:
# Client with bearer token
# https://docs.tweepy.org/en/stable/client.html
# Module-level client used by get_tweets below.
# - return_type = dict: the rest of the notebook indexes pages as dicts
#   (page['data'], page['includes'], page['meta']).
# - wait_on_rate_limit=False: NOTE(review) per the tweepy docs the client then
#   does not sleep on rate limits itself; get_tweets sleeps after any
#   exception and its caller retries — confirm this is the intended split.
client = tweepy.Client(bearer_token=BearerToken,
                        return_type = dict,
                        wait_on_rate_limit=False)

In [None]:
def get_tweets(start_time,end_time,file_name,query=query,exec_time = False,limit=100,
               page_delay=0.75,retry_wait=901):
    """Download all pages of tweets matching `query` and save them as JSON.

    Pages are requested through ``tweepy.Paginator(client.search_all_tweets, ...)``
    using the module-level ``client``.  The pages are collected in memory and
    written to ``file_name`` (a JSON list with one entry per page) only after
    the whole download succeeded, so a failed attempt neither truncates a file
    from an earlier successful run nor leaves an empty file behind.

    Parameters
    ----------
    start_time, end_time : datetime.datetime
        Bounds of the search window, forwarded to the API call.
    file_name : str or path-like
        Destination of the JSON dump.
    query : str
        Search query; defaults to the module-level ``query`` as it was when
        this function was defined.
    exec_time : bool
        When True, print the elapsed wall-clock time before returning.
    limit : int
        Forwarded to ``tweepy.Paginator`` as ``limit``.
    page_delay : float
        Seconds to sleep after each page is received.
    retry_wait : float
        Seconds to sleep after a failed download before returning.  The
        default of 901 s is presumably meant to outlast a 15-minute
        rate-limit window — TODO confirm.

    Returns
    -------
    bool
        True if the download finished and the file was written; False if the
        download raised (the error is printed and ``retry_wait`` seconds are
        slept so the caller can simply retry).

    Raises
    ------
    OSError
        If the output file cannot be written.  This is not retried, because
        waiting does not fix a bad path.
    """
    st = time.time()
    dict_list = []
    experiment = True

    try:
        for page in tweepy.Paginator(client.search_all_tweets,
                        query=query,
                        # https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/tweet
                        tweet_fields=['id','entities', 'created_at',
                                    'author_id','text',
                                    'public_metrics',
                                    'possibly_sensitive',
                                    'conversation_id',
                                    'referenced_tweets',
                                    'lang',
                                               ],
                        user_fields=['id','name','username','created_at','public_metrics','verified',
                                    'description','entities','location'],
                        expansions=['author_id','referenced_tweets.id','referenced_tweets.id.author_id'],
                                    start_time=start_time,
                                    end_time=end_time,
                                    max_results=500,
                                    limit=limit):
            # Small pause between consecutive page requests.
            time.sleep(page_delay)
            dict_list.append(page)
    except Exception as e:
        # Any failure while downloading: report it, back off, and let the
        # caller decide whether to retry.
        print(e)
        experiment=False
        time.sleep(retry_wait)
    else:
        print('Total amount of pages: '+str(len(dict_list)))
        # Written only on success; the context manager guarantees the close.
        with open(file_name, "w",encoding='utf-8') as out_file:
            json.dump(dict_list, out_file, indent = 1)

    if exec_time:
        et = time.time()
        elapsed_time = et - st
        print('Execution time:', elapsed_time, 'seconds')

    return experiment

## Create Windows

In [None]:
# Build consecutive, non-overlapping [window_start, window_end] pairs that
# cover start_time..end_time in steps of `step_size` seconds.  A trailing
# partial window (shorter than one step) is not generated.
start_time = datetime.datetime(2022, 11, 15)
# The original note said to begin with one month; the range currently runs
# until 2023-02-27 12:00.
end_time = datetime.datetime(2023, 2, 27, 12)
step_size = 1800

_step = timedelta(seconds=step_size)
_n_windows = max((end_time - start_time) // _step, 0)
windows = [
    [start_time + k * _step, start_time + (k + 1) * _step]
    for k in range(_n_windows)
]

# Same final values the incremental loop left behind: the end of the last
# window and the end of the (not generated) window after it.
prior_time = start_time + _n_windows * _step
step_time = prior_time + _step

windows[-1]

## Test the function

In [None]:
# Smoke test of get_tweets on the first window only.  Disabled here; set
# tryit to True to actually hit the API.
tryit = False
file_name = ''.join((source_folder, 'tweets_test.json'))
if tryit:
    start_time, end_time = windows[0][:2]
    experiment = get_tweets(start_time, end_time, file_name,
                            limit=limit, exec_time=False)


In [None]:
# Read back the pages written by the smoke test; the context manager closes
# the file even if the JSON is malformed.
with open(file_name,'r',encoding='utf-8') as f:
    test = json.load(f)

# Number of tweets in the first page; 0 when the file holds no pages or the
# first page has no 'data' entry, instead of crashing.
first_page_size = len(test[0].get('data', [])) if test else 0

print('Remember we set the requests limit to '+str(2000)+'.')
print('While the total requests found were of '+str(len(test))+'.')
print('Also we set the max per request at 500, and the data has more or less '+str(first_page_size)+' twits per request'+'.')

## Doing the job

In [None]:
# Load processed windows
with open('processed_windows_list', 'rb') as fp:
    processed_windows = pickle.load(fp)
processed_windows[-1]

In [None]:
# List the position and bounds of every window that has not been processed yet.
for i, window in enumerate(windows):
    if window in processed_windows:
        continue
    print(i)
    print(window)

In [None]:
# Get data
st = time.time()

attempts = 0
successes = 0
failures = 0
for i in range(len(windows)):
    if windows[i] not in processed_windows:
        start_time,end_time = windows[i][0],windows[i][1]
        left_window = datetime.datetime.strftime(windows[i][0],"%Y-%m-%dT%H:%M:%S")
        right_window= datetime.datetime.strftime(windows[i][1],"%Y-%m-%dT%H:%M:%S")
        file_name = twits_folder+'tweets_'+left_window+'.json'
        experiment = False
        while not experiment:
            attempts+=1
            experiment=get_tweets(start_time,end_time,file_name,exec_time=False, limit=limit)
            print(windows[i])
            if experiment == False:
                failures +=1
                print(failures)
            if failures > 25:
                break
        successes +=1

        processed_windows.append([start_time,end_time])
        with open('processed_windows_list', 'wb') as fp:
            pickle.dump(processed_windows, fp)

        if failures > 25:
            print('This is the last window that worked:')
            print(windows[i-1])
            break

print(successes/attempts)

et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time/60, 'minutes')

In [None]:
# Display the number of failed download attempts counted by the collection loop.
failures

In [None]:
# Re-read the checkpoint from disk to confirm what was actually saved.
# A missing checkpoint yields an empty list instead of FileNotFoundError.
# SECURITY: pickle.load can execute arbitrary code; only load a checkpoint
# file that this notebook wrote itself.
if os.path.exists('processed_windows_list'):
    with open('processed_windows_list', 'rb') as fp:
        processed_windows = pickle.load(fp)
else:
    processed_windows = []

# Show the most recently processed window, if there is one.
processed_windows[-1] if processed_windows else None

## Notas

#### Return type = 'response'

- https://docs.tweepy.org/en/stable/response.html#tweepy.Response

Type of object: collections.namedtuple

- https://docs.python.org/3/library/collections.html#collections.namedtuple
- https://www.geeksforgeeks.org/namedtuple-in-python/

### IMPORTANTE: Es un problema con el flatten! (por eso no uso flatten ahora)

- https://docs.tweepy.org/en/stable/faq.html

Ahí dice:

How do I access includes data while using Paginator?
Paginator.flatten() flattens the data and iterates over each object.

To access includes, you’ll need to iterate through each response instead.


- https://stackoverflow.com/questions/73196236/tweepy-error-using-paginator-to-extract-media-data
- https://www.firebolt.io/glossary-items/data-flattening-and-data-unflattening