# 02-01 : Scrape API Connection

Test scraping Twitter data using [ScraperAPI](https://www.scraperapi.com).

In [1]:
import os
from time import sleep
from datetime import datetime, timedelta
import json
import pandas as pd
import requests
from typing import List, Dict, Any
from pprint import pprint
from tqdm.notebook import tqdm

In [2]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

## Scrape Functions

In [3]:
def scrape_twitter(query:str, 
                   num:int, 
                   date_range_start:datetime,
                   date_range_end:datetime,
                   timeout:float=70) -> Dict:
    """Scrape Twitter for tweets matching query.

    Args:
        query: Query to search for.
        num: Number of tweets to return.
        date_range_start: Start of the date window; sent as YYYY-MM-DD.
        date_range_end: End of the date window; sent as YYYY-MM-DD.
        timeout: Seconds to wait for the HTTP response before giving up.
            The default follows the timeout the provider is understood to
            recommend for proxied requests -- TODO confirm against the docs.

    Returns:
        Dict: Decoded JSON body of the API response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        ValueError: If the response body is not valid JSON.
    """
    scrape_url = 'https://api.scraperapi.com/structured/twitter/search'

    # set the parameters
    # NOTE: the keys must be the literal parameter names; using the datetime
    # objects themselves as keys sends the dates under meaningless names.
    params = {
        'api_key': os.getenv('SCRAPE_API_KEY'),
        'query': query,
        'num': num,
        'date_range_start': date_range_start.strftime('%Y-%m-%d'),
        'date_range_end': date_range_end.strftime('%Y-%m-%d')
    }

    # make the request (bounded, so a stalled connection cannot hang forever)
    response = requests.get(scrape_url, params=params, timeout=timeout)

    # fail with the real HTTP error rather than an opaque JSON decode error
    # when the API returns a non-JSON error page
    response.raise_for_status()

    # return the response
    return response.json()

# test the api
#response = scrape_twitter('vodacom tobi', 2, datetime(2023, 1, 1), datetime(2023, 1, 2))
#pprint(response)

In [4]:
def scrape_tweets(query:str, 
                   start_date:datetime,
                   end_date:datetime,
                   output_path:str,
                   retry_count=3) -> None:
    """Scrape tweets one day at a time and save each day's result as JSON.

    Days are visited in reverse chronological order, from ``start_date``
    back to ``end_date`` (both inclusive). ``start_date`` must therefore be
    the later of the two dates; otherwise no day is scraped.

    Each day is written to ``{output_path}/02-01_{YYYY-MM-DD}.json``. A day
    that still fails after all retries is reported and skipped so that the
    remaining days are still processed.

    Args:
        query: Query to search for.
        start_date: Most recent day to scrape.
        end_date: Oldest day to scrape.
        output_path: Directory for the JSON files; created if missing.
        retry_count: Number of retries after the first failed attempt.
    """
    # Create the folder up front: otherwise a missing directory makes every
    # day fail on open() only after the API request has already been spent.
    os.makedirs(output_path, exist_ok=True)

    # Loop backwards one day at a time
    date_list = []
    current_date = start_date
    while current_date >= end_date:
        date_list.append(current_date)
        current_date -= timedelta(days=1)

    # scrape the tweets
    for current_date in tqdm(date_list):
        date_str = current_date.strftime('%Y-%m-%d')
        retries = 0
        while True:
            try:
                # No trailing comma after the call: it would wrap the
                # response in a 1-tuple and change the saved JSON shape.
                tweets = scrape_twitter(
                    query=query,
                    num=1000,
                    date_range_start=current_date,
                    date_range_end=current_date + timedelta(days=1))

                # save the results as a json file
                with open(f'{output_path}/02-01_{date_str}.json', 'w') as f:
                    json.dump(tweets, f)

                break
            except Exception as e:
                # Best-effort by design: report the failure and carry on.
                print('.', end='')
                retries += 1
                if retries > retry_count:
                    # Out of retries: name the failed day and move on
                    # without a pointless final back-off sleep.
                    print(f'{date_str}: {e}')
                    break
                # linear back-off before the next attempt
                sleep(retries * 2)

# # test the function
# query = '( (vodacom OR #vodacom OR @vodacom) AND (tobi OR #tobi or @tobi) )'
# scrape_tweets(query='vodacom tobi', 
#               start_date=datetime(2023, 2, 12), 
#               end_date=datetime(2023, 2, 11),
#               output_path='../../data/raw/twitter')

## Scrape Tweets

In [5]:
# Scrape one JSON file per day, walking backwards from 2023-07-31 to 2021-01-01.
scrape_tweets(query='vodacom tobi', 
              start_date=datetime(2023, 7, 31), 
              end_date=datetime(2021, 1, 1),
              output_path='../../data/raw/twitter')

  0%|          | 0/942 [00:00<?, ?it/s]

......Expecting value: line 1 column 1 (char 0)
....Expecting value: line 1 column 1 (char 0)
..

KeyboardInterrupt: 