In [22]:
"""
https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
"""
import json
import logging
import os
from argparse import ArgumentParser, Namespace
from pathlib import Path
from typing import Any, Dict, List, cast
import pandas as pd
from tqdm.auto import tqdm

from searchtweets import ResultStream, gen_request_parameters, load_credentials

# Configure logging once for the whole script. The verbosity can be overridden
# through the LOGLEVEL environment variable and falls back to INFO.
logging.basicConfig(
    format="[%(asctime)s]:[%(processName)-11s][%(levelname)-s]:[%(name)s] %(message)s",
    level=os.environ.get("LOGLEVEL", "INFO"),
)
# All functions in this file log through the root logger.
logger = logging.getLogger()


# PAGE_SIZE, a value between 10 and 100, is passed as the "max_results" pagination parameter
# (https://developer.twitter.com/en/docs/twitter-api/pagination)
PAGE_SIZE = 100
# MAX_TWEETS is a parameter specific to the search_tweets python library. It caps how many tweets
# are retrieved for the entire session, i.e., across multiple pages:
MAX_TWEETS = 135000


def batchify(iterable, batch_size=1):
    """
    Yield consecutive slices of ``iterable``, each holding at most ``batch_size`` items.

    ``iterable`` must support ``len()`` and slicing. The final slice is shorter
    when the length is not a multiple of ``batch_size``.
    """
    total = len(iterable)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        # Clamp the upper bound so the last batch never reaches past the end.
        if stop > total:
            stop = total
        yield iterable[start:stop]


def rehydrate_tweets(args: Namespace, tweet_ids: List[str]) -> None:
    """
    Rehydrate tweets from tweet_ids to json with data about each tweet, including
    embedded images, videos, etc.

    Credentials are read from ``./search_tweets.yaml`` (relative to the current
    working directory). Every page returned by the API is appended as one JSON
    line to ``<args.output_dir>/<args.name>.json``.

    Args:
        args: Parsed options; ``output_dir`` (a Path) and ``name`` are read here.
        tweet_ids: Tweet ids, as strings, to look up in batches.

    Raises:
        AssertionError: If the credentials file is missing or the output file
            already exists (existing output is never appended to by a new run).
    """
    config_path = Path("./search_tweets.yaml").resolve()
    filename = (args.output_dir / f"{args.name}.json").resolve()
    assert config_path.exists(), str(config_path)
    assert not filename.exists(), str(filename)
    search_args = load_credentials(filename=config_path)
    # load_credentials() fails to load the "endpoint" from the search_tweets.yaml file,
    # so set it manually here:
    base_endpoint = "https://api.twitter.com/2/tweets/"
    search_args["endpoint"] = base_endpoint
    # Never write credentials to the log: mask every value whose key looks secret.
    sensitive_markers = ("token", "secret", "password", "key")
    loggable_args = {
        key: "<redacted>" if any(m in str(key).lower() for m in sensitive_markers) else value
        for key, value in search_args.items()
    }
    logger.info(f"Search args: {loggable_args}")

    tweets_per_batch = 100
    for batch in batchify(tweet_ids, tweets_per_batch):
        # Build API Query
        query_str = get_search_phrase(batch)
        # Build the endpoint from the fixed base for every batch. Appending to
        # search_args["endpoint"] in place would make each request URL also contain
        # the ids of all previous batches.
        # NOTE(review): the ids are appended as a path segment here; confirm against the
        # Twitter API v2 tweet-lookup docs whether they should be sent as `?ids=` instead.
        batch_args = {**search_args, "endpoint": base_endpoint + query_str}
        search_query = gen_request_parameters(
            "",
            results_per_call=PAGE_SIZE,
            media_fields="media_key,type,duration_ms,height,preview_image_url,public_metrics,url,width,alt_text",
            place_fields="full_name,id,country,country_code,geo,name,place_type",
            tweet_fields="attachments,author_id,context_annotations,created_at,entities,geo,id,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets,source,text,withheld",
            user_fields="description,location,public_metrics",
            expansions="attachments.media_keys,author_id,geo.place_id",
            end_time="2021-08-16 00:00",
        )
        logger.info(f"Search query: {search_query}")

        # Get Tweets
        rs = ResultStream(
            request_parameters=search_query,
            max_tweets=MAX_TWEETS,
            max_pages=2,
            output_format="a",
            **batch_args,
        )
        logger.info(f"ResultStream: {str(rs)}")
        logger.info("")
        for page in rs.stream():
            save_page(page, args, filename)


# def save_counts_page(page, args):
#     filename = (args.output_dir / f"{args.name}_counts.json").resolve()

#     with open(filename, "a") as f:
#         logger.info(type(page))
#         logger.info(page)
#         tweet_count = 0
#         for count_per_day in page["data"]:
#             tweet_count += count_per_day["tweet_count"]
#             f.write(json.dumps(count_per_day, sort_keys=True) + "\n")
#     return tweet_count


def save_page(page, args, filename: Path):
    """
    Append ``page`` to ``filename`` as a single line of JSON with sorted keys.

    ``args`` is accepted but not used by this function.
    """
    with open(filename, "a") as out_file:
        serialized = json.dumps(page, sort_keys=True)
        out_file.write(f"{serialized}\n")


def get_search_phrase(tweet_id_batch):
    """Return the ids in ``tweet_id_batch`` (strings) joined by commas, with no spaces."""
    separator = ","
    return separator.join(tweet_id_batch)


def main(args):
    """
    Load tweet ids from the CSV at ``args.csv_path`` and rehydrate them.

    The CSV must contain a ``tweet_id`` column. Only the first 2000 ids are
    used. ``args`` is forwarded unchanged to ``rehydrate_tweets``.

    Raises:
        AssertionError: If ``./search_tweets.yaml`` or the CSV file does not exist.
    """
    # Prepare paths:
    config_path = Path("./search_tweets.yaml").resolve()
    csv_path = args.csv_path.resolve()
    logger.info(f"config_path: {config_path}")
    logger.info(f"csv_path: {csv_path}")
    # exists must be *called*; the bare bound method is always truthy, so the
    # check could never fail.
    assert config_path.exists(), str(config_path)
    assert csv_path.exists(), str(csv_path)
    df = pd.read_csv(
        csv_path,
        dtype={"tweet_id": int},
    )
    # Ids are parsed as int (so they are not read as floats) and then converted to
    # strings; only the first 2000 are kept.
    tweet_ids = df.tweet_id.astype(str).values.tolist()[:2000]
    # Debug output of the id type; skipped for an empty CSV to avoid an IndexError.
    if tweet_ids:
        print(type(tweet_ids[0]))
    logger.info(f"Total tweet_ids: {len(tweet_ids)}")
    rehydrate_tweets(args, tweet_ids)


if __name__ == "__main__":
    # Each entry is (flag, keyword arguments for ArgumentParser.add_argument).
    option_specs = (
        (
            "--output_dir",
            {"type": Path, "default": Path("../data/tweets").resolve()},
        ),
        (
            "--name",
            {
                "type": str,
                "default": "twitter_comms_dataset",
                "help": "Used as part of output file name.",
            },
        ),
        (
            "--csv_path",
            {
                "type": Path,
                "default": "../data/tweets/twitter_comms_dataset.csv",
                "help": "Path to .csv file containing tweet_id's to rehydrate.",
            },
        ),
    )
    parser = ArgumentParser()
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    # An explicit empty argument list is parsed, so the defaults above always apply
    # and sys.argv is ignored.
    args = parser.parse_args(args=[])
    main(args)

[2022-03-18 17:12:58,068]:[MainProcess][INFO]:[root] config_path: /home/gbiamby/proj/semafor/other_poj/twitter_comms/scripts/search_tweets.yaml
[2022-03-18 17:12:58,069]:[MainProcess][INFO]:[root] csv_path: /home/gbiamby/proj/semafor/other_poj/twitter_comms/data/tweets/twitter_comms_dataset.csv
[2022-03-18 17:12:58,889]:[MainProcess][INFO]:[root] Total tweet_ids: 2000
[2022-03-18 17:12:58,892]:[MainProcess][INFO]:[root] Search args: {'bearer_token': '<redacted>', 'endpoint': 'https://api.twitter.com/2/tweets/', 'extra_headers_dict': None}
[2022-03-18 17:12:58,892]:[MainProcess][INFO]:[root] Search query: {"query":"1409530436481687559,1420581355176480770,1415615378546466819,1425871126014615558,1409480196944760833,1396357926990667784,1397088197054697473,1415037360706834438,1422531324997521408,1424781156554186757,1395738789868306434,1410109615732264966,1422331634947436544,142343266490

<class 'str'>


HTTPError: 