In [15]:
from experiments.github import get_github_token
from experiments.github import GitHubV3
from experiments.client import retry_requests
from experiments.logger import setup_logger
from experiments.code_search.listup_repos import has_next_link
from rich import print
import pandas as pd
import hashlib
import time
import datetime
import json

In [2]:
def create_query(keyword: str, language: str, size_min: int, size_max: int) -> str:
    """Build a code-search query string restricted by language and file size.

    Args:
        keyword: Search term, inserted verbatim at the start of the query.
        language: Value for the ``language:`` qualifier.
        size_min: Lower bound for the ``size:`` range qualifier.
        size_max: Upper bound for the ``size:`` range qualifier.

    Returns:
        A string of the form
        ``"<keyword> in:file language:<language> size:<size_min>..<size_max>"``.
    """
    qualifiers = (
        f"{keyword}",
        "in:file",
        f"language:{language}",
        f"size:{size_min}..{size_max}",
    )
    return " ".join(qualifiers)

In [3]:
# Timestamp taken once when this cell runs; used to give each run its own log file.
dt_now = datetime.datetime.now().isoformat()
# isoformat() contains ':' (e.g. 12:34:56), which is not a valid filename character
# on Windows, so the colons are replaced in the filename only; dt_now is unchanged.
logger = setup_logger(__name__, logfile="log-{}.txt".format(dt_now.replace(":", "-")))

In [4]:
# GitHub API client, authenticated with the token returned by get_github_token().
gh = GitHubV3(
    token=get_github_token(),
)
# search_code is gh.search_code passed through retry_requests together with the
# notebook logger (presumably this adds retry-on-failure; confirm in experiments.client).
search_code = retry_requests(
    gh.search_code,
    logger=logger,
)

In [12]:
# Search parameters for the size-window sweep.
keyword = '"Here Be Dragons"'
language = "c"

# Initial file-size window passed to the size: qualifier.
# (An alternative starting window that was tried: size_min = 0, size_max = 1000000.)
size_min, size_max = 4000, 5000

# Last window that returned more than 1000 hits; -1 means "no such window recorded".
prev_size_min = prev_size_max = -1

In [13]:
# Preview the query for the initial size window; the bare name on the last line
# makes the notebook display its value.
query = create_query(
    keyword=keyword,
    language=language,
    size_min=size_min,
    size_max=size_max,
)
query

'"Here Be Dragons" in:file language:c size:4000..5000'

In [16]:
# Sweep the file-size axis window by window. For each window the hit count is probed
# first; windows with more than 1000 hits are halved (only ten pages of 100 items are
# requested per window below), and the items of each accepted window are dumped to
# Here_Be_Dragons_C/<loop>.json. The output directory must already exist.
loop = 0
while True:
    query = create_query(keyword, language, size_min, size_max)
    logger.debug(f"try: {query}")

    # Probe with a single item just to read total_count for this window.
    res = search_code(query, per_page=1, page=1, text_match=False)
    logger.debug("Request succeeded.")

    res_json = res.json()
    total_count = res_json.get("total_count")
    if total_count is None:
        # Fail with a readable message instead of a TypeError in the comparison below.
        raise RuntimeError(f"search response has no total_count: {res_json!r}")

    if total_count == 0:
        if prev_size_min == -1 or prev_size_max == -1:
            # An empty window that was not produced by narrowing is treated as
            # "no larger files exist", so the sweep ends here.
            break
        # The previous window had more than 1000 hits, but narrowing it further
        # left zero. Fall back to the previous window and accept exceeding 1000.
        size_min = prev_size_min
        size_max = prev_size_max
        # Rebuild the query: it still describes the narrowed (empty) window.
        query = create_query(keyword, language, size_min, size_max)
        logger.warning(f"WARN: total_count is over 1000.({prev_total_count})")
    elif total_count > 1000:
        if size_min == size_max:
            # The window cannot be narrowed any further; fetch it as it is.
            logger.warning(f"WARN: total_count is over 1000.({total_count})")
        else:
            # Remember this window, halve it, and probe the count again.
            prev_size_min, prev_size_max = size_min, size_max
            prev_total_count = total_count
            size_max = (size_min + size_max) // 2
            time.sleep(1)
            continue

    # Fetch up to 10 pages x 100 items for the accepted window.
    items = []
    for page in range(1, 11):
        logger.debug(f"try: {query}, page: {page}")
        res = search_code(query, per_page=100, page=page, text_match=True)
        logger.debug("Request succeeded.")
        res_json = res.json()
        # Tolerate a payload without "items" instead of failing in extend(None).
        items.extend(res_json.get("items") or [])

        if not has_next_link(res.headers):
            break
        time.sleep(1)

    # Write this window's items to a file. ensure_ascii=False emits non-ASCII
    # characters as-is, so the encoding is pinned rather than left to the platform.
    d = {"items": items}
    with open(f"Here_Be_Dragons_C/{loop}.json", "w", encoding="utf-8") as f:
        json.dump(d, f, indent=4, ensure_ascii=False)
    logger.debug(f"json dump: {loop}.json")
    loop += 1

    # Move to the next window. Size ranges are inclusive, so start one byte above
    # the window just fetched to avoid fetching boundary-sized files twice.
    size_min = size_max + 1
    # Grow the upper bound tenfold, but never below size_min (size_max * 10 stays 0
    # when size_max is 0, which would otherwise produce an inverted window).
    size_max = max(size_max * 10, size_min)
    prev_size_min = -1
    prev_size_max = -1

KeyboardInterrupt: 