In [None]:
import requests
import time
from datetime import datetime
from IPython.display import JSON
import pandas as pd
import pickle
import configparser as cp

In [None]:
"""
    Reads the config file and returns the config object
"""
config = cp.ConfigParser()
config.read('config.ini')

In [None]:
# Both REST calls share the same base URL and basic-auth credentials.
_api_base = config['API']['URL']
_api_auth = (config['API']['USER'], config['API']['KEY'])

# requests waits forever by default; give up after this many seconds instead.
_API_TIMEOUT = 30

# "search" entry of the rate-limit endpoint's "resources" object.
_rate_limit_response = requests.get(
    _api_base + config['API']['RATE_LIMIT'],
    auth=_api_auth,
    timeout=_API_TIMEOUT,
)
# Surface bad credentials / HTTP errors here rather than as a KeyError on
# "resources" in the next line.
_rate_limit_response.raise_for_status()
rate_limit = _rate_limit_response.json()["resources"]["search"]

# Decoded JSON body of the endpoint configured under REPOS.
_repos_response = requests.get(
    _api_base + config['API']['REPOS'],
    auth=_api_auth,
    timeout=_API_TIMEOUT,
)
_repos_response.raise_for_status()
# NOTE(review): the name `json` is kept because other cells may rely on it, but
# it would shadow the stdlib `json` module if that is ever imported.
json = _repos_response.json()


## Splitting up the Queries

Because GitHub returns at most 1000 items per search query \[1\], we need to create queries that each return no more than 1000 items but together still cover the entire dataset.

Previous attempts \[2\] to solve this exact problem constrained their queries by the number of stars of each repository.
This method only works as long as there are fewer than 1000 repositories with the same number of stars.

This was then mitigated by using the creation date of the repository as a second constraint.
As described in their corresponding blog article \[3\], this solution works by:

* First, querying the GitHub GraphQL API for the result count, i.e. how many items a given query would return
* If the count is above 1000 results, it takes the creation dates of the youngest and the oldest repository and splits the query into two halves of that time range
* Then the size of these two queries is checked again, and if they are still above 1000 results the process is repeated until every query returns no more than 1000 results

In [None]:
def to_string(stamp):
  """Format a Unix timestamp as the date string used in the GitHub queries.

  The result ends in ``Z`` (UTC), so the timestamp is converted with
  ``time.gmtime``. Converting it to local time would shift every query window
  by the machine's UTC offset.

  Args:
    stamp: Seconds since the Unix epoch.

  Returns:
    The timestamp formatted as ``YYYY-MM-DDTHH:MM:SSZ`` in UTC.
  """
  return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(stamp))

def split_querys(start, end):
  """Recursively split [start, end] into time windows of at most 1000 repos.

  Sends ``count_query`` for the window to the GitHub GraphQL API. If more than
  1000 repositories match, the window is halved and both halves are processed
  recursively; otherwise the window is appended to the global ``sections``
  list and the overall progress is printed.

  Args:
    start: Unix timestamp (seconds) at which the window begins.
    end: Unix timestamp (seconds) at which the window ends.

  Raises:
    requests.HTTPError: If the API answers with an HTTP error status.
    RuntimeError: If the GraphQL response reports errors or carries no data.

  Side effects:
    Reads the globals ``config``, ``count_query`` and ``sections``. Initialises
    ``amount_of_repos`` and ``repos_done`` on the first call (while
    ``amount_of_repos`` is None) and increases ``repos_done`` for every stored
    window. Sleeps until the rate limit resets when fewer than 10 requests
    remain.
  """
  # Local import: the file only imports `datetime` from the datetime module.
  from datetime import timezone

  global amount_of_repos
  global repos_done

  r = requests.post(
          'https://api.github.com/graphql',
          headers={'Authorization': 'bearer '+ config['API']['KEY']},
          json={"query": count_query % (to_string(start), to_string(end)),
          "variables":{}},
          timeout=30,  # requests would otherwise wait forever on a stalled connection
      )
  r.raise_for_status()

  # Decode the body once instead of re-parsing it for every field access.
  payload = r.json()
  if payload.get("errors") or payload.get("data") is None:
    # GraphQL reports most failures in the body, not via the HTTP status.
    raise RuntimeError("GitHub GraphQL query failed: %s" % (payload.get("errors"),))
  data = payload["data"]
  repo_count = data["search"]["repositoryCount"]

  # On the first run we get the total number of repos
  # This is used to calculate the progress of the script
  if amount_of_repos is None:
    amount_of_repos = repo_count
    repos_done = 0

  # If we are close to the rate limit we sleep until the rate limit resets
  if data["rateLimit"]["remaining"] < 10:
    # resetAt is a UTC timestamp ("...Z"), so it has to be compared against the
    # current UTC time, not against the naive local time.
    reset_time = datetime.strptime(
        data["rateLimit"]["resetAt"], '%Y-%m-%dT%H:%M:%SZ'
    ).replace(tzinfo=timezone.utc)

    while True:
      seconds_till_reset = (reset_time - datetime.now(timezone.utc)).total_seconds()
      if seconds_till_reset <= 0:
        break
      print ("Sleeping till %s... %d minutes and %d seconds left..." % ( reset_time, *divmod(seconds_till_reset, 60)))
      # Never sleep past the reset time.
      time.sleep(min(5, seconds_till_reset))

  # If the number of repos in the time range is greater than 1000
  # (and the window still spans more than a single second)
  if repo_count > 1000 and start < end:
    # We split the range in half and do the same query on each half
    # This will continue recursively until the number of repos is at most 1000
    mid = (start + end)//2
    split_querys(start, mid)
    # The `a..b` range qualifier is inclusive on both ends, so the second half
    # starts one second later; otherwise repos created exactly at `mid` would
    # be counted (and later fetched) twice.
    split_querys(mid + 1, end)

  else:
    if repo_count > 1000:
      # A one-second window cannot be halved any further; store it anyway
      # instead of recursing forever.
      print(f"Warning: {to_string(start)} still contains {repo_count} repos and cannot be split further.")

    # If we finally get a range with at most 1000 repos we add the timestamps to the sections list
    sections.append((start, end))
    repos_done = repos_done + repo_count
    # Guard against an empty result set (total of 0 repos).
    progress = repos_done/amount_of_repos*100 if amount_of_repos else 100.0
    print(f"Working on {to_string(start)} to {to_string(end)}. Progress: {progress:.2f}%")
# GraphQL document sent for every time window. It asks for the rate-limit state
# (cost, remaining, resetAt) and for the number of repositories matching the
# search; the two %s placeholders take the window's start and end date strings.
count_query = ''' query { 
                   rateLimit {
                    cost
                    remaining
                    resetAt
                  }
                  search(
                    query:"is:public, stars:>15, created:%s..%s"
                    type: REPOSITORY, first: 1) {
                    repositoryCount
                  }
                } '''

# Outer bounds of the search as Unix timestamps:
#   start - 2007-01-01 (Github was founded in 2008 so this will cover all repos)
#   end   - a fixed timestamp (2023-03-07) rather than time.time(), so that
#           repeated runs stay consistent
start, end = 1_167_609_600, 1_678_209_714

# split_querys appends one (start, end) tuple here per window it accepts.
sections = []

# None signals split_querys that its first response carries the total count.
amount_of_repos = None

split_querys(start, end)

## Bibliography

[1] “Resources in the REST API,” GitHub Docs. https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28 (accessed Mar. 07, 2023).

[2] danvk, “How can I get a list of all public GitHub repos with more than 20 stars?,” Stack Overflow, Feb. 02, 2020. https://stackoverflow.com/q/60022429 (accessed Mar. 07, 2023).


[3] D. Vanderkam, “GitHub Stars and the h-index: A Journey,” Medium, Feb. 10, 2020. https://danvdk.medium.com/github-stars-and-the-h-index-a-journey-c104cfe37da6 (accessed Mar. 06, 2023).