In [1]:
import json
import os

from gql import gql, Client
from gql.transport.aiohttp import AIOHTTPTransport
import pandas as pd

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import json_tuple,from_json,get_json_object

# Initialize Spark.
# getOrCreate() hands back the already-running session when there is one, so
# this cell can be re-executed in the same kernel; constructing
# SparkContext('local') a second time raises
# "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

In [2]:
async def fetch_github_all_repos(cursor=None):
    """Fetch one page (up to 100 items) of the GitHub repository search for "Python".

    Args:
        cursor: Pagination cursor taken from a previous page's
            ``pageInfo.endCursor``. A falsy value (``None`` or ``""``) requests
            the first page.

    Returns:
        Whatever ``session.execute`` returns for the query; the caller reads
        ``result['search']`` with the keys ``repositoryCount``, ``pageInfo``
        and ``edges``.

    Raises:
        RuntimeError: if the ``GITHUB_TOKEN`` environment variable is not set.
    """
    # Credentials come from the environment so they never end up in the saved
    # notebook or in version control.
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        raise RuntimeError(
            "Set the GITHUB_TOKEN environment variable to a GitHub personal access token."
        )

    transport = AIOHTTPTransport(
        url="https://api.github.com/graphql",
        headers={"authorization": "Bearer " + token},
    )

    # Build the optional `after:` argument. json.dumps emits a double-quoted,
    # escaped string literal, so a quote or backslash inside the cursor cannot
    # break out of the GraphQL string.
    cursorSTR = ""
    if cursor:
        cursorSTR = 'after:{0}, '.format(json.dumps(str(cursor)))

    # Using `async with` on the client will start a connection on the transport
    # and provide a `session` variable to execute queries on this connection.
    # fetch_schema_from_transport is disabled: with it enabled the client
    # requests the whole API schema every time this function runs (once per
    # page), which is not needed to execute a fixed query.
    async with Client(
        transport=transport, fetch_schema_from_transport=False,
    ) as session:
        # Execute single query
        query = gql(
            """
            query {
              search(query: "Python", %s type: REPOSITORY,  first: 100) { 
                repositoryCount
                pageInfo {
                  startCursor
                  hasNextPage
                  endCursor
                }
                edges {
                  node {
                    ... on Repository {
                      name
                      descriptionHTML
                    }
                  }
                }
              }
            }
        """ % (cursorSTR)
        )

        result = await session.execute(query)
        return result

In [4]:
# Main procedure to retreive everything

current_page = 0
currentToken = None
hasNextPage = True
fetched = 0
frames = pd.DataFrame()
    
# get all pages until there are none left
while (hasNextPage) and (fetched < 10000):
    

    response = await fetch_github_all_repos(currentToken) # "Y3Vyc29yOjIwMA=="

    # MetaData for the processing
    pageInfo = response['search']['pageInfo']
    hasNextPage = pageInfo['hasNextPage'] == True # Because this comes as a string
    currentToken = pageInfo['endCursor']
    
    # Fetching the info
    edges = pd.json_normalize(response['search']['edges'])
    retrieved = edges.shape[0]
    fetched += retrieved
    frames = frames.append(edges)
    current_page += 1
    
    #Logging
    if current_page == 1: # Just the first time
        print( "There are " + str(response['search']['repositoryCount']) + " repos matching that contition." )
    
    print( "New page fetched! token: " + currentToken + ", cuerrent page: " + str(current_page) + ", with " + str(retrieved) + " new recods retrieved successfully."  )
    
print( "Process ended! " )
print( str(frames.shape[0]) + " records fetched.")
frames

There are 1723006 repos matching that contition.
New page fetched! token: Y3Vyc29yOjEwMA==, cuerrent page: 1, with 100 new recods retrieved successfully.
New page fetched! token: Y3Vyc29yOjIwMA==, cuerrent page: 2, with 100 new recods retrieved successfully.
New page fetched! token: Y3Vyc29yOjMwMA==, cuerrent page: 3, with 100 new recods retrieved successfully.
New page fetched! token: Y3Vyc29yOjQwMA==, cuerrent page: 4, with 100 new recods retrieved successfully.
New page fetched! token: Y3Vyc29yOjUwMA==, cuerrent page: 5, with 100 new recods retrieved successfully.
New page fetched! token: Y3Vyc29yOjYwMA==, cuerrent page: 6, with 100 new recods retrieved successfully.
New page fetched! token: Y3Vyc29yOjcwMA==, cuerrent page: 7, with 100 new recods retrieved successfully.
New page fetched! token: Y3Vyc29yOjgwMA==, cuerrent page: 8, with 100 new recods retrieved successfully.
New page fetched! token: Y3Vyc29yOjkwMA==, cuerrent page: 9, with 100 new recods retrieved successfully.
New pa

Unnamed: 0,node.name,node.descriptionHTML
0,Python,<div>All Algorithms implemented in Python</div>
1,Python,<div>My Python Examples</div>
2,Python,<div>Python脚本。模拟登录知乎， 爬虫，操作excel，微信公众号，远程开机</div>
3,Python,<div>最良心的 Python 教程：</div>
4,python,<div>Show Me the Code Python version.</div>
...,...,...
95,Coursera-Machine-Learning,<div>Coursera Machine Learning - Python code</...
96,opencv-python-blueprints,<div>M. Beyeler (2015). OpenCV with Python Blu...
97,python-django-learning,"<div>\n<g-emoji class=""g-emoji"" alias=""beer"" f..."
98,fuckitpy,<div>The Python error steamroller.</div>
