# Prepare data

Downloads the most-starred repositories from GitHub using the GraphQL API.

In [None]:
# pip3 install gql
# pip3 install requests_toolbelt

import os
from datetime import datetime, timezone

from dateutil.relativedelta import relativedelta
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

# GitHub token read from the GH_TOKEN environment variable; when it is unset
# or empty, no Authorization header is sent.
access_token = os.environ.get('GH_TOKEN', "")
transport = RequestsHTTPTransport(
    url="https://api.github.com/graphql",
    headers={"Authorization": f"Bearer {access_token}"} if access_token else {},
)
client = Client(transport=transport, fetch_schema_from_transport=True)
# Lower bound of the commit-history window used by search_repos. The format
# string ends in a literal "Z" (UTC), so the instant must be taken in UTC
# rather than in the machine's local time zone.
year_and_a_half_ago = (
    datetime.now(timezone.utc) - relativedelta(years=1, months=6)
).strftime("%Y-%m-%dT%H:%M:%SZ")

def search_repos(endCursor=None, debug=False):
    """Run one page of the repository search against the GitHub GraphQL API.

    The query text is assembled by plain string concatenation from three
    fixed fragments, an optional ``after:`` pagination argument and a
    ``since:`` argument built from the module-level ``year_and_a_half_ago``.

    Args:
        endCursor: Pagination cursor to resume after, or ``None`` to omit
            the ``after:`` argument altogether.
        debug: When true, print the assembled query text before sending it.

    Returns:
        The value returned by ``client.execute`` for the assembled query.
    """
    # Pagination argument: left out entirely when no cursor is supplied.
    if endCursor is None:
        cursor_clause = ""
    else:
        cursor_clause = f'after: "{endCursor}",'

    # Restricts the commit-history count to the recent window.
    history_clause = f'since: "{year_and_a_half_ago}"'

    query_parts = (
        """
    {
      search(query: "stars:>1 sort:stars", type: REPOSITORY, """,
        cursor_clause,
        """ first: 40) {
        pageInfo {
          endCursor
        }
        repositoryCount
        edges {
          node {
            ... on Repository {
              # Full name
              nameWithOwner
              url
              primaryLanguage {
                name
              }
              # The number of kilobytes this repository occupies on disk.
              diskUsage
              stargazers {
                totalCount
              }
              defaultBranchRef {
                name
                target {
                  ... on Commit {
                    history(""",
        history_clause,
        """) {
                      totalCount
                    }
                  }
                }
              }
              # Github GraphQL API currently don't support getting contributors of a repo.
              # It is not documented who is the mentionable user in repository context
              # But it is the closest value to contributor
              mentionableUsers {
                totalCount
              }
            }
          }
        }
      }
    }
    """,
    )
    document = "".join(query_parts)

    if debug:
        print(document)
    return client.execute(gql(document))


In [None]:
import pandas as pd

# Columns of the resulting dataset, in output order.
# NOTE(review): earlier revisions of this cell also put "Bus Factor" and
# "Time" placeholders (both 0) into every row, but those keys are not declared
# here; add them to this list if a downstream step expects the columns.
dataset_columns = [
    "Repository",
    "Clone URL",
    "Stars",
    "Size",
    "Mentionable Users",
    "Commits",
    "Language",
]


def _commit_count(node):
    """Return the commit count reported for the node's default branch, or 0.

    Each level is looked up defensively because the API may return null for
    ``defaultBranchRef`` / ``target`` (presumably for empty repositories --
    confirm against the GitHub schema), and ``history`` is only requested
    for targets that are commits.
    """
    branch = node.get("defaultBranchRef") or {}
    target = branch.get("target") or {}
    history = target.get("history") or {}
    return history.get("totalCount", 0)


# Collect plain dicts first and build the frame once at the end; growing a
# DataFrame one row at a time re-allocates on every insert.
rows = []
endCursor = None
for page in range(0, 20):
    result = search_repos(endCursor)
    endCursor = result["search"]["pageInfo"]["endCursor"]
    print(page, endCursor)
    # No cursor means there is nothing further to page through.
    if endCursor is None:
        break

    for edge in result["search"]["edges"]:
        node = edge["node"]
        language = node["primaryLanguage"]
        rows.append(
            {
                "Repository": node["nameWithOwner"],
                "Clone URL": node["url"] + ".git",
                "Stars": node["stargazers"]["totalCount"],
                "Size": node["diskUsage"],
                "Mentionable Users": node["mentionableUsers"]["totalCount"],
                "Commits": _commit_count(node),
                "Language": language["name"] if language is not None else "None",
            }
        )

github_df = pd.DataFrame(rows, columns=dataset_columns)

for column in ["Stars", "Size", "Mentionable Users", "Commits"]:
    github_df[column] = github_df[column].astype('int')

# Repositories can show up more than once across pages; keep the first
# occurrence of each.
github_df.drop_duplicates(subset=['Repository'], inplace=True)
github_df.describe()


In [None]:
# Timestamp for the output file name, taken in UTC so that the literal "Z"
# suffix in the format string is accurate.
now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
# Hand the path to pandas instead of writing the CSV string through a
# text-mode handle: pandas then controls newline handling itself, which
# avoids doubled carriage returns on platforms that translate "\n".
github_df.to_csv("dataset_" + now + ".csv")
