In [1]:
import pyspark
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session named "top" and keep a handle on its
# SparkContext, which later cells use to build RDDs.
spark = (
    SparkSession.builder
    .appName("top")
    .getOrCreate()
)
sc = spark.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/04 17:16:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
21/12/04 17:16:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [73]:
import os

import requests

# Read the GitHub access token from the GITHUB_TOKEN environment variable
# instead of pasting it into the notebook, so the secret is never saved with
# the code. Unauthenticated requests still work, with a lower rate limit.
token = os.environ.get("GITHUB_TOKEN", "")
# Only send an Authorization header when a token is actually available; an
# empty token would otherwise be sent as the incomplete credential "token ".
headers = {"Authorization": f"token {token}"} if token else {}

url = "https://api.github.com/organizations"
orgs_num = 5
params = {"per_page": orgs_num}

response = requests.get(url, params=params, headers=headers, timeout=30)
# Fail fast on HTTP errors (bad credentials, rate limiting, ...) so that an
# error payload is never parallelized as if it were a list of organizations.
response.raise_for_status()
orgs_rdd = sc.parallelize(response.json())
orgs_df = spark.createDataFrame(orgs_rdd)
orgs_df.printSchema()

root
 |-- avatar_url: string (nullable = true)
 |-- description: string (nullable = true)
 |-- events_url: string (nullable = true)
 |-- hooks_url: string (nullable = true)
 |-- id: long (nullable = true)
 |-- issues_url: string (nullable = true)
 |-- login: string (nullable = true)
 |-- members_url: string (nullable = true)
 |-- node_id: string (nullable = true)
 |-- public_members_url: string (nullable = true)
 |-- repos_url: string (nullable = true)
 |-- url: string (nullable = true)



In [114]:
def mapping_repo(repo: dict) -> dict:
    """Reduce a repository record to the four fields used by the notebook.

    Args:
        repo: Mapping that provides the keys ``id``, ``name``,
            ``stargazers_count`` and a nested ``owner`` mapping with ``login``.

    Returns:
        A new dict with the keys ``id``, ``org_name``, ``repo_name`` and
        ``stars_count``, in that order.

    Raises:
        KeyError: If any of the required keys is missing from ``repo``.
    """
    # Look the values up in the same order as the output keys so that a
    # missing key is reported for the first absent field.
    repo_id = repo["id"]
    owner_login = repo["owner"]["login"]
    name = repo["name"]
    stars = repo["stargazers_count"]
    return dict(
        id=repo_id,
        org_name=owner_login,
        repo_name=name,
        stars_count=stars,
    )


def get_all_repositories_of_organization(url: str) -> list[dict]:
    """Fetch every page of a repository listing and map each entry.

    Pages are requested one after another, starting at page 1, until the
    endpoint returns an empty page. Each repository on a page is passed
    through ``mapping_repo``. The request headers are read from the
    module-level ``headers`` variable.

    Args:
        url: Listing endpoint to paginate over, e.g. an organization's
            ``repos_url``.

    Returns:
        The mapped repositories from all pages, in the order received.

    Raises:
        requests.HTTPError: If any page request returns an HTTP error status.
        requests.RequestException: On connection problems or timeouts.
    """
    repos_data: list[dict] = []
    # Ask for the largest page size the GitHub REST API documents (100) to
    # keep the number of round-trips low.
    params = {"page": 1, "per_page": 100}
    while True:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        # Without this check an error payload (a non-empty dict) would be
        # treated as a page of repositories and fail later with an obscure
        # TypeError inside mapping_repo.
        response.raise_for_status()
        page = response.json()
        if not page:
            # An empty page marks the end of the listing.
            break
        repos_data.extend(map(mapping_repo, page))
        params["page"] += 1

    return repos_data

In [69]:
# sample: show the first collected row and the URL stored in it
for row in orgs_df.select("repos_url").collect()[:1]:
    print(row, row["repos_url"], sep="\n")

Row(repos_url='https://api.github.com/orgs/errfree/repos')
https://api.github.com/orgs/errfree/repos


In [116]:
# Gather the mapped repositories of every organization into one flat list.
repos_data = []
for row in orgs_df.select("repos_url").collect():
    repos_data.extend(get_all_repositories_of_organization(row["repos_url"]))

# Spell the schema out instead of letting Spark infer it from the dict rows:
# inference needs at least one row, so it cannot handle an empty result, and
# the inferred types depend on the values Spark happens to sample. The fields
# mirror the dictionaries produced by mapping_repo.
repos_schema = "id: bigint, org_name: string, repo_name: string, stars_count: bigint"

repos_rdd = sc.parallelize(repos_data)
repos_df = spark.createDataFrame(repos_rdd, schema=repos_schema)
repos_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- org_name: string (nullable = true)
 |-- repo_name: string (nullable = true)
 |-- stars_count: long (nullable = true)



In [117]:
# Preview the repositories table: first 20 rows, long cell values truncated.
repos_df.show(n=20, truncate=True)

+-------+----------+------------------+-----------+
|     id|  org_name|         repo_name|stars_count|
+-------+----------+------------------+-----------+
| 789949|   errfree|             test1|          1|
|2277888|   errfree|              test|          1|
|    273|engineyard|             eycap|        117|
|  19448|engineyard|           mongrel|         44|
|  21933|engineyard|      vertebra-erl|         34|
|  22381|engineyard|       vertebra-rb|         59|
|  22382|engineyard|           xmpp4em|         13|
|  23375|engineyard|      vertebra-gem|         45|
|  26548|engineyard|      vertebra-xen|         47|
|  39764|engineyard|puppet-daemontools|          6|
|  63084|engineyard|          vertebra|        161|
|  69864|engineyard|            sequel|          2|
|  75517|engineyard|            natter|         76|
|  81020|engineyard|      vertebra-ref|         43|
|  84100|engineyard|   rails-2.2.2-app|         10|
| 116800|engineyard|         loudmouth|         20|
| 116801|eng

In [122]:
# Rank the repositories from most to least starred and preview the result.
stars_descending = repos_df["stars_count"].desc()
sorted_repos_df = repos_df.orderBy(stars_descending)
sorted_repos_df.show(n=20, truncate=True)

+--------+--------------+--------------------+-----------+
|      id|      org_name|           repo_name|stars_count|
+--------+--------------+--------------------+-----------+
|   52287|collectiveidea|         delayed_job|       4667|
|     363|collectiveidea|             audited|       2962|
|11670330|collectiveidea|          interactor|       2946|
|     912|collectiveidea|  awesome_nested_set|       2254|
|  133715|    engineyard|    ey-cloud-recipes|       1002|
| 2012691|collectiveidea|           json_spec|        916|
|12173225|collectiveidea|    interactor-rails|        394|
| 1801464|collectiveidea|delayed_job_activ...|        318|
|     374|collectiveidea|           graticule|        300|
|     376|collectiveidea|              tinder|        259|
|42527001|collectiveidea|  inside_the_machine|        258|
|  372208|    engineyard|       rails_metrics|        257|
|     364|collectiveidea|  acts_as_geocodable|        205|
|  889878|collectiveidea| delayed_job_mongoid|        17

In [131]:
# Number of top-ranked repositories to bring back to the driver.
top_num = 20
# Keep only the first `top_num` rows of the ranking and collect them as a
# list of Row objects.
top_repos = sorted_repos_df.limit(top_num).collect()
top_repos

[Row(id=52287, org_name='collectiveidea', repo_name='delayed_job', stars_count=4667),
 Row(id=363, org_name='collectiveidea', repo_name='audited', stars_count=2962),
 Row(id=11670330, org_name='collectiveidea', repo_name='interactor', stars_count=2946),
 Row(id=912, org_name='collectiveidea', repo_name='awesome_nested_set', stars_count=2254),
 Row(id=133715, org_name='engineyard', repo_name='ey-cloud-recipes', stars_count=1002),
 Row(id=2012691, org_name='collectiveidea', repo_name='json_spec', stars_count=916),
 Row(id=12173225, org_name='collectiveidea', repo_name='interactor-rails', stars_count=394),
 Row(id=1801464, org_name='collectiveidea', repo_name='delayed_job_active_record', stars_count=318),
 Row(id=374, org_name='collectiveidea', repo_name='graticule', stars_count=300),
 Row(id=376, org_name='collectiveidea', repo_name='tinder', stars_count=259),
 Row(id=42527001, org_name='collectiveidea', repo_name='inside_the_machine', stars_count=258),
 Row(id=372208, org_name='engineya