# Imports

In [1]:
import time
from pytz import UTC
import requests
from urllib.parse import quote
from datetime import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
import itertools
import sys
import concurrent.futures
from dotenv import load_dotenv
import os

load_dotenv()

True

# Step - I: Fetching Users 
**(>100 followers from Singapore)**

In [2]:
# GitHub token is read from the environment (load_dotenv() ran in the imports cell).
key = os.environ.get("my_token")
# Authorization header sent with every GitHub API request in this notebook.
header = dict(Authorization=f"Bearer {key}")

In [3]:
# Search for users located in Singapore with more than 100 followers,
# most-followed first, and walk every result page via the Link header.
q = quote("followers:>100 location:Singapore")
url = f"https://api.github.com/search/users?q={q}&sort=followers"
l = []  # one decoded JSON page per request
while url:
    r = requests.get(url, headers=header, timeout=10)
    # Fail loudly on an error response instead of storing an error payload
    # that has no "items" key and would break the next cell.
    r.raise_for_status()
    url = r.links.get('next', {}).get('url')
    l.append(r.json())
    # Header values are strings, so convert before comparing.
    remaining = int(r.headers.get("X-RateLimit-Remaining", 0))
    if url and remaining <= 1:
        # X-RateLimit-Reset is an epoch timestamp; wait until the window
        # resets (never a negative duration), plus a one-second margin.
        wait = float(r.headers.get("X-RateLimit-Reset", 0)) - time.time()
        time.sleep(max(wait, 0) + 1)
    sys.stdout.write(f"{remaining},")

29,28,27,26,25,24,23,22,21,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,

In [4]:
# Take the "items" list (the matched user records) out of every result page.
l_items = [page["items"] for page in l]

In [5]:
# Flatten the per-page user lists into a single list of login names.
l_logins = [user["login"] for user in itertools.chain.from_iterable(l_items)]
len(l_logins)

698

# Step - II: Fetching User Data

In [6]:
def fetch_user_data(login):
    """Download the profile of a single GitHub user.

    Args:
        login: GitHub login name, interpolated into the /users/{login} URL.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: if the response status is an error
            (raised by raise_for_status()).
    """
    endpoint = f"https://api.github.com/users/{login}"
    with requests.Session() as http:
        reply = http.get(endpoint, headers=header, timeout=10)
        reply.raise_for_status()
        # Pause after each successful request (presumably to stay under the
        # API rate limit).
        time.sleep(2)
        payload = reply.json()
    return payload

def fetch_all_users_data(logins):
    """Fetch profile data for many users concurrently.

    One fetch_user_data call per login is submitted to a pool of 5 worker
    threads. Results are collected as the calls complete, so the returned
    list is in completion order, not in the order of `logins`.

    Args:
        logins: iterable of GitHub login names.

    Returns:
        List of the values returned by fetch_user_data. Logins whose fetch
        raised are reported on stdout and left out of the list.
    """
    all_data = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(fetch_user_data, login): login for login in logins}
        # total= gives tqdm a real progress bar and ETA instead of a bare counter.
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            try:
                all_data.append(future.result())
            except Exception as e:
                # Name the failing login so it can be identified and retried.
                print(f"Error fetching data for user {futures[future]}: {e}")
    return all_data

# Download the profile of every login found in Step I (network-bound; the
# recorded run below took a little over 5 minutes).
l_dash = fetch_all_users_data(l_logins)

698it [05:16,  2.21it/s]


In [7]:
# Number of profiles retrieved; fetch_all_users_data leaves out failed
# fetches, so this should equal len(l_logins).
len(l_dash)

698

## Converting fetched data to DataFrame and then to users.csv

In [8]:
# Flatten the list of user JSON objects into one table (one row per user).
df = pd.json_normalize(l_dash)

In [9]:
# Keep only the profile columns that are exported. .copy() makes df_final an
# independent frame so the assignments below never write into a view of df.
df_final = df.loc[:,["login","name","company","location","email","hireable","bio","public_repos","followers","following","created_at"]].copy()
# Remove every "@" as a literal character (not a regular expression).
df_final["company"] = df_final["company"].str.replace("@", "", regex=False)
# Trim and upper-case real strings only; any missing value (None or NaN)
# becomes NaN instead of raising AttributeError on .strip().
df_final["company"] = df_final["company"].apply(lambda x: x.strip().upper() if isinstance(x, str) else np.nan)

In [12]:
# Write the user table to disk; index=False leaves the DataFrame index out of the file.
df_final.to_csv("users1.csv",index=False)

# Step - III: Fetching User Repositories
**First 500 most recently pushed repositories**

In [13]:
def get_repo_details(repo):
    """Reduce one GitHub repository record to the flat set of fields kept here.

    Args:
        repo: dict describing a repository, as decoded from the API response.

    Returns:
        dict with the keys login, full_name, created_at, stargazers_count,
        watchers_count, language, has_projects, has_wiki and license_name.
        A field that is absent from `repo` is returned as None; this includes
        login when "owner" is missing or None, and license_name when
        "license" is missing or None.
    """
    # `or {}` covers both a missing key and an explicit None value, so the
    # nested lookups below cannot raise AttributeError.
    owner = repo.get("owner") or {}
    license_info = repo.get("license") or {}
    return {
        "login": owner.get("login"),
        # NOTE(review): stored as "full_name" but read from the record's
        # "name" field; confirm this is the intended source.
        "full_name": repo.get("name"),
        "created_at": repo.get("created_at"),
        "stargazers_count": repo.get("stargazers_count"),
        "watchers_count": repo.get("watchers_count"),
        "language": repo.get("language"),
        "has_projects": repo.get("has_projects"),
        "has_wiki": repo.get("has_wiki"),
        "license_name": license_info.get("name"),
    }

def fetch_user_repos(user, max_pages=5, per_page=100):
    """Fetch a user's most recently pushed repositories, page by page.

    Args:
        user: GitHub login name.
        max_pages: maximum number of result pages to request (default 5).
        per_page: page size requested from the API (default 100), so the
            defaults yield at most 500 repositories.

    Returns:
        List of dicts produced by get_repo_details, in the order returned by
        the API. If a request fails, the error is printed and the
        repositories gathered so far are returned.
    """
    user_repos = []
    pages_fetched = 0
    url = f"https://api.github.com/users/{user}/repos"
    params = {"sort": "pushed", "direction": "desc", "per_page": per_page}
    while pages_fetched < max_pages and url:
        pages_fetched += 1
        try:
            resp = requests.get(url, headers=header, params=params, timeout=10)
            resp.raise_for_status()
            url = resp.links.get('next', {}).get('url')
            # The "next" link already carries the query string, so sending
            # params again would duplicate every parameter in the URL.
            params = None
            user_repos.extend(get_repo_details(repo) for repo in resp.json())
            # Pause between requests (presumably to stay under the rate limit).
            time.sleep(2.5)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching repos for user {user}: {e}")
            break
    return user_repos

def fetch_all_users_repos(users):
    """Fetch repositories for many users concurrently.

    One fetch_user_repos call per user is submitted to a pool of 5 worker
    threads, and the per-user lists are concatenated as the calls complete,
    so repositories are grouped per user but users appear in completion order.

    Args:
        users: iterable of GitHub login names.

    Returns:
        Flat list of repository dicts for all users. Users whose fetch
        raised are reported on stdout and contribute nothing.
    """
    all_repos = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(fetch_user_repos, user): user for user in users}
        # total= gives tqdm a real progress bar and ETA instead of a bare counter.
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            try:
                all_repos.extend(future.result())
            except Exception as e:
                # Name the failing user so it can be identified and retried.
                print(f"Error fetching repos for user {futures[future]}: {e}")
    return all_repos

# Fetch repositories for every login found in Step I (network-bound; the
# recorded run below took almost 11 minutes).
l_repos = fetch_all_users_repos(l_logins)

698it [10:50,  1.07it/s]


## Checking if all required repositories are fetched

In [14]:
# Total number of repository records collected across all users.
len(l_repos)

54800

In [15]:
# Expected number of fetched repositories: each user's public_repos count,
# capped at the 500 fetched per user. clip(upper=500) also counts users with
# exactly 500 repositories, which a strict `< 500` / `> 500` split leaves out.
df_final["public_repos"].clip(upper=500).sum()

np.int64(54799)

**Hooray!!! The two counts above agree to within one repository (54800 fetched vs. 54799 expected), which shows that all required repos are fetched**

## Converting fetched data to DataFrame and then to repositories.csv

In [16]:
# Build a table from the list of per-repository dicts (one row per repository).
df_repos = pd.json_normalize(l_repos)

In [17]:
# Write the repository table to disk; index=False leaves the DataFrame index out of the file.
df_repos.to_csv("repositories1.csv",index=False)

# Actionable Insight for users

In [18]:
# Five most common primary languages across the fetched repositories.
language_counts = df_repos["language"].value_counts()
language_counts.head(5)

language
JavaScript    7418
Python        5937
TypeScript    2671
Java          2372
Go            2198
Name: count, dtype: int64