In [33]:
import csv
import os
import time

import requests

# GitHub API base URL
BASE_URL = "https://api.github.com"

# Personal access token, read from the GITHUB_TOKEN environment variable so
# the secret never lives in the notebook or its version history.
# NOTE(review): any token that was ever saved in this notebook should be
# treated as leaked and revoked in the GitHub settings.
GITTOKEN = os.environ.get("GITHUB_TOKEN", "")

# Headers for authentication. Without a token the requests go out
# unauthenticated instead of sending a malformed Authorization header.
HEADERS = {"Authorization": f"token {GITTOKEN}"} if GITTOKEN else {}

In [34]:
# Function to fetch users in Shanghai with over 200 followers
def get_users_in_shanghai():
    """Collect profiles of GitHub users located in Shanghai with over 200 followers.

    Pages through the user search endpoint 100 results at a time and fetches
    each hit's full profile through ``get_user_details``.

    Returns:
        list: One profile dict per user; users for which
        ``get_user_details`` returned a falsy value are skipped.
    """
    users = []
    page = 1
    while True:
        url = f"{BASE_URL}/search/users?q=location:Shanghai+followers:>200&per_page=100&page={page}"
        response = requests.get(url, headers=HEADERS)

        # Rate limiting must be checked before the payload is inspected:
        # a rate-limited response carries an error message instead of 'items',
        # so it would otherwise look like the end of the results and silently
        # truncate the user list. GitHub signals rate limits with 403 or 429.
        if response.status_code in (403, 429):
            print("Rate limit reached. Sleeping for 60 seconds.")
            time.sleep(60)
            continue  # retry the same page

        data = response.json()

        # Stop when there are no more results (or the API returned an error payload)
        if 'items' not in data or len(data['items']) == 0:
            break

        # Get detailed info for each user
        for item in data['items']:
            user_details = get_user_details(item["login"])
            if user_details:
                users.append(user_details)

        # Next page
        page += 1

    return users

In [35]:
# Function to get detailed information for a specific user
def get_user_details(username):
    """Fetch the full profile of a single GitHub user.

    Args:
        username: The user's login name.

    Returns:
        dict | None: The decoded profile on HTTP 200; ``None`` (after printing
        a failure message) for any other non-rate-limit status.
    """
    url = f"{BASE_URL}/users/{username}"

    # Retry in a loop rather than by recursion so a long rate-limit window
    # cannot grow the call stack.
    while True:
        response = requests.get(url, headers=HEADERS)

        # Handle rate limiting (GitHub answers with 403 or 429)
        if response.status_code in (403, 429):
            print("Rate limit reached. Sleeping for 60 seconds.")
            time.sleep(60)
            continue

        if response.status_code == 200:
            return response.json()

        print(f"Failed to fetch details for user {username}")
        return None

In [36]:
# Function to get up to 500 most recently pushed repositories for a user
def get_repos_for_user(username, max_repos=500):
    """Fetch up to ``max_repos`` of a user's most recently pushed repositories.

    Args:
        username: The owner's login name.
        max_repos: Upper bound on the number of repositories returned.

    Returns:
        list[dict]: One flat record per repository with the keys ``name``,
        ``login``, ``full_name``, ``created_at``, ``stargazers_count``,
        ``watchers_count``, ``language``, ``has_projects``, ``has_wiki``,
        ``license_name`` and ``html_url``. If a page cannot be fetched, the
        repositories collected so far are returned.
    """
    repos = []
    page = 1
    while len(repos) < max_repos:
        url = f"{BASE_URL}/users/{username}/repos?sort=pushed&per_page=100&page={page}"
        response = requests.get(url, headers=HEADERS)

        # Handle rate limiting the same way the other fetchers do; retry the page.
        if response.status_code in (403, 429):
            print("Rate limit reached. Sleeping for 60 seconds.")
            time.sleep(60)
            continue

        # Any other failure returns an error object rather than a list of
        # repositories; indexing into it would raise, so stop here instead.
        if response.status_code != 200:
            print(f"Failed to fetch repositories for user {username}")
            break

        user_repos = response.json()

        # Break for no more results
        if not user_repos:
            break

        # Append repositories, respecting max limit (the remaining budget is
        # always positive here because of the loop condition).
        for repo in user_repos[:max_repos - len(repos)]:
            license_info = repo.get("license")
            repos.append({
                "name": repo["name"],
                "login": repo["owner"]["login"],
                "full_name": repo["full_name"],
                "created_at": repo["created_at"],
                "stargazers_count": repo["stargazers_count"],
                "watchers_count": repo["watchers_count"],
                "language": repo.get("language", ""),
                "has_projects": repo["has_projects"],
                "has_wiki": repo["has_wiki"],
                "license_name": license_info["name"] if license_info else None,
                "html_url": repo["html_url"]
            })

        # Next page
        page += 1

    return repos

In [37]:
# Function to save users and repos data to CSV
def save_to_csv(users, repos):
    """Write the collected users and repositories to ``users.csv`` and ``repos.csv``.

    Both files are created (or overwritten) in the current working directory
    as UTF-8. Optional fields pass through ``format_value``; a non-empty
    company is additionally stripped, has leading ``@`` characters removed
    and is upper-cased.

    Args:
        users: Iterable of user profile dicts.
        repos: Iterable of repository record dicts.
    """
    user_columns = [
        "login", "name", "company", "location", "email", "hireable", "bio",
        "public_repos", "followers", "following", "created_at"
    ]
    repo_columns = [
        "login", "full_name", "created_at", "stargazers_count",
        "watchers_count", "language", "has_projects", "has_wiki", "license_name"
    ]

    def clean_company(raw_company):
        # Empty values are passed through untouched; everything else is normalised.
        cleaned = format_value(raw_company)
        if not cleaned:
            return cleaned
        return cleaned.strip().lstrip('@').upper()

    # Users CSV
    with open('users.csv', 'w', newline='', encoding='utf-8') as users_file:
        users_out = csv.writer(users_file)
        users_out.writerow(user_columns)
        for profile in users:
            company_cell = clean_company(profile.get("company"))
            row = [
                profile["login"],
                format_value(profile.get("name")),
                company_cell,
                format_value(profile.get("location")),
                format_value(profile.get("email")),
                format_value(profile.get("hireable")),
                format_value(profile.get("bio")),
                profile["public_repos"],
                profile["followers"],
                profile["following"],
                profile["created_at"],
            ]
            users_out.writerow(row)

    # Repositories CSV
    with open('repos.csv', 'w', newline='', encoding='utf-8') as repos_file:
        repos_out = csv.writer(repos_file)
        repos_out.writerow(repo_columns)
        for record in repos:
            row = [
                record["login"],
                record["full_name"],
                record["created_at"],
                record["stargazers_count"],
                record["watchers_count"],
                format_value(record.get("language")),
                format_value(record["has_projects"]),
                format_value(record["has_wiki"]),
                format_value(record.get("license_name")),
            ]
            repos_out.writerow(row)

In [38]:
def main():
    """Run the whole pipeline: find users, collect their repositories, write the CSVs."""
    # All Shanghai users with over 200 followers
    shanghai_users = get_users_in_shanghai()

    # Up to 500 repositories per user, pausing between users to avoid rate limits
    all_repos = []
    for profile in shanghai_users:
        all_repos += get_repos_for_user(profile["login"], max_repos=500)
        time.sleep(1)

    # Persist both tables
    save_to_csv(shanghai_users, all_repos)
    print("Data saved to users.csv and repos.csv")

In [31]:
# Function to convert booleans and replace None with empty strings
def format_value(value):
    """Normalise a value for CSV output.

    Booleans become the strings ``'true'`` / ``'false'``, ``None`` becomes an
    empty string, and every other value is returned unchanged.
    """
    if value is None:
        return ''
    if isinstance(value, bool):
        return 'true' if value else 'false'
    return value

In [39]:
# Run the full scrape only when this code executes as the main module
# (as it does in a notebook cell or a direct script run), not on import.
if __name__ == "__main__":
    main()

Data saved to users.csv and repos.csv


In [48]:
# Function to save README
def save_README():
    """Write a short project description to ``README.md`` in the working directory.

    Any existing README.md is overwritten. A confirmation message is printed
    once the text has been written.
    """
    # The lines are kept flush-left: Markdown treats text indented by four or
    # more spaces as a code block, which would stop these rendering as bullets.
    readme_lines = [
        "- This project scrapes data from the GitHub API to analyze users and repositories in Shanghai.",
        "- It focuses on users with over 200 followers and their public repositories.",
        # The file names must match what save_to_csv actually writes.
        "- The data is stored in two CSV files: users.csv and repos.csv.",
    ]

    # Explicit UTF-8 so the output does not depend on the platform default encoding.
    with open('README.md', 'w', encoding='utf-8') as readme_file:
        readme_file.write("\n".join(readme_lines) + "\n")
        print("Data saved to README.md")

In [49]:
save_README()

Data saved to README.md
