In [None]:
# Mount Google Drive at /content/drive; the save functions in this notebook
# write their output files underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import csv
import os
import textwrap
import time

import requests

# Set GitHub API BASE URL
git_url = "https://api.github.com"

# Personal Access Token from GitHub.
# Read from the GITHUB_TOKEN environment variable so a real token never has to
# be saved inside the notebook; the placeholder is only used when the variable
# is not set.
git_token = os.environ.get("GITHUB_TOKEN", "Test Token")

# Authentication headers sent with every API request
headers = {
    "Authorization": f"token {git_token}"
}

In [None]:
# Scrape users by passing Shanghai and followers required
def git_users(locationval='Shanghai', followerslimit=200, perpage=100):
    """Collect detailed profiles of GitHub users matching a location/follower search.

    Pages through the ``/search/users`` endpoint and calls
    ``get_user_details`` for every login returned.

    Args:
        locationval: Value for the ``location:`` search qualifier.
        followerslimit: Only users with more followers than this are matched.
        perpage: Page size requested from the search endpoint.

    Returns:
        A list of user-detail dicts (one per user whose details were fetched
        successfully). Users whose detail lookup returned ``None`` are skipped.
    """
    max_retries = 5  # consecutive rate-limit retries allowed before giving up
    retries_left = max_retries
    users = []
    page = 1
    while True:
        url = f"{git_url}/search/users?q=location:{locationval}+followers:>{followerslimit}&per_page={perpage}&page={page}"
        response = requests.get(url, headers=headers)

        # Rate limiting must be checked BEFORE looking at the payload: a
        # rate-limited response has no 'items' key, so testing for missing
        # items first would end the scrape early instead of retrying.
        if response.status_code in (403, 429):
            if retries_left == 0:
                print(f"Search still rate limited after {max_retries} retries; stopping.")
                break
            retries_left -= 1
            print("Limit reached. Sleeping for 60 seconds.")
            time.sleep(60)
            continue

        # Any other non-success status means there is nothing more to read
        if response.status_code != 200:
            print(f"Search request failed with status {response.status_code}; stopping.")
            break

        retries_left = max_retries
        items = response.json().get('items') or []

        # Break if no results
        if len(items) == 0:
            break

        # Get additional detailed information for each of the user
        for item in items:
            user_details = get_user_details(item["login"])
            if user_details:
                users.append(user_details)

        # A short page is the last page; skip the extra empty request
        if len(items) < perpage:
            break

        # Next page
        page += 1

    return users

In [None]:
# Scrape additional detailed user information for users
def get_user_details(username):
    """Fetch the full profile for ``username`` from ``/users/{username}``.

    While the API answers with status 403, waits 60 seconds and asks again.

    Returns:
        The decoded JSON profile on status 200, otherwise ``None`` (after
        printing a failure message).
    """
    profile_url = f"{git_url}/users/{username}"

    # Keep requesting until the answer is something other than a 403
    while True:
        reply = requests.get(profile_url, headers=headers)
        if reply.status_code != 403:
            break
        print("Limit reached. Sleeping for 60 seconds.")
        time.sleep(60)

    if reply.status_code != 200:
        print(f"Failed to fetch details for user {username}")
        return None

    return reply.json()

In [None]:
# Scrape up to 500 most recently pushed repositories for users
def get_repos_for_user(username, max_repos=500):
    """Fetch up to ``max_repos`` repositories of ``username``, most recently pushed first.

    Args:
        username: GitHub login whose repositories are listed.
        max_repos: Upper bound on the number of repositories returned.

    Returns:
        A list of dicts with the keys ``name``, ``login``, ``full_name``,
        ``created_at``, ``stargazers_count``, ``watchers_count``, ``language``,
        ``has_projects``, ``has_wiki``, ``license_name`` and ``html_url``.
        If a request fails, the repositories collected so far are returned.
    """
    max_retries = 5  # consecutive rate-limit retries allowed before giving up
    retries_left = max_retries
    repos = []
    page = 1
    perpage = 100
    while len(repos) < max_repos:
        url = f"{git_url}/users/{username}/repos?sort=pushed&per_page={perpage}&page={page}"
        response = requests.get(url, headers=headers)

        # Rate limited: wait and retry the same page
        if response.status_code in (403, 429):
            if retries_left == 0:
                print(f"Failed to fetch repositories for user {username}")
                break
            retries_left -= 1
            print("Limit reached. Sleeping for 60 seconds.")
            time.sleep(60)
            continue

        # Error responses carry a JSON object, not a list of repositories;
        # iterating it would yield string keys and crash on repo["name"].
        if response.status_code != 200:
            print(f"Failed to fetch repositories for user {username}")
            break

        retries_left = max_retries
        user_repos = response.json()

        # Break if no results (or the payload is not a repository list)
        if not isinstance(user_repos, list) or len(user_repos) == 0:
            break

        # Add to repositories and also limiting to max 500 recent repos
        for repo in user_repos:
            if len(repos) >= max_repos:
                break
            license_info = repo.get("license")
            repos.append({
                "name": repo["name"],
                "login": repo["owner"]["login"],
                "full_name": repo["full_name"],
                "created_at": repo["created_at"],
                "stargazers_count": repo["stargazers_count"],
                "watchers_count": repo["watchers_count"],
                "language": repo.get("language", ""),
                "has_projects": repo["has_projects"],
                "has_wiki": repo["has_wiki"],
                "license_name": license_info["name"] if license_info else None,
                "html_url": repo["html_url"]
            })

        # A short page is the last page; skip the extra empty request
        if len(user_repos) < perpage:
            break

        # Next page
        page += 1

    return repos

In [None]:
# Save users and repos data to csv
def save_to_csv(users, repos, drive_path='/content/drive/My Drive/TDS Proj 1/'):
    """Write ``users.csv`` and ``repositories.csv`` into ``drive_path``.

    Args:
        users: Iterable of user-detail dicts; each must contain ``login``,
            ``public_repos``, ``followers``, ``following`` and ``created_at``.
            ``name``, ``company``, ``location``, ``email``, ``hireable`` and
            ``bio`` are optional.
        repos: Iterable of repository dicts; each must contain ``login``,
            ``full_name``, ``created_at``, ``stargazers_count``,
            ``watchers_count``, ``has_projects`` and ``has_wiki``.
            ``language`` and ``license_name`` are optional.
        drive_path: Directory prefix the file names are appended to. It is
            concatenated as-is, so it must end with a path separator.

    Booleans are written as 'true'/'false' and missing values as empty
    strings (via ``format_value``). A missing ``hireable`` is written as
    'false'.
    """
    # Create users.csv
    with open(drive_path + 'users.csv', 'w', newline='', encoding='utf-8') as file:
        create_file = csv.writer(file)
        # Update header row in users.csv
        create_file.writerow([
            "login", "name", "company", "location", "email", "hireable", "bio",
            "public_repos", "followers", "following", "created_at"
        ])
        # Loop each users to update all scraped information into users.csv
        for user in users:
            company = format_value(user.get("company"))  # Get company value
            if company:  # Check if company value is not empty
                # Cleaning up company names: trim, drop leading '@', uppercase
                company = company.strip().lstrip('@').upper()

            hireable = format_value(user.get("hireable"))  # Get hireable value
            if not hireable:  # Check if hireable value is empty
                hireable = 'false'  # Update false for all empty cells

            # Format with true and false for booleans
            # Format with empty strings for null
            create_file.writerow([
                user["login"], format_value(user.get("name")),
                company, format_value(user.get("location")),
                format_value(user.get("email")), hireable,
                format_value(user.get("bio")), user["public_repos"],
                user["followers"], user["following"],
                user["created_at"]
            ])

    # Create repositories.csv
    with open(drive_path + 'repositories.csv', 'w', newline='', encoding='utf-8') as file:
        create_file = csv.writer(file)
        # Update header row in repositories.csv
        create_file.writerow([
            "login", "full_name", "created_at", "stargazers_count",
            "watchers_count", "language", "has_projects", "has_wiki", "license_name"
        ])
        # Loop each repo to update all scraped information into repositories.csv
        for repo in repos:
            # Format with true and false for booleans
            # Format with empty strings for null
            create_file.writerow([
                repo["login"], repo["full_name"],
                repo["created_at"], repo["stargazers_count"], repo["watchers_count"],
                format_value(repo.get("language")), format_value(repo["has_projects"]),
                format_value(repo["has_wiki"]), format_value(repo.get("license_name"))
            ])

In [None]:
def main():
    """Run the full scrape: users, then their repositories, then CSV export."""
    # Get all users in Shanghai with over 200 followers
    users = git_users()

    # Collect each user's repositories (at most 500 per user)
    repos = []
    for profile in users:
        repos += get_repos_for_user(profile["login"], max_repos=500)
        # Pause between users to stay clear of rate limits
        time.sleep(1)

    # Persist both data sets
    save_to_csv(users, repos)
    print("Data saved to users.csv and repositories.csv")

In [None]:
# Function to convert booleans and replace None with empty strings
def format_value(value):
    """Normalise a value for CSV output.

    ``None`` becomes an empty string, ``True``/``False`` become the strings
    'true'/'false', and every other value is returned unchanged.
    """
    if value is None:
        return ''
    if not isinstance(value, bool):
        return value
    return 'true' if value else 'false'

In [None]:
# Run the full scrape-and-export pipeline when this cell/script is executed directly
if __name__ == "__main__":
    main()

Data saved to users.csv and repositories.csv


In [None]:
# Function to save README
def save_README(drive_path='/content/drive/My Drive/TDS Proj 1/'):
    """Write the project's README.md into ``drive_path``.

    Args:
        drive_path: Directory prefix the file name is appended to. It is
            concatenated as-is, so it must end with a path separator.

    The text is dedented before writing: Markdown treats lines indented by
    four or more spaces as a code block, so writing the literal with its
    source indentation would render the whole README as preformatted text.
    """
    readme_text = textwrap.dedent("""
        Scrape / Data Collection Process:

        -  To scrape/collect the required  data from GitHub, I leveraged the GitHub REST API to gather details about users in Shanghai with over 200 followers and their most recently pushed repositories with maximum limit to 500 repositories

        -  I used my personal access token for authentication to access GITHUB API to avoid rate-limiting issues. The code includes error handling to pause execution in case rate limits are exceeded

        -  To retrieve all relevant users, pagination was implemented to handle the API's limitation of returning a maximum of 100 users per request. The code iterates through pages of results until all matching users are obtained

        -  For each identified user, a separate API call was made to retrieve detailed user information

        -  The 'company' field was cleaned by removing leading/trailing spaces and any '@' symbol at the beginning.

        -  Company names were then converted to uppercase

        -  Additionally, all boolean fields were standardized to contain either 'true' or 'false' values

        -  All extracted user and repository data were stored in separate CSV files named 'users.csv' and 'repositories.csv', respectively



        Some of the Interesting and surprising facts that I see in the data post analyzing are:

        -  While Shanghai's GitHub users are employed across various companies.  Among them users with over 200 followers who have provided company information, roughly 5% are affiliated with ByteDance, a leading Chinese internet technology company

        -  While 29% of the 742 total users are open to hiring opportunities, the majority (71%) appear to be primarily using the platform for skill development and learning

        -  Peng-zhihui leads in followers with 80,714 and 59 repositories, followed closely by ruanyf with 79,328 followers and 72 repositories

        -  In contrast to Peng-zhihui and ruanyf, who have a large following, Hengle possesses the most repositories, with a remarkable count of 11,057

        -  It is surprising to find that the stargazer and watcher counts are uniform across all users

        -  Between 2008 and 2013, the number of repositories created by Shanghai users skyrocketed by over 95 times, from a mere 20 to a staggering 1900+. This reflects the rapid adoption of GitHub and a surge in open-source contributions from the Shanghai developer community

        -  The year 2018 marked the peak of repository creation for Shanghai users, with more new repositories added. This suggests a high level of development activity and innovation within the community during that period.  In the past few years (2019-2024), there has been a noticeable decline in the number of new repositories created, dropping by nearly 39% from the 2018 peak. This trend might indicate a shift in development practices or a saturation point in repository creation



        Recommendation for developers basis my analysis:

        -  Shanghai's developers initially favored JavaScript, Python, and Java, accounting for almost 44% of repositories where language data information exists. However, a clear shift is evident in the last two years, with Python and TypeScript gaining traction as the preferred languages for new projects, even amidst a diverse landscape of 87 different programming languages utilized by developers. Developers should consider learning and utilizing these languages in their projects. This can increase the visibility and relevance of their work within the developer community

        -  Some of the most popular open sources licenses used by developers are MIT License and Apache License 2.0.  MIT and Apache licenses are permissive licenses and widely used in open-source projects. Key differences lie in their handling of patents and trademarks, where Apache has an advantage with more explicti protection

        -  The vast majority of repositories (98%) have projects enabled, and a significant portion of those projects (86%) also include a wiki.  Developers should utilize these features to organize and document their work effectively to improve project clarity and make it easier for other to contribute

        -  Repository names and descriptions play a cruicial role in discoverability and attach collaborators, so need to make it clear, concise, and relevant to the project's content

        -  Only 29% of the Developers updated hiring preference.  Developers who are open to hiring should actively update their profiles and highlight their skills and experience to attract potential employers

        -  Stay informed about emerging technologies, trends, and best practices within the developer community. Follow influential developers and organizations, participate in relevant discussions, and attend workshops or conferences

        """)

    # Explicit UTF-8 so the output does not depend on the platform default encoding
    with open(drive_path + 'README.md', 'w', encoding='utf-8') as R:
        R.write(readme_text)
    print("Data saved to README.md")

In [None]:
save_README()

Data saved to README.md
