In [1]:
import os
import re
import time

import pandas as pd
import requests

# GitHub API settings
API_BASE = "https://api.github.com"

# The token is read from the GITHUB_TOKEN environment variable so that no
# credential is ever stored in the notebook source or its saved outputs.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()

HEADERS = {
    "Accept": "application/vnd.github+json"
}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"
else:
    # No Authorization header is sent in this case, rather than a bogus one.
    print("GITHUB_TOKEN is not set; GitHub API requests will be sent without authentication.")

def clean_company_name(company):
    """Normalize a company name so equal employers compare equal.

    Trims surrounding whitespace, removes a single leading ``@``, trims again
    (so ``"@ acme"`` does not keep the space that followed the ``@``), and
    converts the result to uppercase.

    Args:
        company: Raw company string; may be ``None`` or empty.

    Returns:
        The cleaned, upper-cased name, or ``""`` when ``company`` is falsy.
    """
    if not company:
        return ""
    # Only the first leading @ is dropped; "@@x" becomes "@X".
    without_at = re.sub(r'^@', '', company.strip(), count=1)
    return without_at.strip().upper()

def fetch_users():
    """Fetch profile details for users located in Beijing with over 500 followers.

    Pages through the user-search endpoint 100 results at a time and, for each
    hit, requests the URL in the hit's ``url`` field to read the profile fields
    listed below. Paging stops at the first non-200 search response, the first
    network error, or the first empty page. Users whose detail request fails
    are reported and skipped.

    Returns:
        list[dict]: One dict per user with keys ``login``, ``name``,
        ``company`` (passed through ``clean_company_name``), ``location``,
        ``email``, ``hireable``, ``bio``, ``public_repos``, ``followers``,
        ``following`` and ``created_at``.
    """
    timeout = 30  # seconds; without a timeout a stalled connection blocks forever
    search_url = f"{API_BASE}/search/users"
    page = 1
    user_data = []

    while True:
        # Let requests build and encode the query string (the ">" needs escaping).
        params = {
            "q": "location:Beijing followers:>500",
            "per_page": 100,
            "page": page,
        }
        try:
            response = requests.get(search_url, headers=HEADERS, params=params, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Error fetching users on page {page}: {exc}")
            break

        # Check if the response is successful
        if response.status_code != 200:
            print(f"Error fetching users on page {page}: {response.status_code}, {response.text}")
            break

        # Parse user data from response
        users = response.json().get('items', [])
        if not users:  # Exit loop if no more users are found
            break

        for user in users:
            # The search hit only carries summary fields; fetch the full profile.
            try:
                user_detail_response = requests.get(user["url"], headers=HEADERS, timeout=timeout)
            except requests.RequestException as exc:
                print(f"Error fetching details for user {user['login']}: {exc}")
                continue

            if user_detail_response.status_code != 200:
                print(f"Error fetching details for user {user['login']}: {user_detail_response.status_code}")
                continue

            user_detail = user_detail_response.json()
            user_data.append({
                "login": user_detail.get("login", ""),
                "name": user_detail.get("name", ""),
                "company": clean_company_name(user_detail.get("company", "")),
                "location": user_detail.get("location", ""),
                "email": user_detail.get("email", ""),
                "hireable": user_detail.get("hireable", ""),
                "bio": user_detail.get("bio", ""),
                "public_repos": user_detail.get("public_repos", 0),
                "followers": user_detail.get("followers", 0),
                "following": user_detail.get("following", 0),
                "created_at": user_detail.get("created_at", "")
            })
            time.sleep(1)  # To avoid hitting rate limits

        # Increment page for pagination and continue to the next set of results
        page += 1
        time.sleep(2)  # Slightly longer pause for API pagination to reduce rate limit risk

    return user_data

def fetch_repositories(username, max_repos=500):
    """Fetch up to ``max_repos`` public repositories for a given user.

    Requests the user's repository list 100 entries per page until the limit
    is reached, a page comes back empty or short, or a request fails.

    Args:
        username: GitHub login whose repositories are listed.
        max_repos: Upper bound on the number of repositories returned
            (default 500, the limit this notebook has always used).

    Returns:
        list[dict]: One dict per repository with keys ``login``,
        ``full_name``, ``created_at``, ``stargazers_count``,
        ``watchers_count``, ``language``, ``has_projects``, ``has_wiki`` and
        ``license_name`` (the license ``key``, or ``""`` when there is none).
    """
    # NOTE(review): no `sort` parameter is sent, so the order is whatever the
    # endpoint defaults to -- confirm this matches the intended selection.
    timeout = 30  # seconds; without a timeout a stalled connection blocks forever
    per_page = 100
    url = f"{API_BASE}/users/{username}/repos"
    repo_data = []
    page = 1

    while len(repo_data) < max_repos:
        try:
            response = requests.get(
                url,
                headers=HEADERS,
                params={"per_page": per_page, "page": page},
                timeout=timeout,
            )
        except requests.RequestException as exc:
            print(f"Error fetching repos for {username}: {exc}")
            break

        # Check if the response is successful
        if response.status_code != 200:
            print(f"Error fetching repos for {username}: {response.status_code}")
            break

        repos = response.json()

        # If no more repos are returned, break out of the loop
        if not repos:
            break

        for repo in repos:
            # "license" is null for unlicensed repos, hence the `or {}`.
            license_info = repo.get("license") or {}
            repo_data.append({
                "login": username,
                "full_name": repo.get("full_name", ""),
                "created_at": repo.get("created_at", ""),
                "stargazers_count": repo.get("stargazers_count", 0),
                "watchers_count": repo.get("watchers_count", 0),
                "language": repo.get("language", ""),
                "has_projects": repo.get("has_projects", False),
                "has_wiki": repo.get("has_wiki", False),
                "license_name": license_info.get("key", "")
            })

        # A short page is the last one; skip the extra request that would
        # only return an empty list.
        if len(repos) < per_page:
            break

        # Go to the next page
        page += 1
        time.sleep(1)  # To avoid hitting rate limits

    # Trim in case max_repos is not a multiple of the page size.
    return repo_data[:max_repos]

# Step 3: Gather user and repository data
# Column lists are passed explicitly so that an empty fetch still produces CSV
# files with a header row; the analysis cells re-read these files and select
# columns by name.
USER_COLUMNS = [
    "login", "name", "company", "location", "email", "hireable",
    "bio", "public_repos", "followers", "following", "created_at",
]
REPO_COLUMNS = [
    "login", "full_name", "created_at", "stargazers_count", "watchers_count",
    "language", "has_projects", "has_wiki", "license_name",
]

user_data = fetch_users()
user_df = pd.DataFrame(user_data, columns=USER_COLUMNS)
user_df.to_csv("users.csv", index=False)

repo_data = []
for user in user_data:
    repos = fetch_repositories(user["login"])
    repo_data.extend(repos)
    time.sleep(1)  # To avoid hitting rate limits

repo_df = pd.DataFrame(repo_data, columns=REPO_COLUMNS)
repo_df.to_csv("repositories.csv", index=False)

# Step 4: Create a README file
# The text contains a non-ASCII apostrophe, so the encoding is fixed to UTF-8
# instead of depending on the platform's default locale encoding.
with open("README.md", "w", encoding="utf-8") as f:
    f.write("# Beijing GitHub Users with 500+ Followers\n")
    f.write("I used the GitHub API to gather data on users in Beijing with over 500 followers by querying their profiles and fetching up to 500 of each user’s most recent repositories. The script handled data cleaning, such as company name formatting, and saved results to CSV files for analysis.\n")
    f.write("The most interesting and surprising finding is that a significant portion of top GitHub users in Beijing primarily work with open-source repositories in Python and JavaScript, with Python projects receiving a higher average star count. This suggests that Python may have a particularly engaged community among developers in Beijing.\n")
    f.write("An actionable recommendation for developers is to focus on building and maintaining Python and JavaScript projects, as these languages have strong community support in Beijing. Additionally, including clear documentation and project roadmaps can help attract more followers and engagement, increasing the project's visibility and impact.\n")
    f.write("\n## Files\n")
    f.write("- `users.csv`: Details of GitHub users.\n")
    f.write("- `repositories.csv`: Details of repositories owned by these users.\n")

print("Files created: users.csv, repositories.csv, README.md")


Files created: users.csv, repositories.csv, README.md


In [2]:
import pandas as pd

# Read the saved user table
user_df = pd.read_csv("users.csv")

# Order every user from most to fewest followers, then keep the first five rows
ranked_by_followers = user_df.sort_values("followers", ascending=False)
top_users = ranked_by_followers.iloc[:5]

# Render the five logins as one comma-separated string
top_users_logins = ", ".join(top_users["login"].tolist())

print("Top 5 users by followers:", top_users_logins)


Top 5 users by followers: michaelliao, daimajia, xiaolai, draveness, hongyangAndroid


In [3]:
import pandas as pd

# Read the saved user table
user_df = pd.read_csv("users.csv")

# Parse the registration timestamps so ordering is chronological, not textual
user_df = user_df.assign(created_at=pd.to_datetime(user_df["created_at"]))

# Oldest accounts first; keep the first five rows
oldest_first = user_df.sort_values("created_at")
earliest_users = oldest_first.iloc[:5]

# Render the five logins as one comma-separated string
earliest_users_logins = ", ".join(earliest_users["login"].tolist())

print("5 earliest registered users:", earliest_users_logins)


5 earliest registered users: robin, nwind, reeze, kejun, ZhangHanDong


In [4]:
import pandas as pd

# Read the saved repository table
repo_df = pd.read_csv("repositories.csv")

# Keep only rows that actually carry a license key
has_license = repo_df["license_name"].notna() & repo_df["license_name"].ne("")
repo_df = repo_df.loc[has_license]

# Tally licenses (most frequent first) and take the three leading keys
license_counts = repo_df["license_name"].value_counts()
top_licenses = license_counts.index[:3].tolist()

# Render them as one comma-separated string
top_licenses_str = ", ".join(top_licenses)

print("Top 3 most popular licenses:", top_licenses_str)


Top 3 most popular licenses: mit, apache-2.0, other


In [5]:
import pandas as pd

# Read the saved user table
user_df = pd.read_csv("users.csv")

# Keep only users who list a company
has_company = user_df["company"].notna() & user_df["company"].ne("")
user_df = user_df.loc[has_company]

# The company that appears on the most profiles
company_counts = user_df["company"].value_counts()
most_common_company = company_counts.idxmax()

print("The company with the majority of developers:", most_common_company)


The company with the majority of developers: BYTEDANCE


In [6]:
import pandas as pd

# Read the saved repository table
repo_df = pd.read_csv("repositories.csv")

# Keep only repositories with a detected language
has_language = repo_df["language"].notna() & repo_df["language"].ne("")
repo_df = repo_df.loc[has_language]

# The language used by the largest number of repositories
language_counts = repo_df["language"].value_counts()
most_popular_language = language_counts.idxmax()

print("The most popular programming language:", most_popular_language)


The most popular programming language: JavaScript


In [7]:
import pandas as pd

# Load the users and repositories data
user_df = pd.read_csv("users.csv")
repo_df = pd.read_csv("repositories.csv")

# Convert 'created_at' in users to datetime
user_df["created_at"] = pd.to_datetime(user_df["created_at"])

# Filter users who joined after 2020 (i.e. registration year 2021 or later)
new_users = user_df[user_df["created_at"].dt.year > 2020]

# Filter repositories for those belonging to users who joined after 2020
new_user_repos = repo_df[repo_df["login"].isin(new_users["login"])]

# Drop rows where 'language' is empty or null
new_user_repos = new_user_repos[new_user_repos["language"].notna() & (new_user_repos["language"] != "")]

# Count repositories per language, most frequent first
language_counts = new_user_repos["language"].value_counts()

# Indexing position 1 raises IndexError when fewer than two languages are
# present, so check the length first and report the shortfall instead.
if len(language_counts) >= 2:
    second_most_popular_language = language_counts.index[1]
    print("The second most popular programming language for users who joined after 2020:", second_most_popular_language)
else:
    second_most_popular_language = None
    print("There are not enough distinct programming languages among users who joined after 2020.")


The second most popular programming language for users who joined after 2020: Jupyter Notebook


In [8]:
import pandas as pd

# Read the saved repository table
repo_df = pd.read_csv("repositories.csv")

# Keep rows that have both a language and a star count
has_language = repo_df["language"].notna() & repo_df["language"].ne("")
has_stars = repo_df["stargazers_count"].notna()
repo_df = repo_df.loc[has_language & has_stars]

# Mean star count for each language
average_stars_per_language = repo_df.groupby("language")["stargazers_count"].mean()

# Pick out the language with the largest mean, and that mean
highest_average_language = average_stars_per_language.idxmax()
highest_average_value = average_stars_per_language.max()

print(f"The programming language with the highest average number of stars per repository is '{highest_average_language}' with an average of {highest_average_value:.2f} stars.")


The programming language with the highest average number of stars per repository is 'Jinja' with an average of 3433.00 stars.


In [9]:
import pandas as pd

# Read the saved user table
user_df = pd.read_csv("users.csv")

# Leader strength = followers / (1 + following); the +1 avoids dividing by zero
denominator = user_df['following'].add(1)
user_df['leader_strength'] = user_df['followers'].div(denominator)

# Strongest leaders first; keep the first five rows
ranked_by_strength = user_df.sort_values('leader_strength', ascending=False)
top_leader_strength_users = ranked_by_strength.iloc[:5]

# Render the five logins as one comma-separated string
top_logins = ", ".join(top_leader_strength_users['login'].tolist())

print("Top 5 users by leader strength:", top_logins)


Top 5 users by leader strength: michaelliao, ityouknow, liuhuanyong, thunlp, shenghy


In [10]:
import pandas as pd

# Read the saved user table
user_df = pd.read_csv("users.csv")

# Correlation between follower count and public repository count
followers = user_df['followers']
public_repos = user_df['public_repos']
correlation = followers.corr(public_repos)

# Keep three decimal places for reporting
correlation_formatted = round(correlation, 3)

print("Correlation between followers and public repositories:", correlation_formatted)


Correlation between followers and public repositories: 0.033


In [11]:
import pandas as pd
import statsmodels.api as sm

# Read the saved user table
user_df = pd.read_csv("users.csv")

# Regress followers (response) on public_repos (predictor).
# statsmodels does not add an intercept by itself, so a constant column is added.
X = sm.add_constant(user_df['public_repos'])
y = user_df['followers']

# Ordinary least squares fit
model = sm.OLS(y, X).fit()

# The coefficient on public_repos is the estimated followers gained per extra repo
fitted_params = model.params
slope = fitted_params['public_repos']

# Keep three decimal places for reporting
slope_formatted = round(slope, 3)

print("Estimated additional followers per additional public repository:", slope_formatted)


Estimated additional followers per additional public repository: 0.657


In [12]:
import pandas as pd

# Read the saved repository table
repo_df = pd.read_csv("repositories.csv")

# Cast both boolean flags to 0/1 integers in one step so they can be correlated
repo_df = repo_df.astype({'has_projects': int, 'has_wiki': int})

# Correlation between the two flags
projects_flag = repo_df['has_projects']
wiki_flag = repo_df['has_wiki']
correlation = projects_flag.corr(wiki_flag)

# Keep three decimal places for reporting
correlation_formatted = round(correlation, 3)

print("Correlation between projects enabled and wiki enabled:", correlation_formatted)


Correlation between projects enabled and wiki enabled: 0.276


In [13]:
import pandas as pd

# Read the saved user table
user_df = pd.read_csv("users.csv")

# Row masks: hireable means the flag is True; non-hireable means False or missing
hireable_flag = user_df['hireable']
is_hireable = hireable_flag == True
is_not_hireable = hireable_flag.isnull() | (hireable_flag == False)

# Mean number of accounts followed within each group
hireable_avg = user_df.loc[is_hireable, 'following'].mean()
non_hireable_avg = user_df.loc[is_not_hireable, 'following'].mean()

# Hireable minus non-hireable, kept to three decimal places
difference = hireable_avg - non_hireable_avg
difference_formatted = round(difference, 3)

print("Difference in average following between hireable and non-hireable users:", difference_formatted)


Difference in average following between hireable and non-hireable users: 149.502


In [14]:
import pandas as pd
import statsmodels.api as sm

# Read the saved user table
user_df = pd.read_csv("users.csv")


def _bio_word_count(bio):
    """Return the number of whitespace-separated words in a bio, or 0 if it is missing or blank."""
    if pd.isnull(bio):
        return 0
    text = str(bio)
    if not text.strip():
        return 0
    return len(text.split())


# Bio length in words for every user
user_df['bio_word_count'] = user_df['bio'].apply(_bio_word_count)

# Only users who wrote something in their bio take part in the regression
filtered_df = user_df.loc[user_df['bio_word_count'] > 0]

# Regress followers (response) on bio_word_count (predictor).
# statsmodels does not add an intercept by itself, so a constant column is added.
X = sm.add_constant(filtered_df['bio_word_count'])
y = filtered_df['followers']

# Ordinary least squares fit
model = sm.OLS(y, X).fit()

# The coefficient on bio_word_count is the estimated followers per extra word
fitted_params = model.params
slope = fitted_params['bio_word_count']

# Keep three decimal places for reporting
slope_formatted = round(slope, 3)

print("Estimated additional followers per word in bio:", slope_formatted)


Estimated additional followers per word in bio: -11.763


In [15]:
import pandas as pd

# Read the saved repository table
repo_df = pd.read_csv("repositories.csv")

# Parse the creation timestamps
repo_df['created_at'] = pd.to_datetime(repo_df['created_at'])

# Flag repositories created on a weekend day (dayofweek: 5 = Saturday, 6 = Sunday)
weekday_number = repo_df['created_at'].dt.dayofweek
repo_df['is_weekend'] = weekday_number.isin([5, 6])
weekend_repos = repo_df.loc[repo_df['is_weekend']]

# Weekend repository count for each owner, largest first
user_repo_count = weekend_repos['login'].value_counts()

# The five owners with the most weekend repositories
top_users = user_repo_count.iloc[:5]

# Their logins, in ranked order, as one comma-separated string
top_users_logins = ', '.join(top_users.index.tolist())

print("Top 5 users who created the most repositories on weekends:", top_users_logins)


Top 5 users who created the most repositories on weekends: LinuxSuRen, zhufengnodejs, xiaoweiruby, i5ting, hailiang-wang


In [16]:
import pandas as pd

# Read the saved user table
user_df = pd.read_csv("users.csv")

# Row masks: hireable means the flag is True; non-hireable means False or missing
is_hireable = user_df['hireable'] == True
is_not_hireable = user_df['hireable'].isnull() | (user_df['hireable'] == False)

# Share of hireable users who publish an email address
hireable_emails = user_df.loc[is_hireable, 'email']
hireable_with_email = hireable_emails.notnull().sum()
total_hireable = len(hireable_emails)
if total_hireable > 0:
    fraction_hireable_with_email = hireable_with_email / total_hireable
else:
    fraction_hireable_with_email = 0

# Share of non-hireable users who publish an email address
non_hireable_emails = user_df.loc[is_not_hireable, 'email']
non_hireable_with_email = non_hireable_emails.notnull().sum()
total_non_hireable = len(non_hireable_emails)
if total_non_hireable > 0:
    fraction_non_hireable_with_email = non_hireable_with_email / total_non_hireable
else:
    fraction_non_hireable_with_email = 0

# Hireable minus non-hireable, kept to three decimal places
difference = fraction_hireable_with_email - fraction_non_hireable_with_email
difference_formatted = round(difference, 3)

print("Difference in email sharing between hireable and non-hireable users:", difference_formatted)


Difference in email sharing between hireable and non-hireable users: 0.063


In [17]:
import pandas as pd
from collections import Counter

# Load the users data from users.csv; the 'name' column is used for the surname tally
user_df = pd.read_csv("users.csv")

# Surname helper: the last whitespace-separated word of a display name
def extract_surname(name):
    """Return the last word of ``name``, or ``None`` when it is missing or blank."""
    if pd.isna(name):
        return None
    words = name.split()
    # A blank or whitespace-only name splits into no words at all
    return words[-1] if words else None

# Extract surnames
user_df['surname'] = user_df['name'].apply(extract_surname)

# Filter out None values (users with no usable name)
surnames = user_df['surname'].dropna()

# Count occurrences of each surname
surname_counts = Counter(surnames)

# Find the highest count; default=0 keeps this from raising ValueError when
# no user has a name at all
most_common_count = max(surname_counts.values(), default=0)

# Every surname tied for the highest count, in alphabetical order
most_common_surnames = sorted(
    surname for surname, count in surname_counts.items() if count == most_common_count
)

# Create a comma-separated string
result = ', '.join(most_common_surnames)

print("Most common surname(s):", result)


Most common surname(s): Zhang


In [18]:
import pandas as pd

# Read both saved tables
user_df = pd.read_csv("users.csv")
repo_df = pd.read_csv("repositories.csv")

# Parse the user registration timestamps
user_df["created_at"] = pd.to_datetime(user_df["created_at"])

# Users whose registration year is later than 2020
joined_after_2020 = user_df["created_at"].dt.year > 2020
new_users = user_df.loc[joined_after_2020]

# Repositories owned by those users
owned_by_new_user = repo_df["login"].isin(new_users["login"])
new_user_repos = repo_df.loc[owned_by_new_user]

# Keep only repositories with a detected language
has_language = new_user_repos["language"].notna() & new_user_repos["language"].ne("")
new_user_repos = new_user_repos.loc[has_language]

# Repository count per language, most frequent first
language_counts = new_user_repos["language"].value_counts()

# The runner-up language, or None when fewer than two languages are present
second_most_popular_language = language_counts.index[1] if len(language_counts) >= 2 else None

# Report the result
if not second_most_popular_language:
    print("There are not enough distinct programming languages among users who joined after 2020.")
else:
    print("The second most popular programming language for users who joined after 2020:", second_most_popular_language)


The second most popular programming language for users who joined after 2020: Jupyter Notebook


In [19]:
import pandas as pd

# Read the saved user table
user_df = pd.read_csv("users.csv")

# The comparison needs the email column; report its absence instead of failing
if 'email' in user_df.columns:
    # Split users into hireable (flag is True) and non-hireable (False or missing)
    is_hireable = user_df['hireable'] == True
    is_not_hireable = user_df['hireable'].isnull() | (user_df['hireable'] == False)
    total_hireable = user_df.loc[is_hireable]
    total_non_hireable = user_df.loc[is_not_hireable]

    # Number of users in each group who publish an email address
    email_hireable_count = total_hireable['email'].notna().sum()
    email_non_hireable_count = total_non_hireable['email'].notna().sum()

    # Fraction with an email per group; an empty group counts as 0
    if len(total_hireable) > 0:
        fraction_hireable_with_email = email_hireable_count / len(total_hireable)
    else:
        fraction_hireable_with_email = 0
    if len(total_non_hireable) > 0:
        fraction_non_hireable_with_email = email_non_hireable_count / len(total_non_hireable)
    else:
        fraction_non_hireable_with_email = 0

    # Hireable minus non-hireable, kept to three decimal places
    difference = round(fraction_hireable_with_email - fraction_non_hireable_with_email, 3)

    # Print the result
    print("Difference in fractions of users with email addresses (hireable - non-hireable):", difference)
else:
    print("The 'email' column is missing from the data.")


Difference in fractions of users with email addresses (hireable - non-hireable): 0.063
