In [2]:
import datetime
import os
import time

import pandas as pd
import requests

# Read the GitHub API token from the GITHUB_TOKEN environment variable rather
# than embedding it in the notebook: a token saved in a notebook is readable by
# anyone who can see the file or its history. Any token that was previously
# stored in this cell should be treated as leaked and revoked.
api_token = os.environ.get('GITHUB_TOKEN', '').strip()

# GitHub API URL
base_url = 'https://api.github.com'

# Only send an Authorization header when a token is actually configured.
if api_token:
    headers = {'Authorization': f'token {api_token}'}
else:
    headers = {}
    print("Warning: GITHUB_TOKEN is not set; requests will be sent unauthenticated "
          "and are subject to lower rate limits.")

# Track start time
start_time = time.time()

# Rate limit checker
def check_rate_limit():
    """Fetch the current rate-limit status for the configured credentials.

    Sends ``GET {base_url}/rate_limit`` using the module-level ``headers``.

    Returns:
        dict: The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status, so that
            callers never index into an error payload.
        requests.RequestException: On connection failure or timeout.
    """
    rate_url = f"{base_url}/rate_limit"
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(rate_url, headers=headers, timeout=30)
    response.raise_for_status()
    return response.json()

users_data = []
page = 1
# The search API only exposes the first 1000 matches of a query; requesting
# anything beyond that returns an error response instead of more items.
max_search_results = 1000

while True:
    # Check rate limit. Search requests are counted against their own "search"
    # bucket; fall back to the general "rate" entry if it is not reported.
    rate_limit_info = check_rate_limit()
    search_limit = rate_limit_info.get('resources', {}).get('search') or rate_limit_info['rate']
    remaining_requests = search_limit['remaining']
    reset_time = search_limit['reset']

    if remaining_requests < 1:
        # Clamp at zero: the reset moment may already have passed, and
        # time.sleep() raises ValueError on a negative duration.
        wait_time = max(0, reset_time - int(time.time()) + 1)
        print(f"Rate limit exceeded. Waiting for {wait_time} seconds.")
        time.sleep(wait_time)

    # Fetch users
    users_url = f"{base_url}/search/users?q=location:Sydney+followers:>100&page={page}&per_page=100"
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(users_url, headers=headers, timeout=30)

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        break

    data = response.json()

    if 'items' not in data or not data['items']:
        break

    users_data.extend(data['items'])

    # Stop as soon as every reachable result has been collected, instead of
    # requesting a page that cannot exist and ending on an error response.
    reachable_results = min(data.get('total_count', max_search_results), max_search_results)
    if len(users_data) >= reachable_results:
        break

    page += 1

# Extract user info: fetch the full profile of every user returned by the
# search and keep only the fields written to users.csv.
users = []
for user in users_data:
    user_detail_url = user['url']
    # A timeout keeps a stalled connection from blocking the notebook forever.
    user_response = requests.get(user_detail_url, headers=headers, timeout=30)

    # An error payload has none of the profile fields read below, so report the
    # failure and move on rather than failing later with a KeyError.
    if user_response.status_code != 200:
        print(f"Error fetching {user_detail_url}: {user_response.status_code} - {user_response.text}")
        continue

    user_info = user_response.json()

    # Clean up company name: trim spaces, drop leading '@', upper-case.
    # A missing or null company is left as-is (falsy).
    company = user_info.get('company', '')
    if company:
        company = company.strip(' ').lstrip('@').upper()

    users.append({
        'login': user_info['login'],
        'name': user_info['name'],
        'company': company,
        'location': user_info['location'],
        'email': user_info['email'],
        # Any falsy value (including null) is written as 'false'.
        'hireable': 'true' if user_info['hireable'] else 'false',
        'bio': user_info['bio'],
        'public_repos': user_info['public_repos'],
        'followers': user_info['followers'],
        'following': user_info['following'],
        'created_at': user_info['created_at']
    })

# Write the collected user records to users.csv (no index column)
users_csv_path = 'users.csv'
users_df = pd.DataFrame(users)
users_df.to_csv(users_csv_path, index=False)

# Fetch repositories for each user: up to `max_repos_per_user` repositories,
# most recently pushed first, requested `repos_per_page` at a time.
repos = []
max_repos_per_user = 500
repos_per_page = 100
for user in users:
    page = 1
    user_repos = []
    # Checking the cap before each request avoids fetching a page whose
    # contents would be discarded anyway.
    while len(user_repos) < max_repos_per_user:
        repos_url = f"{base_url}/users/{user['login']}/repos?sort=pushed&direction=desc&page={page}&per_page={repos_per_page}"
        # A timeout keeps a stalled connection from blocking the notebook forever.
        repos_response = requests.get(repos_url, headers=headers, timeout=30)

        # An error body is a JSON object, not a list of repositories; iterating
        # it would yield its string keys and fail on repo['full_name'].
        if repos_response.status_code != 200:
            print(f"Error fetching repos for {user['login']}: {repos_response.status_code} - {repos_response.text}")
            break

        repos_data = repos_response.json()
        if not repos_data:
            break

        # Only take as many repositories as are still allowed under the cap.
        for repo in repos_data[:max_repos_per_user - len(user_repos)]:
            user_repos.append({
                'login': user['login'],
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': 'true' if repo['has_projects'] else 'false',
                'has_wiki': 'true' if repo['has_wiki'] else 'false',
                'license_name': repo['license']['name'] if repo['license'] else None
            })

        # A page shorter than requested is the last one; skip the extra request.
        if len(repos_data) < repos_per_page:
            break

        page += 1
    repos.extend(user_repos)

# Write the collected repository records to repositories.csv (no index column)
repos_csv_path = 'repositories.csv'
repos_df = pd.DataFrame(repos)
repos_df.to_csv(repos_csv_path, index=False)

# Measure how long the whole scrape took since start_time
end_time = time.time()
elapsed = datetime.timedelta(seconds=end_time - start_time)
execution_time = str(elapsed)

print(f"Data scraping and file creation completed in {execution_time}.")
print("Users.csv and repositories.csv are available for download.")

# Provide download options
from IPython.display import FileLink

# Render one clickable link per generated CSV file
for csv_path, link_prefix in (
    (users_csv_path, "Download users.csv: "),
    (repos_csv_path, "Download repositories.csv: "),
):
    display(FileLink(csv_path, result_html_prefix=link_prefix))


Data scraping and file creation completed in 0:09:08.174934.
Users.csv and repositories.csv are available for download.


In [3]:
import pandas as pd
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
import numpy as np
import statsmodels.api as sm

# Read both scraped datasets back from disk
repositories_df = pd.read_csv('repositories.csv')
users_df = pd.read_csv('users.csv')

# Parse the users' registration timestamps; values that cannot be parsed become NaT
parsed_created_at = pd.to_datetime(users_df['created_at'], errors='coerce')
users_df['created_at'] = parsed_created_at

# Question 1: the five logins with the largest follower counts
most_followed = users_df.nlargest(5, 'followers')
top_5_followers = most_followed['login'].tolist()
print(f"1. Top 5 users by followers: {','.join(top_5_followers)}")

# Question 2: the five logins with the oldest registration timestamps
oldest_accounts = users_df.nsmallest(5, 'created_at')
earliest_5_users = oldest_accounts['login'].tolist()
print(f"2. 5 earliest registered users: {','.join(earliest_5_users)}")

# Question 3: the three license names that occur most often across repositories
license_counts = repositories_df['license_name'].value_counts()
popular_licenses = license_counts.nlargest(3).index.tolist()
print(f"3. 3 most popular licenses: {','.join(popular_licenses)}")

# Question 4: the most frequent company name (trimmed and upper-cased)
normalised_companies = users_df['company'].str.strip().str.upper()
majority_company = normalised_companies.mode()[0]
print(f"4. Majority company: {majority_company}")

# Question 5: the most frequent repository language
language_modes = repositories_df['language'].mode()
popular_language = language_modes[0]
print(f"5. Most popular programming language: {popular_language}")

# Question 6: second most frequent language among repositories of recently joined users
# NOTE(review): the cutoff '2020-01-01' also includes accounts created during
# 2020 itself — confirm whether "after 2020" should start at 2021 instead.
joined_recently = users_df['created_at'] > '2020-01-01'
recent_logins = users_df.loc[joined_recently, 'login']
recent_users_repos = repositories_df[repositories_df['login'].isin(recent_logins)]
recent_language_counts = recent_users_repos['language'].value_counts()
second_popular_language = recent_language_counts.nlargest(2).index[-1]
print(f"6. Second most popular language among users who joined after 2020: {second_popular_language}")

# Question 7: the language whose repositories have the highest mean star count
mean_stars_by_language = repositories_df.groupby('language')['stargazers_count'].mean()
avg_stars_per_language = mean_stars_by_language.idxmax()
print(f"7. Language with highest average stars per repository: {avg_stars_per_language}")

# Question 8: leader_strength = followers / (1 + following); report the top five
users_df['leader_strength'] = users_df['followers'].div(users_df['following'] + 1)
strongest_leaders = users_df.nlargest(5, 'leader_strength')
top_5_leader_strength = strongest_leaders['login'].tolist()
print(f"8. Top 5 users by leader strength: {','.join(top_5_leader_strength)}")

# Question 9: correlation between follower count and public repository count
followers_col = users_df['followers']
public_repos_col = users_df['public_repos']
correlation_followers_repos = followers_col.corr(public_repos_col)
print(f"9. Correlation between followers and public repos: {correlation_followers_repos:.3f}")

# Question 10: slope of a linear fit of followers on public repository count
X = users_df[['public_repos']].to_numpy()  # single feature as an (n, 1) array
y = users_df['followers'].to_numpy()
model = LinearRegression()
model.fit(X, y)
regression_slope = model.coef_[0]
print(f"10. Regression slope of followers on repos: {regression_slope:.3f}")

# Question 11: correlation between the has_projects and has_wiki flags
projects_flag = repositories_df['has_projects']
wiki_flag = repositories_df['has_wiki']
projects_wiki_correlation = projects_flag.corr(wiki_flag)
print(f"11. Correlation between projects and wiki: {projects_wiki_correlation:.3f}")

# Question 12: mean 'following' of hireable users minus that of non-hireable users
# (assumes the 'hireable' column was read back as booleans — TODO confirm)
is_hireable = users_df['hireable'] == True
is_not_hireable = users_df['hireable'] == False
hireable_avg_following = users_df.loc[is_hireable, 'following'].mean()
non_hireable_avg_following = users_df.loc[is_not_hireable, 'following'].mean()
following_difference = hireable_avg_following - non_hireable_avg_following
print(f"12. Average following difference for hireable users: {following_difference:.3f}")

# Question 13: OLS slope of followers on the number of whitespace-separated words in the bio
users_with_bio = users_df.dropna(subset=['bio']).copy()
users_with_bio['bio_word_count'] = users_with_bio['bio'].map(lambda text: len(text.split()))
y = users_with_bio['followers']
X = sm.add_constant(users_with_bio['bio_word_count'])  # adds the intercept column
bio_model = sm.OLS(y, X).fit()
bio_slope = bio_model.params['bio_word_count']
print(f"13. Regression slope of followers on bio word count: {bio_slope:.3f}")

# Question 14: the five logins with the most repositories created on a Saturday or Sunday
repositories_df['created_at'] = pd.to_datetime(repositories_df['created_at'])
creation_weekday = repositories_df['created_at'].dt.dayofweek  # Monday=0 ... Sunday=6
repositories_df['is_weekend'] = creation_weekday >= 5
weekend_repos = repositories_df[repositories_df['is_weekend']]
weekend_repos_count = weekend_repos.groupby('login').size()
top_5_weekend_users = weekend_repos_count.nlargest(5).index.tolist()
print(f"14. Top 5 users creating most repos on weekends: {','.join(top_5_weekend_users)}")

# Question 15: share of hireable users with an email minus the share among non-hireable users
hireable_emails = users_df.loc[users_df['hireable'] == True, 'email']
non_hireable_emails = users_df.loc[users_df['hireable'] == False, 'email']
hireable_with_email = hireable_emails.notna().mean()
non_hireable_with_email = non_hireable_emails.notna().mean()
email_difference = hireable_with_email - non_hireable_with_email
print(f"15. Email sharing difference for hireable users: {email_difference:.3f}")


# Question 16: most common surname(s)
def extract_surname(full_name):
    """Return the capitalised last whitespace-separated word of a name, or None.

    Non-string values and blank strings yield None.
    """
    if not isinstance(full_name, str) or not full_name.strip():
        return None
    return str(full_name).strip().split()[-1].capitalize()


users_df['surname'] = users_df['name'].apply(extract_surname)
surname_counts = users_df['surname'].value_counts().dropna()
max_surname_count = surname_counts.max()
# Keep every surname tied for the highest count, in alphabetical order
is_most_common = surname_counts == max_surname_count
most_common_surnames = sorted(surname_counts[is_most_common].index.tolist())
print(f"16. Most common surname(s): {','.join(most_common_surnames)}")
print("Number of users with the most common surname:", max_surname_count)


1. Top 5 users by followers: nicknochnack,brendangregg,cornflourblue,0vm,davecheney
2. 5 earliest registered users: dylanegan,cjheath,freshtonic,dhowden,mikel
3. 3 most popular licenses: MIT License,Other,Apache License 2.0
4. Majority company: ATLASSIAN
5. Most popular programming language: JavaScript
6. Second most popular language among users who joined after 2020: TypeScript
7. Language with highest average stars per repository: Mermaid
8. Top 5 users by leader strength: brendangregg,cornflourblue,Canva,nicknochnack,0vm
9. Correlation between followers and public repos: 0.035
10. Regression slope of followers on repos: 0.068
11. Correlation between projects and wiki: 0.251
12. Average following difference for hireable users: 54.408
13. Regression slope of followers on bio word count: -10.946
14. Top 5 users creating most repos on weekends: timgates42,pinkforest,johndpope,mvandermeulen,mikeyhodl
15. Email sharing difference for hireable users: 0.051
16. Most common surname(s): Wu,Zh

In [4]:
from google.colab import files

# Trigger a browser download for each generated CSV file
for csv_name in ('users.csv', 'repositories.csv'):
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>