In [4]:
import requests
import csv
import time
import os

# Read the GitHub token from the environment rather than embedding it in the
# notebook, so a shared or committed copy never carries the credential.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
# Only send an Authorization header when a token is actually configured;
# without one the requests go out unauthenticated instead of with a bogus token.
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

def get_rate_limit(resource="core"):
    """Print and return the remaining GitHub API quota for one rate-limit bucket.

    Args:
        resource: Name of the bucket to report, looked up under ``resources``
            in the ``/rate_limit`` payload (for example ``"core"`` or
            ``"search"``). For ``"core"`` the top-level ``rate`` object is
            used as a fallback when ``resources`` is absent.

    Returns:
        tuple | None: ``(remaining, reset_epoch_seconds)`` when the quota
        could be read, otherwise ``None`` (a message is printed instead of
        raising, so a failed check does not abort the caller's loop).
    """
    try:
        response = requests.get("https://api.github.com/rate_limit", headers=HEADERS, timeout=10)
    except requests.exceptions.RequestException as exc:
        print(f"Could not fetch rate limit: {exc}")
        return None

    if response.status_code != 200:
        # Error payloads carry no quota objects; indexing them would raise KeyError.
        print(f"Could not fetch rate limit: HTTP {response.status_code}")
        return None

    rate_limit_info = response.json()
    bucket = rate_limit_info.get('resources', {}).get(resource)
    if bucket is None and resource == "core":
        bucket = rate_limit_info.get('rate')
    if not bucket:
        print(f"Could not fetch rate limit: no '{resource}' bucket in response")
        return None

    remaining_requests = bucket['remaining']
    reset_time = bucket['reset']
    print(f"Remaining requests: {remaining_requests}")
    # `reset` is formatted with gmtime, i.e. the printed time is UTC.
    print(f"Rate limit resets at: {time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(reset_time))}")
    return remaining_requests, reset_time

def get_users_in_city(city="Toronto", min_followers=100):
    """Search GitHub for users located in ``city`` and fetch their profiles.

    Pages through the user-search endpoint 100 results at a time, stopping at
    the first short page or the first failed request, then looks up every
    login found with ``get_user_details``.

    Args:
        city: Value for the ``location:`` search qualifier.
        min_followers: Exclusive lower bound for the ``followers:`` qualifier.

    Returns:
        list[dict]: One profile record per user whose lookup produced a
        login. Failed lookups are dropped so callers can index
        ``record['login']`` safely.
    """
    users = []
    city_text = str(city)
    # Multi-word qualifier values have to be quoted, otherwise only the first
    # word is bound to `location:` and the rest becomes free-text search.
    if any(ch.isspace() for ch in city_text) and not city_text.startswith('"'):
        city_text = f'"{city_text}"'
    # The query goes through `params` so requests URL-encodes it.
    query = f"location:{city_text} followers:>{min_followers}"
    page = 1
    per_page = 100

    while True:
        # Check the rate limit before making a request
        get_rate_limit()

        print(f"Fetching page {page}...")
        try:
            response = requests.get(
                "https://api.github.com/search/users",
                params={"q": query, "per_page": per_page, "page": page},
                headers=HEADERS,
                timeout=10,
            )
        except requests.exceptions.RequestException as exc:
            # Keep whatever was collected so far instead of losing it to a traceback.
            print("Error fetching data:", exc)
            break

        if response.status_code != 200:
            # Print the raw body: an error response is not guaranteed to be JSON.
            print("Error fetching data:", response.status_code, response.text)
            break

        items = response.json().get('items', [])
        users.extend(items)

        # Debug: print the total number of users fetched so far
        print(f"Total users fetched so far: {len(users)}")

        # A page shorter than `per_page` is the last one.
        if len(items) < per_page:
            break

        page += 1
        time.sleep(2)  # Rate limiting - increased delay

    detailed_users = [get_user_details(user['login']) for user in users]
    # get_user_details signals failure with an empty record; drop those.
    return [details for details in detailed_users if details.get('login')]

def get_user_details(username):
    """Fetch one user's public profile and reduce it to the exported fields.

    Args:
        username: GitHub login to look up.

    Returns:
        dict: The profile fields written to ``users.csv`` (``company`` is
        normalised with ``clean_company_name``), or an empty dict when the
        request times out, fails, or returns a non-200 status.
    """
    print(f"Fetching details for user: {username}")
    user_url = f"https://api.github.com/users/{username}"
    try:
        response = requests.get(user_url, headers=HEADERS, timeout=10)
    except requests.exceptions.Timeout:
        print(f"Timeout occurred while fetching user details for {username}")
        return {}
    except requests.exceptions.RequestException as exc:
        print(f"Request failed while fetching user details for {username}: {exc}")
        return {}

    if response.status_code != 200:
        # An error payload has none of the profile keys; building a record from
        # it would yield a row with an empty login.
        print(f"Error fetching user details for {username}: HTTP {response.status_code}")
        return {}

    user_data = response.json()

    return {
        'login': user_data.get('login', ''),
        'name': user_data.get('name', ''),
        'company': clean_company_name(user_data.get('company')),
        'location': user_data.get('location', ''),
        'email': user_data.get('email', ''),
        'hireable': user_data.get('hireable', ''),
        'bio': user_data.get('bio', ''),
        'public_repos': user_data.get('public_repos', 0),
        'followers': user_data.get('followers', 0),
        'following': user_data.get('following', 0),
        'created_at': user_data.get('created_at', ''),
    }

def clean_company_name(company):
    """Normalise a profile's company string for grouping.

    Trims surrounding whitespace, upper-cases the text and removes a single
    leading ``@``.

    Args:
        company: Raw company value; may be ``None`` or empty.

    Returns:
        str: The cleaned name, or ``""`` when there is nothing to clean.
    """
    if not company:
        return ""
    company = company.strip().upper()
    if company.startswith('@'):
        # Remove only the first '@', then trim again so "@ acme" and "@acme"
        # normalise to the same value instead of " ACME" vs "ACME".
        company = company[1:].lstrip()
    return company

def get_user_repos(username):
    """Collect the exported fields of every repository listed for a user.

    Pages through ``/users/{username}/repos`` 100 entries at a time and stops
    at the first empty or short page, or at the first failed request.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        list[dict]: One record per repository with the columns written to
        ``repositories.csv``. Repositories gathered before a failure are
        still returned.
    """
    print(f"Fetching repositories for user: {username}")
    repos = []
    page = 1
    per_page = 100
    while True:
        repos_url = f"https://api.github.com/users/{username}/repos?per_page={per_page}&page={page}"
        try:
            response = requests.get(repos_url, headers=HEADERS, timeout=10)
        except requests.exceptions.Timeout:
            print(f"Timeout occurred while fetching repos for user: {username}")
            break
        except requests.exceptions.RequestException as exc:
            print(f"Request failed while fetching repos for user: {username} ({exc})")
            break

        if response.status_code != 200:
            print("Error fetching repos for user:", username)
            break

        repos_data = response.json()
        if not repos_data:
            break

        for repo in repos_data:
            repos.append({
                'login': username,
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo.get('language', ''),
                'has_projects': repo.get('has_projects', False),
                'has_wiki': repo.get('has_wiki', False),
                # `license` is null for unlicensed repositories.
                'license_name': repo['license']['key'] if repo.get('license') else '',
            })

        # A short page is the last one: stop here rather than spending another
        # request and sleep just to receive an empty list.
        if len(repos_data) < per_page:
            break

        page += 1
        time.sleep(2)  # Rate limiting - increased delay

    return repos

def save_to_csv(data, filename, fieldnames):
    """Write a list of row dicts to ``output/<filename>`` as CSV.

    Args:
        data: Iterable of dicts; keys must be a subset of ``fieldnames``.
        filename: File name created inside the ``output`` directory.
        fieldnames: Column order for the header and rows.

    The ``output`` directory is created if missing and an existing file is
    overwritten.
    """
    os.makedirs("output", exist_ok=True)
    filepath = os.path.join("output", filename)
    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent non-ASCII text such as emoji in bios, and the analysis cells
    # read these files back as UTF-8.
    with open(filepath, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

if __name__ == "__main__":
    # Scrape the user profiles. get_user_details returns {} when a lookup times
    # out; such records have no 'login', so drop them before they reach the CSV
    # or the user['login'] lookup in the repository loop below.
    users = [user for user in get_users_in_city("Toronto", 100) if user.get('login')]
    save_to_csv(users, 'users.csv', [
        'login', 'name', 'company', 'location', 'email', 'hireable', 'bio',
        'public_repos', 'followers', 'following', 'created_at'
    ])

    # Gather every repository of every scraped user into one flat table.
    all_repos = []
    for user in users:
        repos = get_user_repos(user['login'])
        all_repos.extend(repos)

    save_to_csv(all_repos, 'repositories.csv', [
        'login', 'full_name', 'created_at', 'stargazers_count', 'watchers_count',
        'language', 'has_projects', 'has_wiki', 'license_name'
    ])
    print("Data saved successfully!")


Remaining requests: 2491
Rate limit resets at: 2024-10-28 16:21:02
Fetching page 1...
Total users fetched so far: 100
Remaining requests: 2491
Rate limit resets at: 2024-10-28 16:21:02
Fetching page 2...
Total users fetched so far: 200
Remaining requests: 2491
Rate limit resets at: 2024-10-28 16:21:02
Fetching page 3...
Total users fetched so far: 300
Remaining requests: 2491
Rate limit resets at: 2024-10-28 16:21:02
Fetching page 4...
Total users fetched so far: 400
Remaining requests: 2491
Rate limit resets at: 2024-10-28 16:21:02
Fetching page 5...
Total users fetched so far: 500
Remaining requests: 2491
Rate limit resets at: 2024-10-28 16:21:02
Fetching page 6...
Total users fetched so far: 600
Remaining requests: 2491
Rate limit resets at: 2024-10-28 16:21:02
Fetching page 7...
Total users fetched so far: 685
Fetching details for user: aneagoie
Fetching details for user: ZhangMYihua
Fetching details for user: susanli2016
Fetching details for user: thedaviddias
Fetching details for

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
# 1
# Load the user table produced by the scraping cell and preview its first rows.
users = pd.read_csv(filepath_or_buffer='output/users.csv')
users.head(5)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,aneagoie,Andrei Neagoie,,"Toronto, Canada",,True,Senior Software Dev turned Instructor. Founder...,145,10275,1,2015-01-30T17:05:43Z
1,ZhangMYihua,Yihua Zhang,,Toronto,yihuazhang2@gmail.com,,Toronto Software Developer,143,5801,11,2015-01-18T00:01:02Z
2,susanli2016,Susan Li,,Toronto Canada,,,Chief Data Scientist,34,4920,68,2016-11-28T04:22:39Z
3,thedaviddias,David Dias,KIJIJICA,"Toronto, Canada",,,💻 Passionate Front-End Dev & 🎨 UI/UX fan. Cont...,88,4543,302,2010-04-05T14:40:12Z
4,ange-yaghi,Ange Yaghi,,Toronto,me@angeyaghi.com,,C++ Developer,32,4023,11,2016-07-13T21:01:21Z


In [7]:
# Treat a missing `hireable` flag as False. Going through the nullable
# 'boolean' dtype fills the gaps without relying on fillna silently
# downcasting an object column, which pandas has deprecated.
users['hireable'] = users['hireable'].astype('boolean').fillna(False).astype(bool)

  users['hireable'] = users['hireable'].fillna(False).astype(bool)


In [8]:
# Rank users by follower count, highest first, and print the first five logins.
top5 = users.sort_values('followers', ascending=False).iloc[:5]
print(','.join(top5['login']))

aneagoie,ZhangMYihua,susanli2016,thedaviddias,ange-yaghi


In [9]:
# 2
# Convert the account-creation strings to pandas datetimes so that later
# cells can sort and filter on them as timestamps.
users['created_at'] = users['created_at'].pipe(pd.to_datetime)

In [10]:
# Sort by creation timestamp, oldest first; the first five rows are the
# earliest-registered accounts.
top_earliest = users.sort_values('created_at', ascending=True).iloc[:5]
print(','.join(top_earliest['login']))

jamesmacaulay,michaelklishin,myles,nwjsmith,vito


In [12]:
# 3
# Load the repository table produced by the scraping cell and preview it.
repos = pd.read_csv(filepath_or_buffer='output/repositories.csv')
repos.head(5)

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,aneagoie,aneagoie/3D_roll,2017-01-08T20:05:27Z,0,0,JavaScript,True,True,
1,aneagoie,aneagoie/advanced-React-Redux-Training-Jeopardy,2017-01-31T20:34:22Z,1,1,CSS,True,True,
2,aneagoie,aneagoie/advanced-redux-training-RoboDex,2017-01-25T15:05:15Z,1,1,JavaScript,True,True,
3,aneagoie,aneagoie/AirBnB-template,2015-06-19T18:00:33Z,1,1,HTML,True,True,
4,aneagoie,aneagoie/ajaxCalls,2015-06-11T14:22:59Z,0,0,JavaScript,True,True,


In [13]:
# Three most frequent license keys (repositories without a license are NaN
# after read_csv and therefore not counted).
license_counts = repos['license_name'].value_counts()
license_counts.iloc[:3]

Unnamed: 0_level_0,count
license_name,Unnamed: 1_level_1
mit,14795
other,4789
apache-2.0,4579


In [14]:
# 4
# Most frequent cleaned company name among the users.
company_counts = users['company'].value_counts()
company_counts.iloc[:1]

Unnamed: 0_level_0,count
company,Unnamed: 1_level_1
UNIVERSITY OF TORONTO,21


In [15]:
# 5
# Most frequent primary language across all repositories.
language_counts = repos['language'].value_counts()
language_counts.iloc[:1]

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
JavaScript,11211


In [16]:
# 6
# Keep accounts created after the start of 2020. The cutoff is 2020-01-01, so
# accounts created during 2020 are included.
joined_after_cutoff = users['created_at'] > '2020-01-01'
users_after_2020 = users.loc[joined_after_cutoff]
users_after_2020.head(5)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
52,enderh3art,Jesse Zhou,,Toronto,,False,,5,922,7,2020-04-28 05:36:17+00:00
62,iceburgcrm,Iceburg CRM,,"Toronto, Canada",,False,Iceburg CRM,5,714,19957,2022-11-11 21:59:06+00:00
79,HamedBahram,Hamed Bahram,STUDIO-HB,"Toronto, CA",,True,Software developer & content creator from Cana...,87,577,3,2020-04-22 13:56:33+00:00
93,scrumtuous,Darcy DeClute,SCRUMTUOUS,"Toronto, ON",,False,I'm a certified Scrum Master and AWS Cloud Pra...,16,535,90,2021-12-17 18:56:47+00:00
105,barry-far,bardiafa,MIZEGERD-TECH,"Toronto, ON",,False,"""Don't let negativity dim your light; use it a...",5,489,10,2023-09-04 06:25:14+00:00


In [17]:
# Restrict the repository table to owners found in `users_after_2020`, then
# count repositories per language (five most frequent shown).
owned_by_recent_user = repos['login'].isin(users_after_2020['login'])
repos_2020 = repos.loc[owned_by_recent_user]
repos_2020['language'].value_counts().head(5)

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
JavaScript,338
TypeScript,205
Python,161
HTML,122
CSS,61


In [18]:
# 7
# Mean star count per language, then the language with the highest mean.
avg_stars = repos['stargazers_count'].groupby(repos['language']).mean()
top_lang, top_stars = avg_stars.idxmax(), avg_stars.max()
print(top_lang, top_stars)

Cython 1780.6666666666667


In [19]:
# 8
# leader_strength = followers / (1 + following); the +1 keeps the ratio
# defined for users who follow nobody.
users['leader_strength'] = users['followers'].div(users['following'].add(1))
top5_lead = users.sort_values('leader_strength', ascending=False).iloc[:5]
print(','.join(top5_lead['login']))

aneagoie,nayuki,GrapheneOS,hlissner,rspivak


In [20]:
# 9
# Pearson correlation between follower count and number of public repos.
followers_col = users['followers']
correlation = followers_col.corr(users['public_repos'], method='pearson')
correlation

0.055494208605130095

In [21]:
# 10
# Least-squares slope of followers as a function of public_repos, computed
# directly from the CSV written by the scraping cell.
import csv

followers = []
public_repos = []
with open('output/users.csv', 'r', encoding='utf-8', newline='') as file:
    for row in csv.DictReader(file):
        try:
            followers_count = int(row['followers'])
            public_repos_count = int(row['public_repos'])
        except (TypeError, ValueError):
            # Blank or truncated record (for example a failed profile lookup
            # written as an empty row): skip it, just as the pandas-based
            # cells ignore NaN, instead of aborting on int('').
            continue
        followers.append(followers_count)
        public_repos.append(public_repos_count)

# Both lists grow together, so one length check covers them; a line fit
# needs at least two points.
if len(followers) > 1:
    slope, intercept = np.polyfit(public_repos, followers, 1)

    print(f"{slope:.3f}")
else:
    print("Error")

0.253


In [22]:
# 11
# Correlation between a repository having projects enabled and having a wiki.
# If a flag column was read back as generic objects it can hold real booleans,
# NaN, or the text "True"/"False" (capitalised, as Python writes booleans to
# CSV). Normalise to lower-case text before mapping so those values are
# recognised; a lower-case-only lookup would turn every entry into NaN.
_BOOL_TEXT = {'true': True, 'false': False}
for flag_column in ('has_projects', 'has_wiki'):
    if repos[flag_column].dtype == 'object':
        repos[flag_column] = (
            repos[flag_column].astype(str).str.strip().str.lower().map(_BOOL_TEXT)
        )

# Cast to float so the computation is the same whether the columns are bool or
# object with missing entries (True -> 1.0, False -> 0.0, missing -> NaN).
correlation = repos['has_projects'].astype(float).corr(repos['has_wiki'].astype(float))

print(round(correlation, 3))

0.353


In [23]:
# 12
# Mean `following` of hireable users minus that of non-hireable users.
is_hireable = users['hireable'] == True
is_not_hireable = users['hireable'] == False
hireable_avg_following = users.loc[is_hireable, 'following'].mean()
non_hireable_avg_following = users.loc[is_not_hireable, 'following'].mean()
difference = hireable_avg_following - non_hireable_avg_following
difference

-13.248392469786182

In [24]:
# 13
# Regress followers on bio length (character count from .str.len()) for users
# that have a non-empty bio, and show the fitted slope.
from sklearn.linear_model import LinearRegression

has_bio = users['bio'].notna() & (users['bio'] != '')
users_with_bio = users.loc[has_bio].copy()
users_with_bio['bio_len'] = users_with_bio['bio'].str.len()

# scikit-learn expects a 2-D feature matrix, hence the single-column frame.
X = users_with_bio[['bio_len']].to_numpy()
y = users_with_bio['followers']

lr2 = LinearRegression().fit(X, y)
lr2.coef_[0]

1.407117262498666

In [25]:
# 14
# Count, per user, the repositories created on a Saturday or Sunday and print
# the five logins with the most such repositories.
import csv
from collections import Counter
from datetime import datetime

weekend_repo_counts = Counter()

with open('output/repositories.csv', 'r', encoding='utf-8') as file:
    for record in csv.DictReader(file):
        stamp = record.get('created_at', '')
        if not stamp:
            continue
        # Drop the final character before parsing (the timestamps in this file
        # end in 'Z'); the weekday is taken from the timestamp as written.
        # weekday() numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
        if datetime.fromisoformat(stamp[:-1]).weekday() >= 5:
            weekend_repo_counts[record['login']] += 1

top_users = weekend_repo_counts.most_common(5)
top_logins = [login for login, _count in top_users]

print(','.join(top_logins))

n1ckfg,jsoref,QuinntyneBrown,invokethreatguy,andyw8


In [26]:
# 15
# Share of users with an email address, hireable minus non-hireable.
has_email = users['email'].notna()
fraction_hierable = has_email[users['hireable'] == True].mean()
fraction_non_hierable = has_email[users['hireable'] == False].mean()
diff = fraction_hierable - fraction_non_hierable
diff

0.13476138828633405

In [27]:
# 16
# Most common surname, taken as the last whitespace-separated word of `name`;
# users without a name are ignored and ties are printed alphabetically.
new_users = users.dropna(subset=['name']).copy()
new_users['surname'] = new_users['name'].str.split().str[-1].str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
is_most_common = surname_counts == max_count
common_surnames = sorted(surname_counts[is_most_common].index)
print(','.join(common_surnames))

Ahmed
