In [23]:
import csv
import os
import time

import requests

# GitHub API token: read from the GITHUB_TOKEN environment variable so the
# secret is never stored in the notebook itself.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
# Only send the Authorization header when a token is actually configured;
# an empty "token " value would be sent as a malformed credential.
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

# Helper function to clean up company names
def clean_company_name(company):
    """Normalise a company string.

    Surrounding whitespace is trimmed, any leading '@' characters are removed
    and the result is upper-cased. Falsy values (None, '') are returned
    unchanged.
    """
    if not company:
        return company
    trimmed = company.strip()
    without_at = trimmed.lstrip('@')
    return without_at.upper()

# Function to fetch users from the GitHub API
def fetch_users(city="Toronto", min_followers=100):
    """Collect profile details for users matching a location/followers search.

    Pages through the user-search endpoint and, for every hit, requests the
    user's own API URL to obtain the full profile.

    Args:
        city: Value used for the ``location:`` search qualifier.
        min_followers: Exclusive lower bound used for the ``followers:>``
            qualifier.

    Returns:
        A list of dicts with the keys login, name, company (passed through
        clean_company_name), location, email, hireable, bio, public_repos,
        followers, following and created_at.

    Failed HTTP requests do not raise: a failed search page ends the paging
    and a failed profile request skips that user, each with a printed notice.
    """
    per_page = 100
    profile_fields = (
        'login', 'name', 'company', 'location', 'email', 'hireable',
        'bio', 'public_repos', 'followers', 'following', 'created_at',
    )
    users = []
    page = 1

    while True:
        response = requests.get(
            "https://api.github.com/search/users",
            # Let requests build and URL-encode the query string.
            params={
                "q": f"location:{city} followers:>{min_followers}",
                "page": page,
                "per_page": per_page,
            },
            headers=HEADERS,
        )
        if not response.ok:
            # Keep what was collected so far, but make the reason visible
            # instead of silently returning a truncated list.
            print(f"User search stopped on page {page}: HTTP {response.status_code}")
            break

        items = response.json().get('items')

        # Break if no more results
        if not items:
            break

        for user in items:
            # Get full user info
            user_response = requests.get(user['url'], headers=HEADERS)
            if not user_response.ok:
                # An error body has no profile keys; skip rather than crash.
                print(f"Skipping {user.get('login')}: HTTP {user_response.status_code}")
                continue
            user_data = user_response.json()

            # Extract required fields (the tuple order fixes the column order)
            record = {field: user_data.get(field) for field in profile_fields}
            record['company'] = clean_company_name(record['company'])
            users.append(record)

        # A short page is the last page, so no further request is needed
        if len(items) < per_page:
            break

        page += 1
        time.sleep(1)  # Avoid hitting API rate limits

    return users

# Function to fetch repositories for a user
def fetch_repositories(user_login):
    """Return summary records for the repositories listed for `user_login`.

    Requests the user's repository listing 100 entries at a time until a
    short or empty page is returned.

    Args:
        user_login: GitHub login whose repositories are listed.

    Returns:
        A list of dicts with the keys login, full_name, created_at,
        stargazers_count, watchers_count, language, has_projects, has_wiki
        and license_name (the licence ``key``, or None when the repository
        has no licence).

    A failed HTTP request ends the paging with a printed notice; repositories
    collected before the failure are still returned.
    """
    per_page = 100
    repositories = []
    page = 1

    while True:
        response = requests.get(
            f"https://api.github.com/users/{user_login}/repos",
            params={"per_page": per_page, "page": page},
            headers=HEADERS,
        )
        if not response.ok:
            # An error body is a JSON object, not a list of repositories;
            # iterating it would yield its keys and fail on repo['full_name'].
            print(
                f"Repository fetch for {user_login} stopped on page {page}: "
                f"HTTP {response.status_code}"
            )
            break

        repo_data = response.json()

        # Break if no more repositories
        if not repo_data:
            break

        for repo in repo_data:
            license_info = repo['license']
            repositories.append({
                'login': user_login,
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': license_info['key'] if license_info else None,
            })

        # A page with fewer entries than requested is the last page
        if len(repo_data) < per_page:
            break

        page += 1  # Move to the next page
        time.sleep(1)  # Avoid hitting API rate limits

    return repositories

# Save users to CSV
def save_users_to_csv(users, filename="users.csv"):
    """Write the user records to `filename` as UTF-8 CSV.

    Args:
        users: List of dicts; the keys of the first dict become the header.
        filename: Destination path; an existing file is overwritten.

    An empty list leaves an empty file behind instead of raising IndexError.
    """
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        if not users:
            # No record to take the column names from, so nothing is written.
            return
        writer = csv.DictWriter(file, fieldnames=list(users[0].keys()))
        writer.writeheader()
        writer.writerows(users)

# Save repositories to CSV
def save_repositories_to_csv(repositories, filename="repositories_unformat.csv"):
    """Write the repository records to `filename` as UTF-8 CSV.

    Args:
        repositories: List of dicts; the keys of the first dict become the
            header.
        filename: Destination path; an existing file is overwritten.

    An empty list leaves an empty file behind instead of raising IndexError.
    """
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        if not repositories:
            # No record to take the column names from, so nothing is written.
            return
        writer = csv.DictWriter(file, fieldnames=list(repositories[0].keys()))
        writer.writeheader()
        writer.writerows(repositories)

def main():
    """Fetch the users, then every user's repositories, saving each set to CSV."""
    print("Fetching users...")
    users = fetch_users()
    save_users_to_csv(users)
    print(f"Saved {len(users)} users to users.csv")

    print("Fetching repositories...")
    all_repositories = []
    for login in (user["login"] for user in users):
        user_repos = fetch_repositories(login)
        all_repositories += user_repos
        print(f"Fetched {len(user_repos)} repositories for user {login}")

    save_repositories_to_csv(all_repositories)
    print(f"Saved {len(all_repositories)} repositories to repositories_unformat.csv")


if __name__ == "__main__":
    main()

Fetching users...
Saved 685 users to users.csv
Fetching repositories...
Fetched 145 repositories for user aneagoie
Fetched 143 repositories for user ZhangMYihua
Fetched 34 repositories for user susanli2016
Fetched 88 repositories for user thedaviddias
Fetched 32 repositories for user ange-yaghi
Fetched 31 repositories for user nayuki
Fetched 128 repositories for user stemmlerjs
Fetched 184 repositories for user GrapheneOS
Fetched 335 repositories for user MylesBorins
Fetched 158 repositories for user vsavkin
Fetched 38 repositories for user kevinjycui
Fetched 60 repositories for user hlissner
Fetched 16 repositories for user gazijarin
Fetched 23 repositories for user priya-dwivedi
Fetched 1 repositories for user dmitshur
Fetched 14 repositories for user rspivak
Fetched 69 repositories for user daattali
Fetched 51 repositories for user chenyuntc
Fetched 2 repositories for user jmorganca
Fetched 135 repositories for user petertodd
Fetched 490 repositories for user vladikoff
Fetched 175 r

In [26]:
import pandas as pd

# Load the raw export written by the fetch step (repositories_unformat.csv)
repositories = pd.read_csv('repositories_unformat.csv')

# Boolean flag columns whose values are rewritten as lowercase text
columns_to_change = ['has_projects', 'has_wiki']
bool_as_text = {True: 'true', False: 'false'}

for flag_column in columns_to_change:
    converted = repositories[flag_column].replace(bool_as_text)
    repositories[flag_column] = converted

# Write the converted table without the pandas index column
repositories.to_csv('repositories.csv', index=False)

print("Updated file saved as 'repositories.csv'")


Updated file saved as 'repositories.csv'


In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [53]:
# 1: load the user table written by the fetch step and preview the first rows
users = pd.read_csv('users.csv')
users.head(n=5)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,aneagoie,Andrei Neagoie,,"Toronto, Canada",,True,Senior Software Dev turned Instructor. Founder...,145,10279,1,2015-01-30T17:05:43Z
1,ZhangMYihua,Yihua Zhang,,Toronto,yihuazhang2@gmail.com,False,Toronto Software Developer,143,5801,11,2015-01-18T00:01:02Z
2,susanli2016,Susan Li,,Toronto Canada,,False,Chief Data Scientist,34,4921,68,2016-11-28T04:22:39Z
3,thedaviddias,David Dias,KIJIJICA,"Toronto, Canada",,False,💻 Passionate Front-End Dev & 🎨 UI/UX fan. Cont...,88,4546,303,2010-04-05T14:40:12Z
4,ange-yaghi,Ange Yaghi,,Toronto,me@angeyaghi.com,False,C++ Developer,32,4023,11,2016-07-13T21:01:21Z


In [54]:
# Missing hireable values are treated as False, then the column is stored as bool
hireable_filled = users['hireable'].fillna(False)
users['hireable'] = hireable_filled.astype(bool)

In [55]:
# Logins of the five users with the most followers, highest first
by_followers = users.sort_values(by='followers', ascending=False)
top5 = by_followers.head()
print(','.join(top5['login']))

aneagoie,ZhangMYihua,susanli2016,thedaviddias,ange-yaghi


In [56]:
# 2: parse the signup timestamps so they can be sorted and compared as dates
created_at_parsed = pd.to_datetime(users['created_at'])
users['created_at'] = created_at_parsed

In [57]:
# Logins of the five earliest-created accounts, oldest first
oldest_first = users.sort_values(by='created_at')
top_earliest = oldest_first.head()
print(','.join(top_earliest['login']))

jamesmacaulay,michaelklishin,myles,nwjsmith,vito


In [58]:
# 3: load the converted repository table and preview the first rows
repos = pd.read_csv('repositories.csv')
repos.head(n=5)

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,aneagoie,aneagoie/3D_roll,2017-01-08T20:05:27Z,0,0,JavaScript,True,True,
1,aneagoie,aneagoie/advanced-React-Redux-Training-Jeopardy,2017-01-31T20:34:22Z,1,1,CSS,True,True,
2,aneagoie,aneagoie/advanced-redux-training-RoboDex,2017-01-25T15:05:15Z,1,1,JavaScript,True,True,
3,aneagoie,aneagoie/AirBnB-template,2015-06-19T18:00:33Z,1,1,HTML,True,True,
4,aneagoie,aneagoie/ajaxCalls,2015-06-11T14:22:59Z,0,0,JavaScript,True,True,


In [59]:
# Three most frequent licence keys
license_counts = repos['license_name'].value_counts()
license_counts.head(3)


Unnamed: 0_level_0,count
license_name,Unnamed: 1_level_1
mit,14797
other,4790
apache-2.0,4579


In [60]:
# 4: most frequent (cleaned) company name among the users
company_counts = users['company'].value_counts()
company_counts.head(1)

Unnamed: 0_level_0,count
company,Unnamed: 1_level_1
UNIVERSITY OF TORONTO,21


In [61]:
# 5: most frequent repository language
language_counts = repos['language'].value_counts()
language_counts.head(1)

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
JavaScript,11210


In [62]:
# 6: users whose account was created after the start of 2020-01-01
joined_after_2020 = users['created_at'] > '2020-01-01'
users_after_2020 = users[joined_after_2020]
users_after_2020.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
52,enderh3art,Jesse Zhou,,Toronto,,False,,5,923,7,2020-04-28 05:36:17+00:00
62,iceburgcrm,Iceburg CRM,,"Toronto, Canada",,False,Iceburg CRM,5,714,19957,2022-11-11 21:59:06+00:00
79,HamedBahram,Hamed Bahram,STUDIO-HB,"Toronto, CA",,True,Software developer & content creator from Cana...,87,578,3,2020-04-22 13:56:33+00:00
94,scrumtuous,Darcy DeClute,SCRUMTUOUS,"Toronto, ON",,False,I'm a certified Scrum Master and AWS Cloud Pra...,16,535,90,2021-12-17 18:56:47+00:00
105,barry-far,bardiafa,MIZEGERD-TECH,"Toronto, ON",,False,"""Don't let negativity dim your light; use it a...",5,490,10,2023-09-04 06:25:14+00:00


In [63]:
# Language counts for repositories owned by the users selected above
recent_logins = users_after_2020['login'].tolist()
owned_by_recent_user = repos['login'].isin(recent_logins)
repos_2020 = repos[owned_by_recent_user]
repos_2020['language'].value_counts().head()

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
JavaScript,338
TypeScript,205
Python,161
HTML,122
CSS,61


In [64]:
# 7: language with the highest mean star count per repository
stars_by_language = repos.groupby('language')['stargazers_count']
avg_stars = stars_by_language.mean()
top_lang, top_stars = avg_stars.idxmax(), avg_stars.max()
print(top_lang, top_stars)

Cython 1780.6666666666667


In [65]:
# 8: leader strength = followers / (1 + following)
following_plus_one = 1 + users['following']
users['leader_strength'] = users['followers'] / following_plus_one
by_strength = users.sort_values(by='leader_strength', ascending=False)
top5_lead = by_strength.head()
print(','.join(top5_lead['login']))

aneagoie,nayuki,GrapheneOS,hlissner,rspivak


In [66]:
# 9: correlation between follower count and number of public repositories
followers_col = users['followers']
public_repos_col = users['public_repos']
correlation = followers_col.corr(public_repos_col)
correlation

0.05547950303867636

In [67]:
# 10: slope of followers regressed on public_repos, read straight from users.csv
import csv

with open('users.csv', 'r', encoding='utf-8') as file:
    rows = list(csv.DictReader(file))

followers = [int(row['followers']) for row in rows]
public_repos = [int(row['public_repos']) for row in rows]

# A degree-1 fit needs at least two observations
if min(len(followers), len(public_repos)) > 1:
    slope, intercept = np.polyfit(public_repos, followers, 1)

    print(f"{slope:.3f}")
else:
    print("Error")

0.253


In [68]:
# 11: correlation between a repository having projects enabled and having a wiki
# A flag column that is not plain bool (object dtype) may hold either real
# booleans or 'true'/'false' text; go through lower-case text so both forms
# are recognised instead of mapping real booleans to NaN.
flag_from_text = {'true': True, 'false': False}
for flag_column in ('has_projects', 'has_wiki'):
    if repos[flag_column].dtype == 'object':
        as_text = repos[flag_column].astype(str).str.lower()
        repos[flag_column] = as_text.map(flag_from_text)

# Correlate only rows where both flags are known, as 0.0/1.0 values
flags = repos[['has_projects', 'has_wiki']].dropna().astype(float)
correlation = flags['has_projects'].corr(flags['has_wiki'])

print(round(correlation, 3))


0.353


In [51]:
# 12: mean `following` of hireable users minus that of everyone else
is_hireable = users['hireable'] == True

# Average following for hireable users
hireable_avg_following = users.loc[is_hireable, 'following'].mean()

# Average following for the remaining users (hireable is anything but True)
non_hireable_avg_following = users.loc[~is_hireable, 'following'].mean()

difference = hireable_avg_following - non_hireable_avg_following

# Show the result rounded to 3 decimal places
round(difference, 3)

-13.382

In [73]:
# 13: slope of followers regressed on bio length (in characters), users with a bio only
from sklearn.linear_model import LinearRegression

has_bio = users['bio'].notna() & (users['bio'] != '')
users_with_bio = users[has_bio].copy()
users_with_bio['bio_len'] = users_with_bio['bio'].str.len()

# scikit-learn expects a 2-D feature matrix, hence the single-column frame
X = users_with_bio[['bio_len']].to_numpy()
y = users_with_bio['followers']

lr2 = LinearRegression().fit(X, y)
lr2.coef_[0]





1.4041986504876804

In [70]:
# 14: five logins with the most repositories created on a Saturday or Sunday
import csv
from collections import Counter
from datetime import datetime


def _created_on_weekend(row):
    """True when the row has a created_at value that falls on weekday 5 or 6."""
    stamp = row.get('created_at', '')
    if not stamp:
        return False
    # The last character is dropped before parsing, as fromisoformat is given
    # the timestamp without its trailing suffix.
    return datetime.fromisoformat(stamp[:-1]).weekday() >= 5


with open('repositories.csv', 'r', encoding='utf-8') as file:
    weekend_repo_counts = Counter(
        row['login'] for row in csv.DictReader(file) if _created_on_weekend(row)
    )

top_users = weekend_repo_counts.most_common(5)
top_logins = [login for login, _count in top_users]

print(','.join(top_logins))

n1ckfg,jsoref,QuinntyneBrown,invokethreatguy,andyw8


In [71]:
# 15: share of hireable users with an email minus the share for non-hireable users
has_email = users['email'].notna()
fraction_hireable = has_email[users['hireable'] == True].mean()
fraction_non_hireable = has_email[users['hireable'] == False].mean()
diff = fraction_hireable - fraction_non_hireable
diff



0.13476138828633405

In [72]:
# 16: most common surname(s), taking the last whitespace-separated word of each name
new_users = users.dropna(subset=['name']).copy()
last_words = new_users['name'].str.split().str[-1]
new_users['surname'] = last_words.str.strip()

surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()

# Every surname tied for the top count, in alphabetical order
tied_for_top = surname_counts[surname_counts == max_count]
common_surnames = sorted(tied_for_top.index.tolist())
print(','.join(common_surnames))

Ahmed
