In [72]:
# Every request to the REST API includes an HTTP method and a path. Depending on the REST API endpoint, you might also need to specify request headers,
# authentication information, query parameters, or body parameters.

import requests
import csv
import time
import os
import re
import pandas as pd

In [73]:
# GitHub API setup
GITHUB_API_URL = "https://api.github.com"
# SECURITY: the personal access token is read from the GITHUB_TOKEN
# environment variable so that the secret never lives in the notebook.
# A token was previously hard-coded on this line; it has been exposed and
# must be revoked on GitHub.
ACCESS_TOKEN = os.environ.get("GITHUB_TOKEN", "")
HEADERS = {"Accept": "application/vnd.github+json"}
if ACCESS_TOKEN:
    # Without a token the header is omitted and requests go out unauthenticated.
    HEADERS["Authorization"] = f"token {ACCESS_TOKEN}"
OUTPUT_DIR = "/content/iitm_project1"
# exist_ok=True already tolerates an existing directory, so no separate
# os.path.exists() check is needed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [75]:
# Fetch users with specified criteria
def fetch_users(city, min_followers):
    """Search GitHub for users located in ``city`` with more than ``min_followers`` followers.

    Each search hit is expanded with a second request to ``/users/{login}``
    so that the full profile fields are available.

    Args:
        city: Value used for the ``location:`` search qualifier.
        min_followers: Exclusive lower bound used for the ``followers:>`` qualifier.

    Returns:
        A list of dicts, one per user, with the keys ``login``, ``name``,
        ``company``, ``location``, ``email``, ``hireable``, ``bio``,
        ``public_repos``, ``followers``, ``following`` and ``created_at``.

    Raises:
        requests.HTTPError: If GitHub answers any request with an error
            status (for example when the rate limit is exhausted). Failing
            loudly avoids silently returning a truncated user list.
    """
    # NOTE(review): clean_company is not defined in the visible part of this
    # notebook; it has to exist in the kernel before this function is called.
    per_page = 100      # largest page size the API accepts
    max_results = 1000  # the Search API serves at most this many hits per query
    query = f"location:{city} followers:>{min_followers}"

    users = []
    page = 1
    while (page - 1) * per_page < max_results:
        response = requests.get(
            f"{GITHUB_API_URL}/search/users",
            headers=HEADERS,
            params={"q": query, "per_page": per_page, "page": page},
            timeout=30,
        )
        response.raise_for_status()
        items = response.json().get("items", [])
        if not items:
            break
        for item in items:
            detail_response = requests.get(
                f"{GITHUB_API_URL}/users/{item['login']}",
                headers=HEADERS,
                timeout=30,
            )
            detail_response.raise_for_status()
            user_details = detail_response.json()
            users.append({
                "login": user_details.get("login", ""),
                "name": user_details.get("name", ""),
                "company": clean_company(user_details.get("company", "")),
                "location": user_details.get("location", ""),
                "email": user_details.get("email", ""),
                "hireable": user_details.get("hireable", ""),
                "bio": user_details.get("bio", ""),
                "public_repos": user_details.get("public_repos", ""),
                "followers": user_details.get("followers", ""),
                "following": user_details.get("following", ""),
                "created_at": user_details.get("created_at", "")
            })
        if len(items) < per_page:
            # A short page is the last page; skip the extra request and sleep.
            break
        page += 1
        time.sleep(1)
    return users

In [76]:
# Fetch repositories for each user
def fetch_repositories(username):
    """Fetch up to 500 repositories of ``username`` from the GitHub API.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        A list of dicts, one per repository, with the keys ``login``,
        ``full_name``, ``created_at``, ``stargazers_count``,
        ``watchers_count``, ``language``, ``has_projects``, ``has_wiki`` and
        ``license_name`` (the license ``key``, or ``""`` when the repository
        has no license).

    Raises:
        requests.HTTPError: If GitHub answers with an error status. Error
            payloads are JSON objects rather than lists, so without this
            check the loop below would iterate over the keys of the error
            object and fail with an unrelated ``AttributeError``.
    """
    per_page = 100
    max_pages = 5  # Limit to 500 repos (5 pages * 100 repos)

    repos = []
    for page in range(1, max_pages + 1):
        response = requests.get(
            f"{GITHUB_API_URL}/users/{username}/repos",
            headers=HEADERS,
            params={"per_page": per_page, "page": page},
            timeout=30,
        )
        response.raise_for_status()
        data = response.json()
        if not data:
            break
        for repo in data:
            # "license" is null for unlicensed repositories.
            license_info = repo.get("license") or {}
            repos.append({
                "login": username,
                "full_name": repo.get("full_name", ""),
                "created_at": repo.get("created_at", ""),
                "stargazers_count": repo.get("stargazers_count", ""),
                "watchers_count": repo.get("watchers_count", ""),
                "language": repo.get("language", ""),
                "has_projects": repo.get("has_projects", ""),
                "has_wiki": repo.get("has_wiki", ""),
                "license_name": license_info.get("key", "")
            })
        if len(data) < per_page:
            # A short page is the last page; skip the extra request and sleep.
            break
        time.sleep(1)
    return repos

In [77]:
# Write user data to CSV
def write_users_to_csv(users):
    """Write the user records to ``users.csv`` inside ``OUTPUT_DIR``.

    Args:
        users: Iterable of dicts whose keys are a subset of the column names
            listed in ``fieldnames`` below. Missing keys are written as empty
            cells; unknown keys make ``csv.DictWriter`` raise ``ValueError``.

    The file is overwritten on every call.
    """
    fieldnames = ["login", "name", "company", "location", "email", "hireable", "bio", "public_repos", "followers", "following", "created_at"]
    path = os.path.join(OUTPUT_DIR, "users.csv")
    # Names and bios contain non-ASCII characters, so the encoding is fixed to
    # UTF-8 instead of depending on the platform default. pandas.read_csv
    # also defaults to UTF-8 when the file is loaded again.
    with open(path, "w", newline='', encoding="utf-8") as user_file:
        writer = csv.DictWriter(user_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(users)

In [78]:
# Write repository data to CSV
def write_repos_to_csv(repositories):
    """Write the repository records to ``repositories.csv`` inside ``OUTPUT_DIR``.

    Args:
        repositories: Iterable of dicts whose keys are a subset of the column
            names listed in ``fieldnames`` below. Missing keys are written as
            empty cells; unknown keys make ``csv.DictWriter`` raise
            ``ValueError``.

    The file is overwritten on every call.
    """
    fieldnames = ["login", "full_name", "created_at", "stargazers_count", "watchers_count", "language", "has_projects", "has_wiki", "license_name"]
    path = os.path.join(OUTPUT_DIR, "repositories.csv")
    # Fixed UTF-8 encoding so the result does not depend on the platform
    # default; pandas.read_csv also defaults to UTF-8 when loading the file.
    with open(path, "w", newline='', encoding="utf-8") as repo_file:
        writer = csv.DictWriter(repo_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(repositories)

In [80]:
# Main execution
def main(city="Berlin", min_followers=200):
    """Scrape users and their repositories and write both CSV files.

    Args:
        city: Location passed to ``fetch_users``. Defaults to ``"Berlin"``.
        min_followers: Follower threshold passed to ``fetch_users``.
            Defaults to ``200``.

    The user list is written with ``write_users_to_csv`` before any
    repository is fetched; the repositories of all users are then collected
    and written with ``write_repos_to_csv``.
    """
    users = fetch_users(city, min_followers)
    write_users_to_csv(users)

    repositories = []
    for user in users:
        repositories.extend(fetch_repositories(user["login"]))
    write_repos_to_csv(repositories)

    # NOTE(review): create_readme is not defined in the visible part of this
    # notebook; it has to exist in the kernel before main() is called. Its
    # return value was never used, so it is no longer bound to a local.
    create_readme()

In [81]:
# Run the scrape only when this code is executed directly (a notebook cell
# or a script), not when it is imported as a module.
if __name__ == "__main__":
    main()

In [82]:
# load user & repo datasets
# The paths are derived from OUTPUT_DIR so they always point at the files
# written by write_users_to_csv / write_repos_to_csv.
user_df = pd.read_csv(os.path.join(OUTPUT_DIR, "users.csv"))
repo_df = pd.read_csv(os.path.join(OUTPUT_DIR, "repositories.csv"))

In [None]:
# Preview the first rows of the users table.
user_df.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,tiangolo,Sebastián Ramírez,,"Berlin, Germany",tiangolo@gmail.com,True,"Creator of FastAPI, Typer, SQLModel, Asyncer, ...",73,26467,3,2012-01-12T22:37:04Z
1,schacon,Scott Chacon,GITBUTLERAPP,"Berlin, Germany",schacon@gmail.com,,,215,13758,26,2008-01-27T17:19:28Z
2,rwieruch,Robin Wieruch,,Berlin/Remote,,True,React & Next.js • JavaScript & TypeScript • Fr...,151,8622,30,2012-10-03T15:11:48Z
3,shuding,Shu Ding,VERCEL,Berlin,g@shud.in,,Be curious. Read widely. Try new things. — aar...,149,6763,345,2013-02-23T07:46:30Z
4,android10,Fernando Cejas,PEPPR-IO,"Berlin, Germany",android10@fernandocejas.com,True,Quantum Engineering at @Qruise-ai. Former Dire...,79,6714,85,2012-01-20T21:35:31Z


In [None]:
# (rows, columns) of the users table.
user_df.shape

(602, 11)

In [83]:
# (rows, columns) of the repositories table.
repo_df.shape

(60509, 9)

In [84]:
# Preview the first rows of the repositories table.
repo_df.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,tiangolo,tiangolo/a2wsgi,2024-01-07T20:24:07Z,10,10,,True,True,apache-2.0
1,tiangolo,tiangolo/alembic,2020-05-22T09:50:31Z,5,5,,True,True,mit
2,tiangolo,tiangolo/anaconda_cluster_install,2015-03-11T14:58:44Z,5,5,Shell,True,True,
3,tiangolo,tiangolo/angular-docker-multi-stage-example,2017-10-02T18:43:28Z,15,15,,True,True,
4,tiangolo,tiangolo/annotated-types,2023-08-27T14:32:00Z,8,8,,True,False,mit


In [86]:
# Logins of the five users with the most followers, highest first.
most_followed = user_df.nlargest(n=5, columns='followers')
top_users = most_followed['login']
print(",".join(top_users))

tiangolo,schacon,rwieruch,shuding,android10


In [87]:
# Logins of the five oldest accounts, ordered by the created_at string.
by_creation = user_df.sort_values(by='created_at')
earliest_users = by_creation.head(5)['login']
print(",".join(earliest_users))

schacon,adamwiggins,myobie,lstoll,znarf


In [88]:
# Three most frequent license keys; repositories without a license are ignored.
license_counts = repo_df['license_name'].dropna().value_counts()
popular_licenses = license_counts.index[:3]
print(",".join(popular_licenses))

mit,apache-2.0,other


In [92]:
# Most common company, after normalising the free-text field:
# trim whitespace, drop leading '@' characters, upper-case.
company_trimmed = user_df['company'].str.strip()
company_without_at = company_trimmed.str.lstrip('@')
user_df['company'] = company_without_at.str.upper()

company_modes = user_df['company'].dropna().mode()
top_company = company_modes[0]
print(top_company)

MICROSOFT


In [93]:
# Language that is set on the largest number of repositories.
language_modes = repo_df['language'].dropna().mode()
popular_language = language_modes[0]
print(popular_language)

JavaScript


In [96]:
# Second most common language among the repositories of users who joined
# after 2020. created_at holds ISO-8601 strings such as
# "2012-01-12T22:37:04Z", so a plain string comparison orders them by date.
joined_recently = user_df['created_at'] > '2020-01-01'
recent_users = user_df.loc[joined_recently, 'login']

owned_by_recent_user = repo_df['login'].isin(recent_users)
recent_repos = repo_df[owned_by_recent_user]

recent_language_counts = recent_repos['language'].dropna().value_counts()
second_popular_language = recent_language_counts.index[1]
print(second_popular_language)

JavaScript


In [97]:
# Language whose repositories have the highest mean star count.
stars_by_language = repo_df.groupby('language')['stargazers_count']
avg_stars_per_language = stars_by_language.mean().dropna()
top_language = avg_stars_per_language.idxmax()
print(top_language)

Fluent


In [99]:
# Top 5 users by leader_strength = followers / (1 + following).
following_plus_one = 1 + user_df['following']
user_df['leader_strength'] = user_df['followers'] / following_plus_one

strongest_leaders = user_df.nlargest(n=5, columns='leader_strength')
top_leaders = strongest_leaders['login']
print(",".join(top_leaders))

tiangolo,marijnh,vakila,alexeygrigorev,lewagon


In [101]:
# Correlation between follower count and number of public repositories.
followers_col = user_df['followers']
public_repos_col = user_df['public_repos']
correlation = followers_col.corr(public_repos_col)
print(f"{correlation:.3f}")

0.017


In [102]:
import statsmodels.api as sm

# Ordinary least squares of followers (dependent) on public_repos
# (independent). add_constant supplies the intercept column.
X = sm.add_constant(user_df['public_repos'])
y = user_df['followers']

model = sm.OLS(y, X).fit()

# The coefficient on public_repos is the regression slope.
slope = model.params['public_repos']
print(f"{slope:.3f}")

0.284


In [131]:
# Correlation between the "projects enabled" and "wiki enabled" flags,
# with each flag cast to an integer first.
projects_flag = repo_df['has_projects'].astype(int)
wiki_flag = repo_df['has_wiki'].astype(int)
correlation_projects_wiki = projects_flag.corr(wiki_flag)
print(f"{correlation_projects_wiki:.3f}")

0.400


In [143]:
# Mean "following" of hireable users minus the mean of everyone else.
# Users whose hireable value is missing fall into the "everyone else" group.
is_hireable = user_df['hireable'] == True
avg_following_hireable = user_df.loc[is_hireable, 'following'].mean()
avg_following_rest = user_df.loc[~is_hireable, 'following'].mean()

difference = avg_following_hireable - avg_following_rest
print(f"{difference:.3f}")

47.050


In [151]:
# Regress followers on bio length, measured in whitespace-separated words.
# Users without a bio get no bio_length and are left out of the fit.
# `sm` is the statsmodels API imported in the earlier regression cell.
bio_word_counts = user_df['bio'].dropna().apply(lambda bio: len(bio.split()))
user_df['bio_length'] = bio_word_counts

has_bio_length = user_df['bio_length'].notna()
filtered_users = user_df[has_bio_length]

X = sm.add_constant(filtered_users['bio_length'])
y = filtered_users['followers']

model_bio = sm.OLS(y, X).fit()

slope_bio = model_bio.params['bio_length']
print(f"{slope_bio:.3f}")

28.453


In [112]:
# Five users who created the most repositories on a weekend.
repo_df['created_at'] = pd.to_datetime(repo_df['created_at'])
repo_df['weekday'] = repo_df['created_at'].dt.weekday  # Monday=0 ... Sunday=6

created_on_weekend = repo_df['weekday'].isin([5, 6])
weekend_repos = repo_df[created_on_weekend]

weekend_counts_by_user = weekend_repos['login'].value_counts()
repo_counts = weekend_counts_by_user.head(5)

top_users = repo_counts.index
print(",".join(top_users))

derhuerst,janpio,saschanaz,jamesmunns,sunsided


In [157]:
# Share of users with a public email among hireable users minus the same
# share among everyone else. "Everyone else" is the complement of the
# hireable mask, matching the split used for the "following" comparison,
# so an explicit False in the hireable column is not silently dropped
# (the previous isna() filter kept only missing values).
hireable_mask = user_df['hireable'] == True
fraction_hireable = user_df.loc[hireable_mask, 'email'].notna().mean()
fraction_non_hireable = user_df.loc[~hireable_mask, 'email'].notna().mean()

email_difference = fraction_hireable - fraction_non_hireable
print(f"{email_difference:.3f}")

-0.008


In [117]:
# Most common surname(s), where the surname is the last whitespace-separated
# word of the trimmed name. The vectorised .str accessor yields NaN for a
# missing name and for a name that is only whitespace, whereas indexing
# split()[-1] inside apply() raised IndexError on a whitespace-only name.
user_df['surname'] = user_df['name'].str.strip().str.split().str[-1]
common_surnames = user_df['surname'].value_counts()

# Keep every surname tied for the highest count, in alphabetical order.
most_common_count = common_surnames.max()
most_common_surnames = common_surnames[common_surnames == most_common_count].index.sort_values()

print(",".join(most_common_surnames), most_common_count)

Schneider 3


In [159]:
# Surname frequency table, most common first.
user_df['surname'].value_counts()

Unnamed: 0_level_0,count
surname,Unnamed: 1_level_1
Schneider,3
Oliveira,2
Li,2
Honnibal,2
Schmidt,2
...,...
Potapov,1
Millan,1
Smith,1
berstend̡̲̫̹̠̖͚͓̔̄̓̐̄͛̀͘,1
