In [15]:
import warnings
warnings.filterwarnings("ignore")

# Data Scraping

In [None]:
import requests
import csv
import time
import os

GITHUB_API_URL = "https://api.github.com"
# Personal access token read from the environment; None when the variable is unset.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

# Headers sent with every GitHub API request made by the functions below.
HEADERS = {"Accept": "application/vnd.github.v3+json"}
# Only authenticate when a token is actually configured: formatting a missing
# token into the header would send the literal credential "token None".
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"token {GITHUB_TOKEN}"

# Column order of users.csv.
USER_FIELDS = ["login", "name", "company", "location", "email", "hireable", "bio",
               "public_repos", "followers", "following", "created_at"]
# Column order of repositories.csv.
REPO_FIELDS = ["login", "full_name", "created_at", "stargazers_count",
               "watchers_count", "language", "has_projects", "has_wiki", "license_name"]

def fetch_users(location, min_followers, timeout=30):
    """Search GitHub for users in a location who exceed a follower threshold.

    Walks the ``/search/users`` endpoint page by page (30 results per page),
    pausing one second between pages, until a page comes back without items,
    the API answers with a non-200 status, or a request raises.

    Args:
        location: Value used for the ``location:`` search qualifier.
        min_followers: Only users with strictly more followers than this
            are matched (``followers:>N``).
        timeout: Seconds to wait for each HTTP response. Without a timeout
            ``requests`` waits indefinitely, so one stalled connection would
            hang the whole scrape.

    Returns:
        list: The ``items`` entries gathered from every page fetched before
        the loop stopped. The list may be partial if an error ended the
        loop early; errors are printed, not raised.
    """
    users = []
    page = 1
    print("Fetching users...")

    while True:
        try:
            response = requests.get(
                f"{GITHUB_API_URL}/search/users",
                headers=HEADERS,
                params={"q": f"location:{location} followers:>{min_followers}", "page": page, "per_page": 30},
                timeout=timeout,
            )
            if response.status_code != 200:
                print(f"Error fetching users: {response.status_code}, {response.json().get('message')}")
                break

            data = response.json()
            if 'items' not in data or len(data['items']) == 0:
                print("No more users found.")
                break

            users.extend(data['items'])
            print(f"Page {page} fetched. Total users collected: {len(users)}")
            page += 1
            time.sleep(1)  # pause between pages

        except Exception as e:
            # Best-effort scrape: report the failure (including timeouts) and
            # return whatever was collected so far.
            print(f"An error occurred while fetching users: {e}")
            break

    return users

def fetch_user_details(login, timeout=30):
    """Fetch the full profile of one GitHub user.

    Args:
        login: GitHub login whose ``/users/{login}`` record is requested.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        dict: The decoded JSON profile on a 200 response, otherwise an empty
        dict. Failures (non-200 status, timeouts, other exceptions) are
        printed rather than raised.
    """
    try:
        response = requests.get(
            f"{GITHUB_API_URL}/users/{login}",
            headers=HEADERS,
            timeout=timeout,
        )
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Error fetching details for {login}: {response.json().get('message')}")
            return {}
    except Exception as e:
        print(f"An error occurred while fetching details for {login}: {e}")
        return {}

def fetch_user_repositories(login, timeout=30):
    """Fetch up to 500 repositories of a user, most recently pushed first.

    Pages through ``/users/{login}/repos`` 100 entries at a time with
    ``sort=pushed``, pausing one second between pages, until an empty page,
    a non-200 status, an exception, or the 500-repository cap is reached.

    Args:
        login: GitHub login whose repositories are listed.
        timeout: Seconds to wait for each HTTP response. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        list: At most 500 repository records as returned by the API. The
        list may be partial if an error ended the loop early; errors are
        printed, not raised.
    """
    repos = []
    page = 1
    print(f"Fetching repositories for {login}...")

    while True:
        try:
            response = requests.get(
                f"{GITHUB_API_URL}/users/{login}/repos",
                headers=HEADERS,
                params={"sort": "pushed", "page": page, "per_page": 100},
                timeout=timeout,
            )
            if response.status_code != 200:
                print(f"Error fetching repositories for {login}: {response.json().get('message')}")
                break

            data = response.json()
            if len(data) == 0:
                break

            repos.extend(data)
            print(f"Page {page} of repositories for {login} fetched.")

            # Check the cap before pausing: once it is reached no further page
            # is requested, so the one-second sleep would only waste time.
            if len(repos) >= 500:
                break

            page += 1
            time.sleep(1)

        except Exception as e:
            print(f"An error occurred while fetching repos for {login}: {e}")
            break

    # The last page may overshoot the cap, so trim to exactly 500.
    return repos[:500]

def clean_company_name(company):
    """Normalise a company string so equal companies compare equal.

    Trims surrounding whitespace, removes leading ``@`` characters, trims
    again, and upper-cases the result. The second trim matters for values
    such as ``"@ roche"``, which would otherwise keep a leading space and be
    counted as a different company from ``"ROCHE"``.

    Args:
        company: Raw company value; may be ``None`` or empty.

    Returns:
        str: The normalised name, or ``""`` when ``company`` is falsy.
    """
    if not company:
        return ""
    return company.strip().lstrip('@').strip().upper()

def write_users_to_csv(users):
    """Write user profile dicts to ``users.csv`` using the USER_FIELDS columns.

    Text-like fields missing from a profile are written as ``""``, the three
    count fields as ``0``, and the company is passed through
    ``clean_company_name`` before being written.

    Args:
        users: Iterable of dicts offering ``.get`` for the profile fields.
    """
    blank_default_columns = ("login", "name", "location", "email",
                             "hireable", "bio", "created_at")
    zero_default_columns = ("public_repos", "followers", "following")

    with open("users.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=USER_FIELDS)
        writer.writeheader()
        for profile in users:
            # DictWriter orders the cells by fieldnames, so the order in which
            # the row dict is filled here is irrelevant.
            row = {column: profile.get(column, "") for column in blank_default_columns}
            for column in zero_default_columns:
                row[column] = profile.get(column, 0)
            row["company"] = clean_company_name(profile.get("company", ""))
            writer.writerow(row)

def write_repositories_to_csv(repositories):
    """Write repository dicts to ``repositories.csv`` using the REPO_FIELDS columns.

    The owner's login is read from ``repo["owner"]["login"]`` (a missing key
    raises ``KeyError``). The ``license_name`` column holds the license's
    ``key`` value, or ``""`` when the repository has no license entry.

    Args:
        repositories: Iterable of repository dicts.
    """
    with open("repositories.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=REPO_FIELDS)
        writer.writeheader()
        for repo in repositories:
            license_info = repo.get("license")
            if license_info:
                license_key = license_info.get("key", "")
            else:
                license_key = ""

            row = {
                "login": repo["owner"]["login"],
                "license_name": license_key,
            }
            for column in ("full_name", "created_at", "language"):
                row[column] = repo.get(column, "")
            for column in ("stargazers_count", "watchers_count"):
                row[column] = repo.get(column, 0)
            for column in ("has_projects", "has_wiki"):
                row[column] = repo.get(column, False)
            writer.writerow(row)

def main():
    """Scrape Basel users with more than 10 followers and their repositories.

    Writes the user profiles to ``users.csv`` and the repositories of those
    users to ``repositories.csv`` via the writer helpers.
    """
    users = fetch_users('Basel', 10)

    # fetch_user_details returns {} when a lookup fails. Skip those results so
    # no placeholder row is written to users.csv and the "login" lookup in the
    # repository loop below cannot raise KeyError.
    user_details = []
    for user in users:
        details = fetch_user_details(user["login"])
        if details:
            user_details.append(details)

    write_users_to_csv(user_details)

    all_repositories = []
    for user in user_details:
        login = user["login"]
        repos = fetch_user_repositories(login)
        all_repositories.extend(repos)

    write_repositories_to_csv(all_repositories)
    print("Data scraping completed.")

if __name__ == "__main__":
    main()


Fetching users...
Page 1 fetched. Total users collected: 30
Page 2 fetched. Total users collected: 60
Page 3 fetched. Total users collected: 90
Page 4 fetched. Total users collected: 120
Page 5 fetched. Total users collected: 150
Page 6 fetched. Total users collected: 180
Page 7 fetched. Total users collected: 210
Page 8 fetched. Total users collected: 240
Page 9 fetched. Total users collected: 270
Page 10 fetched. Total users collected: 300
Page 11 fetched. Total users collected: 330
Page 12 fetched. Total users collected: 349
No more users found.
Fetching repositories for tarsius...
Page 1 of repositories for tarsius fetched.
Fetching repositories for aalmiray...
Page 1 of repositories for aalmiray fetched.
Page 2 of repositories for aalmiray fetched.
Page 3 of repositories for aalmiray fetched.
Fetching repositories for marcoroth...
Page 1 of repositories for marcoroth fetched.
Page 2 of repositories for marcoroth fetched.
Fetching repositories for klmr...
Page 1 of repositories for

In [23]:
import pandas as pd

# Load the user profiles written by the scraping cell and show the frame.
users_df = pd.read_csv(filepath_or_buffer="users.csv")

users_df

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,tarsius,Jonas Bernoulli,,"Basel, Switzerland",jonas@bernoul.li,,,39,1345,45,2008-09-17T19:18:37Z
1,aalmiray,Andres Almiray,,"Basel, Switzerland",aalmiray@gmail.com,,I code for fun and help others in the process....,253,887,65,2008-06-17T00:30:56Z
2,marcoroth,Marco Roth,,"Basel, Switzerland",,True,"Rubyist, Full-Stack Devloper and Open Source C...",193,560,1135,2014-01-15T17:12:11Z
3,klmr,Konrad Rudolph,ROCHE,"Basel, CH",konrad.rudolph@gmail.com,,"Geneticist 🧬, computer scientist 𝝺 and softwar...",115,473,109,2008-12-07T20:55:40Z
4,MrNeRF,janusch,,"Basel, Switzerland",,True,Professional C++ Software Engineer. Skills: CU...,34,438,38,2017-11-21T18:17:58Z
...,...,...,...,...,...,...,...,...,...,...,...
344,alvarogonjim,Alvaro Gonzalez-Jimenez,UNIVERSITY OF BASEL,Basel,alvarogonjim95@gmail.com,,,23,11,11,2017-02-09T17:00:18Z
345,palsch,Paul Schell,PRODYNA (SCHWEIZ) AG,Basel,pal.sch@gmx.de,,I am a senior software architect and consultan...,18,11,14,2012-03-21T17:09:18Z
346,dsolsona,Daniel Solsona,BEEKEEPER,Basel,,,,35,11,3,2011-06-26T16:07:22Z
347,frieder,Frieder Heugel,,"Basel, Switzerland",,,,10,11,0,2011-01-31T10:19:09Z


In [24]:
import pandas as pd

# Load the repositories written by the scraping cell and show the frame.
repositories_df = pd.read_csv(filepath_or_buffer="repositories.csv")

repositories_df

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,tarsius,tarsius/llama,2024-08-10T21:22:27Z,2,2,Emacs Lisp,False,False,gpl-3.0
1,tarsius,tarsius/morlock,2013-06-24T11:07:08Z,13,13,Emacs Lisp,False,False,gpl-3.0
2,tarsius,tarsius/imake,2017-10-06T11:56:19Z,7,7,Emacs Lisp,False,False,gpl-3.0
3,tarsius,tarsius/notmuch-maildir,2024-08-05T16:07:12Z,2,2,Emacs Lisp,False,False,gpl-3.0
4,tarsius,tarsius/bicycle,2017-08-06T11:23:07Z,59,59,Emacs Lisp,False,False,gpl-3.0
...,...,...,...,...,...,...,...,...,...
13961,dpdawson,dpdawson/ruby-brainfuck,2013-08-16T10:40:10Z,0,0,Ruby,True,True,
13962,dpdawson,dpdawson/rails,2013-07-07T20:03:35Z,0,0,Ruby,True,False,
13963,dpdawson,dpdawson/dotvim,2013-07-05T22:40:51Z,0,0,VimL,True,True,
13964,dpdawson,dpdawson/dotfiles,2013-07-09T07:55:09Z,0,0,Shell,True,False,mit


# Question 1

In [None]:
import pandas as pd

users_df = pd.read_csv("users.csv")

# Order users from most to fewest followers and keep the first five logins.
ranked_by_followers = users_df.sort_values(by="followers", ascending=False)
top_5_users = ranked_by_followers["login"].head(5)

# Report the logins as a single comma-separated line.
top_5_users_list = ",".join(top_5_users)
print(top_5_users_list)


tarsius,aalmiray,marcoroth,klmr,MrNeRF


# Question 2

In [None]:
import pandas as pd

users_df = pd.read_csv("users.csv")

# Parse the timestamps so the sort is chronological rather than textual.
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Oldest accounts first (ascending is the default); keep the first five logins.
oldest_first = users_df.sort_values(by="created_at")
earliest_users = oldest_first["login"].iloc[:5]

earliest_users_list = ",".join(earliest_users)
print(earliest_users_list)

bennyzen,aalmiray,pvillega,tarsius,amaunz


# Question 3

In [None]:
import pandas as pd

repos_df = pd.read_csv("repositories.csv")

# The scraper writes "" for repositories without a license, and read_csv loads
# empty cells as NaN by default. A bare `!= ""` comparison therefore filters
# nothing, so test for NaN explicitly (and still reject empty strings in case
# the file is ever loaded with NaN conversion disabled).
has_license = repos_df['license_name'].notna() & (repos_df['license_name'] != "")
valid_licenses = repos_df[has_license]

# value_counts orders licenses from most to least frequent.
top_3_licenses = valid_licenses['license_name'].value_counts().head(3)

top_3_licenses_list = ",".join(top_3_licenses.index)

print(top_3_licenses_list)


mit,apache-2.0,other


# Question 4

In [41]:
import pandas as pd

users_df = pd.read_csv("users.csv")

# Tally users per company (names were normalised when users.csv was written).
# value_counts orders the tallies from most to least frequent, so the first
# three rows are the three most common companies.
company_tally = users_df['company'].value_counts()
company_counts = company_tally.head(3)

print(company_counts)

company
ADOBE                  20
UNIVERSITY OF BASEL    20
ROCHE                  12
Name: count, dtype: int64


# Question 5

In [4]:
import pandas as pd

repos_df = pd.read_csv("repositories.csv")

# Count repositories per language, then show the language with the largest count.
language_counts = repos_df['language'].value_counts()
language_counts.idxmax()

'JavaScript'

# Question 6

In [5]:
import pandas as pd

users_df = pd.read_csv("users.csv")
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Users whose account was created strictly after the start of 2020.
joined_after_cutoff = users_df['created_at'] > '2020-01-01'
recent_users = users_df[joined_after_cutoff]

repos_df = pd.read_csv("repositories.csv")

# Keep only the repositories owned by those users.
owned_by_recent_user = repos_df['login'].isin(recent_users['login'])
merged_df = repos_df[owned_by_recent_user]

# Drop repositories with no recorded language.
language_known = merged_df['language'].notna() & (merged_df['language'] != "")
valid_languages = merged_df[language_known]

language_counts = valid_languages['language'].value_counts()

# value_counts is ordered by descending count, so position 1 is the runner-up.
if len(language_counts) > 1:
    second_most_popular_language = language_counts.index[1]
else:
    second_most_popular_language = None

print(second_most_popular_language)


HTML


# Question 7

In [41]:
import pandas as pd

repos_df = pd.read_csv("repositories.csv")

# Only repositories with a recorded language take part in the averages.
language_known = repos_df['language'].notna() & (repos_df['language'] != "")
valid_repos = repos_df[language_known]

# Mean star count per language.
stars_by_language = valid_repos.groupby('language')['stargazers_count']
average_stars = stars_by_language.mean()

# The language with the highest mean, and that mean itself.
highest_avg_stars_language = average_stars.idxmax()
highest_avg_stars_count = average_stars.max()

print(highest_avg_stars_language, highest_avg_stars_count)

PureScript 114.0


# Question 8

In [None]:
import pandas as pd

users_df = pd.read_csv("users.csv")

# leader_strength = followers / (1 + following); the +1 keeps the ratio
# defined for users who follow nobody.
following_plus_one = users_df['following'].add(1)
users_df['leader_strength'] = users_df['followers'].div(following_plus_one)

# Strongest leaders first; keep the first five logins.
strongest_first = users_df.sort_values(by='leader_strength', ascending=False)
top_5_leaders = strongest_first['login'].head(5)

top_5_leaders_list = ",".join(top_5_leaders)
print(top_5_leaders_list)

dpryan79,wasserth,ravage84,elanmart,quadbiolab


# Question 9

In [4]:
import pandas as pd

users_df = pd.read_csv("users.csv")

# Correlation (Series.corr with its default method) between each user's
# follower count and public repository count.
followers = users_df['followers']
public_repos = users_df['public_repos']
followers.corr(public_repos)

np.float64(0.34491375135763774)

# Question 10

In [12]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

users_df = pd.read_csv("users.csv")

# Single-feature regression: followers explained by public repository count.
# The feature is selected with a list so X is a one-column DataFrame.
X = users_df[['public_repos']]
y = users_df['followers']

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# The only coefficient is the slope for public_repos.
followers_per_repo = model.coef_[0]
print(followers_per_repo)


0.6737356461766962


# Question 11

In [33]:
import pandas as pd

repos_df = pd.read_csv("repositories.csv")

# Correlation between the "projects enabled" and "wiki enabled" flags.
projects_enabled = repos_df['has_projects']
wiki_enabled = repos_df['has_wiki']
correlation = projects_enabled.corr(wiki_enabled)

print(correlation)


0.261897683016074


# Question 12

In [32]:
import pandas as pd

df = pd.read_csv("users.csv")

# A missing hireable value is not equal to True, so those users fall into the
# non-hireable group together with any explicit False values.
is_hireable = df['hireable'] == True

hireable_following = df.loc[is_hireable, 'following'].mean()
non_hireable_following = df.loc[~is_hireable, 'following'].mean()

# Difference of the two group means, rounded to three decimals.
difference = round(hireable_following - non_hireable_following, 3)
print(difference)

45.869


# Question 13

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

users_df = pd.read_csv("users.csv")

# Keep only users that actually have a bio. The .copy() detaches the filtered
# rows from users_df so that adding a column below is an ordinary assignment
# instead of a write to a slice (which raises SettingWithCopyWarning).
has_bio = users_df['bio'].notna() & (users_df['bio'] != "")
users_with_bio = users_df[has_bio].copy()

# Bio length measured in whitespace-separated words.
users_with_bio['bio_length'] = users_with_bio['bio'].str.split().str.len()

# Single-feature regression: followers explained by bio word count.
X = users_with_bio[['bio_length']]
y = users_with_bio['followers']

model = LinearRegression()
model.fit(X, y)

# The only coefficient is the slope for bio_length.
followers_per_word = model.coef_[0]

print(followers_per_word)


2.400439131719209


# Question 14

In [27]:
import pandas as pd

repos_df = pd.read_csv("repositories.csv")
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# dayofweek numbers Monday as 0, so values of 5 and above are Saturday and Sunday.
repos_df['is_weekend'] = repos_df['created_at'].dt.dayofweek >= 5

# Number of weekend-created repositories per owner.
weekend_repos = repos_df[repos_df['is_weekend']]
weekend_repo_count = weekend_repos.groupby('login').size()

# The five owners with the most weekend-created repositories.
top_weekend_users = weekend_repo_count.nlargest(5)

top_weekend_user_logins = ','.join(top_weekend_users.index)
print(top_weekend_user_logins)


dpryan79,syzer,ioolkos,maysam,pvillega


# Question 15

In [None]:
import pandas as pd

users_df = pd.read_csv("users.csv")

# A missing hireable value is not equal to True, so those users are counted
# in the non-hireable group.
is_hireable = users_df['hireable'] == True
has_email = users_df['email'].notna()

# The mean of a boolean Series is the share of True values.
fraction_with_email_hireable = has_email[is_hireable].mean()
fraction_with_email_non_hireable = has_email[~is_hireable].mean()

difference = fraction_with_email_hireable - fraction_with_email_non_hireable
print(difference)


0.06610564200275221


# Question 16

In [None]:
import pandas as pd

users_df = pd.read_csv("users.csv")

# Users without a name cannot contribute a surname.
users_df = users_df.dropna(subset=['name'])

# The surname is taken to be the last whitespace-separated word of the trimmed name.
name_words = users_df['name'].str.strip().str.split()
users_df['surname'] = name_words.str.get(-1)

surname_counts = users_df['surname'].value_counts()
max_count = surname_counts.max()

# Every surname tied for the highest count, listed alphabetically.
tied_for_first = surname_counts[surname_counts == max_count]
most_common_surnames = ','.join(sorted(tied_for_first.index))

print(most_common_surnames)


Arnold,Brand,Christensen,Fink,GmbH,Group,Guggisberg,Landolt,Roth,Tan
