In [None]:
import requests
import csv
import time

# Function to fetch users in Seattle with over 200 followers
def fetch_users(page=1, per_page=100):
    url = f'https://api.github.com/search/users?q=location:Seattle+followers:>200&page={page}&per_page={per_page}'
    headers = {'Authorization': 'token DUMMY'}
    response = requests.get(url, headers=headers)
    response_data = response.json()
    if response.status_code != 200:
        print(f"Error: {response_data.get('message', 'Unknown error')}")
    return response_data.get('items', [])

# Function to fetch user details
def fetch_user_details(username):
    url = f'https://api.github.com/users/{username}'
    headers = {'Authorization': 'token DUMMY'}
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Error fetching details for {username}: {response.json().get('message', 'Unknown error')}")
    return response.json()

# Function to clean up company names
def clean_company_name(company):
    if company:
        company = company.strip().replace('@', '').upper()
    return company or ''


# Function to create CSV file
def create_csv(users):
    with open('users.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Login', 'Name', 'Company', 'Location', 'Email', 'Hireable', 'Bio', 'Public Repos', 'Followers', 'Following', 'Created At'])
        for user in users:
            details = fetch_user_details(user['login'])
            writer.writerow([details['login'], details.get('name', ''), clean_company_name(details.get('company', '')),
                             details.get('location', ''), details.get('email', ''), details.get('hireable', ''),
                             details.get('bio', ''), details['public_repos'], details['followers'], details['following'], details['created_at']])

# Function to get all users across multiple pages
def get_all_users():
    users = []
    page = 1
    while True:
        current_users = fetch_users(page=page)
        if not current_users:
            break
        users.extend(current_users)
        print(f"Fetched {len(current_users)} users on page {page}")
        page += 1
        time.sleep(1)  # To avoid hitting rate limits
    return users

# Fetch all users and create CSV
all_users = get_all_users()
print(f"Total users fetched: {len(all_users)}")
create_csv(all_users)


Fetched 100 users on page 1
Fetched 100 users on page 2
Fetched 100 users on page 3
Fetched 100 users on page 4
Fetched 100 users on page 5
Fetched 18 users on page 6
Total users fetched: 518


In [None]:
# Function to fetch upto 500 recent repos
def fetch_repositories(username):
    url = f'https://api.github.com/users/{username}/repos?per_page=500'
    headers = {'Authorization': 'token DUMMY'}
    response = requests.get(url, headers=headers)
    return response.json()


# Function to create repositories CSV file
def create_repositories_csv(users):
    with open('/content/repositories.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Login', 'Full Name', 'Created At', 'Stargazers Count', 'Watchers Count', 'Language', 'Has Projects', 'Has Wiki', 'License Name'])
        for user in users:
            repos = fetch_repositories(user['login'])
            count = 0
            for repo in repos:
                if count >= 500:
                    break
                license_name = repo['license']['key'] if repo['license'] else ''
                writer.writerow([user['login'], repo['full_name'], repo['created_at'], repo['stargazers_count'], repo['watchers_count'],
                                 repo['language'], repo['has_projects'], repo['has_wiki'], license_name])
                count += 1

create_repositories_csv(all_users)

In [None]:
import pandas as pd
df = pd.read_csv("/content/repositories.csv")

In [None]:
# Most popular programming language
language_counts = df['Language'].value_counts()
print(language_counts)

Language
JavaScript            4752
Python                3485
Ruby                  1434
HTML                  1342
C#                    1330
                      ... 
Forth                    1
mIRC Script              1
Ragel in Ruby Host       1
sed                      1
AutoIt                   1
Name: count, Length: 179, dtype: int64


In [None]:
#the 3 most popular license among these users
language_counts = df['license_name'].value_counts()
print(language_counts)

license_name
mit                   8668
apache-2.0            3994
other                 3202
bsd-3-clause           822
gpl-3.0                618
gpl-2.0                267
bsd-2-clause           253
cc-by-4.0              164
cc0-1.0                162
unlicense              104
isc                     93
agpl-3.0                92
epl-1.0                 82
mpl-2.0                 82
upl-1.0                 66
mit-0                   65
lgpl-3.0                53
lgpl-2.1                39
cc-by-sa-4.0            31
ofl-1.1                 29
wtfpl                   18
bsl-1.0                 14
zlib                     8
epl-2.0                  7
0bsd                     6
ms-pl                    4
bsd-4-clause             3
artistic-2.0             2
osl-3.0                  2
vim                      2
ncsa                     2
bsd-3-clause-clear       1
Name: count, dtype: int64


In [None]:
# languages after 2020
df['created_at'] = pd.to_datetime(df['created_at'])
filtered_df = df[df['created_at'] >= '2021-01-01']

In [None]:
language_counts = filtered_df ['language'].value_counts()
print(language_counts)

language
Python        842
JavaScript    717
TypeScript    663
C#            359
HTML          317
             ... 
Stylus          1
Less            1
Max             1
Elixir          1
BASIC           1
Name: count, Length: 108, dtype: int64


In [None]:
# Find the language with the highest average stars
# Group by language and calculate the average stargazers count
average_stars_per_language = df.groupby('language')['stargazers_count'].mean()
max_avg_stars_language = average_stars_per_language.idxmax()
max_avg_stars_value = average_stars_per_language.max()

print(f"Language with the highest average stars: {max_avg_stars_language}, Average Stars: {max_avg_stars_value:.2f}")

Language with the highest average stars: Vim script, Average Stars: 372.60


In [None]:
# Correlation between projects and wiki enabled
df1 = pd.read_csv("/content/users.csv")
correlation = df['has_wiki'].corr(df['has_projects'])
print(correlation)

0.31944858724553993


In [None]:
#Correlation between followers and repos
import scipy.stats as stats
x = df1['public_repos']
y = df1['followers']

slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print(f"Regression slope: {slope}")

Regression slope: 2.4282595517416175


In [None]:
# Average of following per user for hireable=true minus the average following for the rest
avg_following_hireable = df1[df1['hireable'] == True]

avg_following_non_hireable = df1[df1['hireable'] != True]['following'].mean()

diff_avg_following = avg_following_hireable - avg_following_non_hireable

print(f"Difference in average following: {diff_avg_following}")

In [None]:
avg_following_hireable.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at,surname,bio_word_count
4,ahmetb,Ahmet Alp Balkan,LINKEDIN,"Seattle, WA",github@ahmet.im,True,Working on compute orchestration with Kubernet...,221,8212,34,2009-11-28T14:59:59Z,Balkan,9
8,TheLarkInn,Sean Larkin,MICROSOFT MICROSOFTEDGE WEBPACK WEBPACK-CONTRI...,"Seattle, WA",selarkin@microsoft.com,True,"Software Engineer @microsoft. Javascript, webp...",265,6404,71,2013-01-28T17:11:47Z,Larkin,19
20,schollz,Zack,,"Seattle, WA",zack.scholl@gmail.com,True,Software Engineer + Scientist,1078,3077,200,2014-01-31T01:31:11Z,Zack,4
22,aidenybai,Aiden Bai,,portland ↝ seattle,,True,on the search for capybaras ʕ•ᴥ•ʔ,149,2739,32,2018-04-03T03:08:03Z,Bai,6
25,hyperb1iss,Stefanie Jane,MASON,"Seattle, WA",,True,"Light, Sound, and Android 🌠",27,2587,61,2009-07-06T14:26:47Z,Jane,5


In [None]:
# Getting the most common surnames and their count

df1 = df1.dropna(subset=['name'])
df1['surname'] = df1['name'].str.strip().str.split().str[-1]
surname_counts = df1['surname'].value_counts()

max_count = surname_counts.max()
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()

print(f"Most common surname(s): {', '.join(sorted(most_common_surnames))}")
print(f"Number of users with the most common surname: {max_count}")

Most common surname(s): Wang
Number of users with the most common surname: 6


In [None]:
# Filter for repos at  weekends (Saturday = 5, Sunday = 6)

df['created_at'] = pd.to_datetime(df['created_at'], utc=True)
df['day_of_week'] = df['created_at'].dt.dayofweek
weekends = df[df['day_of_week'].isin([5, 6])]

# Count repositories per user created on weekends
weekend_repo_counts = weekends.groupby('login').size().sort_values(ascending=False).head(5)
top_5_users = weekend_repo_counts.index.tolist()

print(f"Top 5 users by repositories created on weekends (UTC): {', '.join(top_5_users)}")

Top 5 users by repositories created on weekends (UTC): svermeulen, homebysix, ryanoasis, eugeneyan, anvaka


In [None]:
weekend_repo_counts

Unnamed: 0_level_0,0
login,Unnamed: 1_level_1
svermeulen,47
homebysix,46
ryanoasis,42
eugeneyan,39
anvaka,39


In [None]:
import unicodedata
import scipy.stats as stats

df1 = df1.dropna(subset=['bio'])

# Calculate word count for each bio using .loc with Unicode handling
df1.loc[:, 'bio_word_count'] = df1['bio'].apply(lambda x: len(unicodedata.normalize('NFC', x).split()))

slope, intercept, r_value, p_value, std_err = stats.linregress( df1['followers'],df1['bio_word_count'])

print(f"Regression slope: {slope:.5f}")

Regression slope: -0.00007


In [None]:
# Fraction of users with email for hireable=true
fraction_hireable_with_email = df1[df1['hireable'] == True]['email'].notna().mean()

# Fraction of users with email for hireable=false or missing
fraction_non_hireable_with_email = df1[df1['hireable'] != True]['email'].notna().mean()

# Calculate the difference
fraction_difference = fraction_hireable_with_email - fraction_non_hireable_with_email

print(f"Difference in fractions: {fraction_difference:.4f}")

Difference in fractions: 0.0894
