In [None]:
import requests
import csv
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd

# GitHub API token and headers
#*********************************************************************************
GITHUB_TOKEN = ""      ############### PUT   YOUR   TOKEN      HERE ##############
CITY='Chicago'    ############### PUT   YOUR   CITY      HERE ##############
FOLLOWERS=100     ############### PUT   YOUR   FOLLOWERS      HERE ##############
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

# Helper function to clean up company names
def clean_company_name(company):
    return company.strip().lstrip('@').upper() if company else None

# Function to fetch users from the GitHub API
def fetch_users(city=CITY, min_followers=FOLLOWERS):
    users = []
    page = 1
    while True:
        url = f"https://api.github.com/search/users?q=location:{city}+followers:>{min_followers}&page={page}&per_page=100"
        response = requests.get(url, headers=HEADERS)
        data = response.json()

        if 'items' not in data or not data['items']:
            break  # Stop if no users found

        for user in data['items']:
            # Get full user info
            user_url = user['url']
            user_response = requests.get(user_url, headers=HEADERS)
            user_data = user_response.json()

            # Extract required fields
            users.append({
                'login': user_data['login'],
                'name': user_data['name'],
                'company': clean_company_name(user_data['company']),
                'location': user_data['location'],
                'email': user_data['email'],
                'hireable': user_data['hireable'],
                'bio': user_data['bio'],
                'public_repos': user_data['public_repos'],
                'followers': user_data['followers'],
                'following': user_data['following'],
                'created_at': user_data['created_at'],
            })

        page += 1
        time.sleep(1)  # To avoid hitting rate limits

    return users

# Function to fetch repositories for a user with a max limit of 500
def fetch_repositories(user_login, max_repos=500):
    repositories = []
    page = 1
    while len(repositories) < max_repos:
        url = f"https://api.github.com/users/{user_login}/repos?sort=pushed&per_page=100&page={page}"
        response = requests.get(url, headers=HEADERS)
        repo_data = response.json()

        if not repo_data:
            break

        for repo in repo_data[:max_repos - len(repositories)]:
            repositories.append({
                'login': user_login,
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': repo['license']['key'] if repo['license'] else None,
            })

        if len(repo_data) < 100:
            break

        page += 1
        time.sleep(0.5)  # Minimal delay to avoid hitting rate limits

    return repositories

# Function to save data to CSV
def save_to_csv(data, filename):
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)

def main():
    print("Fetching users...")
    users = fetch_users()
    save_to_csv(users, "users.csv")
    users_df = pd.read_csv('users.csv')

    # Check the data types and structure
    print(users_df.head())

    # Replace True/False with true/false in the hireable column
    users_df['hireable'] = users_df['hireable'].replace({True: 'true', False: 'false'})

    # Save the modified DataFrame back to the same CSV file
    users_df.to_csv('users.csv', index=False)

    # Check the data types and structure
    print(users_df.head())


    #print("Updated CSV file saved successfully.")
    print(f"Saved {len(users)} users to users.csv")

    print("Fetching repositories in parallel...")
    all_repositories = []

    # Use ThreadPoolExecutor to fetch repositories for multiple users concurrently
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(fetch_repositories, user["login"]): user for user in users}
        for future in as_completed(futures):
            user = futures[future]
            try:
                user_repos = future.result()
                all_repositories.extend(user_repos)
                print(f"Fetched {len(user_repos)} repositories for user {user['login']}")
            except Exception as e:
                print(f"Error fetching repositories for {user['login']}: {e}")


    save_to_csv(all_repositories, "repositories.csv")
    repositories_df = pd.read_csv('repositories.csv')

    # Check the data types and structure
    print(repositories_df.head())

    # Replace True/False with true/false
    repositories_df['has_projects'] = repositories_df['has_projects'].replace({True: 'true', False: 'false'})
    repositories_df['has_wiki'] = repositories_df['has_wiki'].replace({True: 'true', False: 'false'})

    # Save the modified DataFrame back to the same CSV file
    repositories_df.to_csv('repositories.csv', index=False)

    # Check the data types and structure
    print(repositories_df.head())

    #print("Updated CSV file saved successfully.")
    print(f"Saved {len(all_repositories)} repositories to repositories.csv")

if __name__ == "__main__":
    main()


Fetching users...
         login              name                     company     location  \
0     cassidoo  Cassidy Williams                      GITHUB  Chicago, IL   
1     felangel     Felix Angelov               SHOREBIRDTECH      Chicago   
2       dabeaz     David Beazley                 DABEAZ, LLC      Chicago   
3  sstephenson    Sam Stephenson                         NaN      Chicago   
4  mattgodbolt      Matt Godbolt  AQUATIC CAPITAL MANAGEMENT  Chicago, IL   

                  email hireable  \
0                   NaN      NaN   
1  felangelov@gmail.com      NaN   
2       dave@dabeaz.com      NaN   
3          sam@sls.name      NaN   
4      matt@godbolt.org      NaN   

                                                 bio  public_repos  followers  \
0            Making memes and dreams... and software           165      13382   
1  software engineer by day, software engineer by...           125       8678   
2  Author of the Python Essential Reference (Addi...       

In [3]:
#2
import pandas as pd

# Load the users data from the CSV file
users_df = pd.read_csv("users.csv")

# Ensure that 'created_at' is in datetime format
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Sort by 'created_at' in ascending order
sorted_users = users_df.sort_values(by='created_at', ascending=True)

# Select the first 5 users' 'login' values
earliest_users = sorted_users['login'].head(5).tolist()

# Join the logins into a comma-separated string
result = ", ".join(earliest_users)

print("Earliest registered users:", result)



Earliest registered users: ELLIOTTCABLE, trevorturk, lukehoersten, djspiewak, shanesveller


In [4]:
#3
import pandas as pd

# Load the repositories data from the CSV file
repos_df = pd.read_csv("repositories.csv")

# Filter out rows where 'license_name' is missing
filtered_repos = repos_df[repos_df['license_name'].notna()]

# Count occurrences of each license
license_counts = filtered_repos['license_name'].value_counts()

# Get the top 3 most popular licenses
top_licenses = license_counts.head(3).index.tolist()

# Join the licenses into a comma-separated string
result = ", ".join(top_licenses)

print("Top 3 most popular licenses:", result)


Top 3 most popular licenses: mit, other, apache-2.0


In [5]:
#4
import pandas as pd

# Load the users data from the CSV file
users_df = pd.read_csv("users.csv")

# Function to clean company names
def clean_company_name(name):
    if pd.isna(name):  # Handle missing values
        return ""
    name = name.strip()  # Trim whitespace
    if name.startswith("@"):
        name = name[1:]  # Remove leading '@'
    return name.upper()  # Convert to uppercase

# Apply the cleaning function to the 'company' column
users_df['company'] = users_df['company'].apply(clean_company_name)

# Filter out empty company entries
filtered_companies = users_df[users_df['company'] != ""]

# Count occurrences of each company
company_counts = filtered_companies['company'].value_counts()

# Identify the company with the highest count
most_common_company = company_counts.idxmax()

print("Company with the majority of developers:", most_common_company)


Company with the majority of developers: UNIVERSITY OF CHICAGO


In [6]:
#5
import pandas as pd

# Load the repositories data
repos_df = pd.read_csv("repositories.csv")

# Filter out rows where the language is missing
filtered_languages = repos_df[repos_df['language'].notna()]

# Count occurrences of each language
language_counts = filtered_languages['language'].value_counts()

# Get the most popular language
most_popular_language = language_counts.idxmax()

print("Most popular programming language:", most_popular_language)


Most popular programming language: JavaScript


In [7]:
#6
import pandas as pd

# Load the data
users_df = pd.read_csv("users.csv")
repos_df = pd.read_csv("repositories.csv")

# Convert 'created_at' to datetime format and filter users who joined after 2020
users_df['created_at'] = pd.to_datetime(users_df['created_at'])
recent_users = users_df[users_df['created_at'] > '2020-01-01']

# Filter repositories for these users
filtered_repos = repos_df[repos_df['login'].isin(recent_users['login'])]

# Count occurrences of each language, ignoring missing languages
language_counts = filtered_repos['language'].dropna().value_counts()

# Get the second most popular language
second_most_popular_language = language_counts.index[1] if len(language_counts) > 1 else None

print("Second most popular programming language among users who joined after 2020:", second_most_popular_language)


Second most popular programming language among users who joined after 2020: JavaScript


In [8]:
#7
import pandas as pd

# Load the repositories data
repos_df = pd.read_csv("repositories.csv")

# Filter out rows with missing language or stargazers_count values
repos_df = repos_df.dropna(subset=['language', 'stargazers_count'])

# Group by language and calculate the average number of stars per repository
avg_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean()

# Find the language with the highest average stars
most_popular_language = avg_stars_per_language.idxmax()
highest_avg_stars = avg_stars_per_language.max()

print("Language with the highest average stars per repository:", most_popular_language)
print("Average stars:", round(highest_avg_stars, 3))


Language with the highest average stars per repository: Vim Script
Average stars: 647.682


In [None]:
#8
import pandas as pd

# Load the users data
users_df = pd.read_csv('users.csv')

# Calculate leader_strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Sort by leader_strength in descending order and select the top 5 users
top_5_leaders = users_df.sort_values(by='leader_strength', ascending=False).head(5)

# Print the top 5 users' logins as a comma-separated string
top_5_logins = ", ".join(top_5_leaders['login'])
print(f"Top 5 users in terms of leader_strength: {top_5_logins}")


Top 5 users in terms of leader_strength: dabeaz, sstephenson, khan4019, adashofdata, djspiewak


In [None]:
#9
import pandas as pd

# Load the users data
users_df = pd.read_csv('users.csv')

# Filter for users in Chicago
chicago_users = users_df[users_df['location'].str.contains('Chicago', case=False, na=False)]

# Calculate the correlation between followers and public repositories
correlation = chicago_users['followers'].corr(chicago_users['public_repos'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation between followers and public repositories: {correlation:.3f}")



Correlation between followers and public repositories: 0.077


In [None]:
#10
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

# Load the users data
users_df = pd.read_csv('users.csv')

# Filter for users in Chicago (if required)
chicago_users = users_df[users_df['location'].str.contains('Chicago', case=False, na=False)]

# Reshape the data for sklearn's LinearRegression
X = chicago_users['public_repos'].values.reshape(-1, 1)
y = chicago_users['followers'].values

# Create and fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Get the regression slope (followers per additional public repository)
slope = model.coef_[0]

# Print the slope rounded to 3 decimal places
print(f"Regression slope of followers on repos: {slope:.3f}")


Regression slope of followers on repos: 0.617


In [None]:
#11
import pandas as pd

# Load the repositories data
repositories_df = pd.read_csv('repositories.csv')

# Convert boolean columns to integers for correlation calculation
repositories_df['has_projects'] = repositories_df['has_projects'].astype(int)
repositories_df['has_wiki'] = repositories_df['has_wiki'].astype(int)

# Calculate the correlation between projects and wiki enabled
correlation = repositories_df['has_projects'].corr(repositories_df['has_wiki'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation between projects and wiki enabled: {correlation:.3f}")



Correlation between projects and wiki enabled: 0.288


In [None]:
#12
import pandas as pd

def analyze_following_difference(users_csv_path='users.csv'):
    # Read the data
    df = pd.read_csv(users_csv_path)

    # Calculate average following for hireable users
    hireable_following = df[df['hireable'] == True]['following'].mean()

    # Calculate average following for non-hireable users
    non_hireable_following = df[df['hireable'] != True]['following'].mean()

    # Calculate the difference rounded to 3 decimal places
    difference = round(hireable_following - non_hireable_following, 3)

    # Print debug information
    print(f"Number of hireable users: {len(df[df['hireable'] == True])}")
    print(f"Number of non-hireable users: {len(df[df['hireable'] != True])}")
    print(f"Average following for hireable users: {hireable_following:.3f}")
    print(f"Average following for non-hireable users: {non_hireable_following:.3f}")

    return difference

# Calculate the difference
result = analyze_following_difference()
print(f"\nDifference in average following: {result:.3f}")

Number of hireable users: 94
Number of non-hireable users: 282
Average following for hireable users: 214.936
Average following for non-hireable users: 103.248

Difference in average following: 111.688


In [None]:
#13
import pandas as pd
import statsmodels.api as sm

def analyze_bio_impact(users_csv_path='users.csv'):
    # Read the data
    df = pd.read_csv(users_csv_path)

    # Filter out users without bios
    df = df[df['bio'].notna()]

    # Calculate the length of each bio in words
    df['bio_word_count'] = df['bio'].apply(lambda x: len(x.split()))

    # Define the dependent and independent variables
    X = df['bio_word_count']  # Independent variable
    y = df['followers']        # Dependent variable

    # Add a constant to the independent variable for the regression model
    X = sm.add_constant(X)

    # Fit the regression model
    model = sm.OLS(y, X).fit()

    # Get the slope (coefficient for bio_word_count)
    slope = model.params['bio_word_count']

    return round(slope, 3)

# Calculate the regression slope
slope_result = analyze_bio_impact()
print(f"Regression slope of followers on bio word count: {slope_result:.3f}")


Regression slope of followers on bio word count: 3.047


In [None]:
#14
import pandas as pd

def top_weekend_repository_creators(users_csv_path='repositories.csv'):
    # Read the repositories data
    df = pd.read_csv(users_csv_path)

    # Convert 'created_at' to datetime
    df['created_at'] = pd.to_datetime(df['created_at'])

    # Filter for weekend days (Saturday and Sunday)
    weekend_repos = df[df['created_at'].dt.dayofweek >= 5]

    # Count repositories created by each user
    top_users = weekend_repos['login'].value_counts().head(5)

    # Get the top 5 users' logins in order
    return ', '.join(top_users.index)

# Get the top users
top_users_logins = top_weekend_repository_creators()
print(f"Top 5 users who created the most repositories on weekends: {top_users_logins}")


Top 5 users who created the most repositories on weekends: marwahaha, eddelbuettel, sabre1041, erichilarysmithsr, yyolk


In [None]:
#15
import pandas as pd

def analyze_email_sharing(users_csv_path='users.csv'):
    # Read the data
    df = pd.read_csv(users_csv_path)

    # Calculate the total number of hireable users and those with email addresses
    hireable_total = df[df['hireable'] == True].shape[0]
    hireable_with_email = df[(df['hireable'] == True) & (df['email'].notna())].shape[0]

    # Calculate the total number of non-hireable users and those with email addresses
    non_hireable_total = df[df['hireable'] != True].shape[0]
    non_hireable_with_email = df[(df['hireable'] != True) & (df['email'].notna())].shape[0]

    # Calculate fractions
    hireable_email_fraction = hireable_with_email / hireable_total if hireable_total > 0 else 0
    non_hireable_email_fraction = non_hireable_with_email / non_hireable_total if non_hireable_total > 0 else 0

    # Calculate the difference rounded to 3 decimal places
    difference = round(hireable_email_fraction - non_hireable_email_fraction, 3)

    return difference

# Calculate the difference in email sharing
email_sharing_difference = analyze_email_sharing()
print(f"Difference in email sharing: {email_sharing_difference:.3f}")


Difference in email sharing: 0.060


In [2]:
#16
import pandas as pd

# Load the users data from the CSV file
users_df = pd.read_csv("users.csv")

# Filter out rows with missing names
names_with_surnames = users_df.dropna(subset=['name'])

# Extract the last word in each name as the surname
names_with_surnames['surname'] = names_with_surnames['name'].apply(lambda x: x.strip().split()[-1])

# Count the frequency of each surname
surname_counts = names_with_surnames['surname'].value_counts()

# Find the maximum count
max_count = surname_counts.max()

# Get all surnames with the maximum count
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()

# Sort the surnames alphabetically if there are ties
most_common_surnames.sort()

# Join the surnames in a comma-separated string
result = ", ".join(most_common_surnames)

print("Most common surname(s):", result)


Most common surname(s): Smith


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  names_with_surnames['surname'] = names_with_surnames['name'].apply(lambda x: x.strip().split()[-1])
