Code for User.csv

In [None]:
import requests
import pandas as pd
import time
from requests.adapters import HTTPAdapter
from requests.exceptions import RequestException
from urllib3.util.retry import Retry

# GitHub API endpoint and headers
GITHUB_API_URL = "https://api.github.com"
GITHUB_TOKEN = "Replace with your GitHub token"  # Replace with your GitHub token
HEADERS = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

# Session with retry and backoff
session = requests.Session()
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.headers.update(HEADERS)

def fetch_sydney_users(min_followers=100):
    """Fetch GitHub users in Sydney with more than the specified number of followers."""
    users = []
    page = 1

    while True:
        try:
            url = f"{GITHUB_API_URL}/search/users?q=location:Sydney+followers:>{min_followers}&page={page}&per_page=100"
            response = session.get(url, timeout=10)

            if response.status_code != 200:
                print(f"Error: {response.status_code} - {response.text}")
                break

            data = response.json()
            if not data['items']:
                break

            for user in data['items']:
                user_details = fetch_user_details(user["login"], user["url"])
                if user_details:
                    users.append(user_details)

            page += 1
            if page > 10:
                break

            time.sleep(1)

        except RequestException as e:
            print(f"Request failed: {e}")
            break

    return users

def fetch_user_details(login, user_detail_url):
    """Fetch detailed information for a specific user."""
    response = session.get(user_detail_url, timeout=10)
    if response.status_code == 200:
        user_data = response.json()
        return {
            "login": user_data.get("login", ""),
            "name": user_data.get("name", ""),
            "company": (user_data.get("company") or "").lstrip("@").upper().strip(),
            "location": user_data.get("location", ""),
            "email": user_data.get("email", ""),
            "hireable": user_data.get("hireable", ""),
            "bio": user_data.get("bio", ""),
            "public_repos": user_data.get("public_repos", 0),
            "followers": user_data.get("followers", 0),
            "following": user_data.get("following", 0),
            "created_at": user_data.get("created_at", "")
        }
    else:
        error_message = response.json().get('message', 'No additional error info')
        print(f"Failed to fetch details for {login}: {response.status_code} - {error_message}")
    return None

# Fetch users and save to 'users.csv'
sydney_users = fetch_sydney_users()
df_users = pd.DataFrame(sydney_users)
df_users.to_csv("users.csv", index=False)
print("Detailed user data saved to users.csv successfully.")


Detailed user data saved to users.csv successfully.


Code to check validate_company

In [None]:
import pandas as pd

# Load users from the CSV file
users_df = pd.read_csv("users.csv")

# Function to check and clean up company names
def validate_company(company):
    if isinstance(company, str):  # Ensure company is a string
        # Trim whitespace and clean company name
        cleaned_company = company.strip().lstrip('@').upper()
        return cleaned_company
    return ""

# List to store errors
errors = []

# Validate and report on company data
for index, row in users_df.iterrows():
    original_company = row["company"]
    cleaned_company = validate_company(original_company)

    if cleaned_company != original_company:
        errors.append(f"Company discrepancy for user {row['login']}: Original: '{original_company}', Cleaned: '{cleaned_company}'")

# Print errors or confirmation message
if errors:
    for error in errors:
        print(error)
else:
    print("No errors found.")


Company discrepancy for user nicknochnack: Original: 'nan', Cleaned: ''
Company discrepancy for user 0vm: Original: 'nan', Cleaned: ''
Company discrepancy for user dragen1860: Original: 'nan', Cleaned: ''
Company discrepancy for user Canva: Original: 'nan', Cleaned: ''
Company discrepancy for user jpillora: Original: 'nan', Cleaned: ''
Company discrepancy for user alecthomas: Original: 'nan', Cleaned: ''
Company discrepancy for user johndpope: Original: 'nan', Cleaned: ''
Company discrepancy for user djnavarro: Original: 'nan', Cleaned: ''
Company discrepancy for user JayZeeDesign: Original: 'nan', Cleaned: ''
Company discrepancy for user tamim: Original: 'nan', Cleaned: ''
Company discrepancy for user stevemao: Original: 'nan', Cleaned: ''
Company discrepancy for user cironunes: Original: 'nan', Cleaned: ''
Company discrepancy for user matthewpalmer: Original: 'nan', Cleaned: ''
Company discrepancy for user ndleah: Original: 'nan', Cleaned: ''
Company discrepancy for user MaikuB: Orig

#more cleaned users.csv

In [None]:
#more cleaned users.csv
import pandas as pd

# Load the data
users_df = pd.read_csv('users.csv')

# Clean the company names
users_df['company'] = users_df['company'].str.strip()  # Trim whitespace
users_df['company'] = users_df['company'].str.lstrip('@')  # Strip leading '@'
users_df['company'] = users_df['company'].str.upper()  # Convert to uppercase

# Save the cleaned DataFrame back to users.csv
users_df.to_csv('users.csv', index=False)

print("Company names cleaned and saved to users.csv.")


Company names cleaned and saved to users.csv.


Download User.csv

In [None]:
from google.colab import files

# Download users.csv
files.download("users.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Code for repositories.csv

In [None]:
import requests
import pandas as pd
import time
from requests.adapters import HTTPAdapter
from requests.exceptions import RequestException
from urllib3.util.retry import Retry

# GitHub API endpoint and headers
GITHUB_API_URL = "https://api.github.com"
GITHUB_TOKEN = "Replace with your GitHub token"  # Replace with your GitHub token
HEADERS = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

# Session with retry and backoff
session = requests.Session()
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.headers.update(HEADERS)

def fetch_user_repositories(login):
    """Fetch public repositories for a given user."""
    repos = []
    page = 1
    while True:
        try:
            url = f"{GITHUB_API_URL}/users/{login}/repos?page={page}&per_page=100"
            response = session.get(url, timeout=10)

            if response.status_code != 200:
                print(f"Error fetching repos for {login}: {response.status_code} - {response.text}")
                break

            repo_data = response.json()
            if not repo_data:
                break

            for repo in repo_data:
                repos.append({
                    "login": login,
                    "full_name": repo.get("full_name", ""),
                    "created_at": repo.get("created_at", ""),
                    "stargazers_count": repo.get("stargazers_count", 0),
                    "watchers_count": repo.get("watchers_count", 0),
                    "language": repo.get("language", ""),
                    "has_projects": repo.get("has_projects", False),
                    "has_wiki": repo.get("has_wiki", False),
                    "license_name": repo.get("license").get("key", "") if repo.get("license") else ""
                })
            page += 1
            time.sleep(1)  # To avoid hitting the rate limit
        except RequestException as e:
            print(f"Request failed for {login}: {e}")
            break

    return repos

# Read users from 'users.csv'
users_df = pd.read_csv("users.csv")

# Initialize a list to store repository data
all_repos = []

# Loop through each user and fetch their repositories
for index, row in users_df.iterrows():
    login = row["login"]
    print(f"Fetching repositories for user: {login}")
    user_repos = fetch_user_repositories(login)
    all_repos.extend(user_repos)  # Append fetched repositories

# Save repositories to 'repositories.csv'
df_repositories = pd.DataFrame(all_repos)
df_repositories.to_csv("repositories.csv", index=False)
print("Repository data saved to repositories.csv successfully.")


Fetching repositories for user: nicknochnack
Fetching repositories for user: brendangregg
Fetching repositories for user: cornflourblue
Fetching repositories for user: 0vm
Fetching repositories for user: davecheney
Fetching repositories for user: JedWatson
Fetching repositories for user: dragen1860
Fetching repositories for user: thombergs
Fetching repositories for user: DmitryBaranovskiy
Fetching repositories for user: redguardtoo
Fetching repositories for user: sanjay-kv
Fetching repositories for user: Canva
Fetching repositories for user: orhanobut
Fetching repositories for user: tonybaloney
Fetching repositories for user: dmytrodanylyk
Fetching repositories for user: MaryamAustralia
Fetching repositories for user: haskellcamargo
Fetching repositories for user: jpillora
Fetching repositories for user: adg
Fetching repositories for user: SamSaffron
Fetching repositories for user: alecthomas
Fetching repositories for user: johndpope
Fetching repositories for user: djnavarro
Fetching r

Code to Download repositories.csv from Google Colab

In [None]:
from google.colab import files

# Assuming 'repositories.csv' is the file you want to download
file_name = 'repositories.csv'  # Change this to your file name if needed

# Download the file
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

1. Who are the top 5 users in Sydney with the highest number of followers? List their login in order, comma-separated.

In [None]:
import pandas as pd

# Load the data
users_df = pd.read_csv('users.csv')

# Filter users in Sydney and sort by followers
top_users = users_df[users_df['location'].str.contains('Sydney', case=False)]
top_users = top_users.sort_values(by='followers', ascending=False)

# Get the top 5 users
top_5_users = top_users.head(5)['login'].tolist()

# Print the logins in order, comma-separated
print(", ".join(top_5_users))


nicknochnack, brendangregg, cornflourblue, 0vm, davecheney


2. Who are the 5 earliest registered GitHub users in Sydney? List their login in ascending order of created_at, comma-separated.


In [21]:
import pandas as pd

# Load the data
users_df = pd.read_csv('users.csv')

# Filter users in Sydney and sort by created_at
sydney_users = users_df[users_df['location'].str.contains('Sydney', case=False)]
earliest_users = sydney_users.sort_values(by='created_at')

# Get the 5 earliest registered users
earliest_5_users = earliest_users.head(5)['login'].tolist()

# Print the logins in ascending order, comma-separated
print(", ".join(earliest_5_users))


dylanegan, cjheath, freshtonic, dhowden, mikel


3. What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [22]:
import pandas as pd

# Load the repository data
repos_df = pd.read_csv('repositories.csv')

# Filter out rows with missing license names
valid_licenses = repos_df[repos_df['license_name'] != ""]

# Count the occurrences of each license and get the top 3
top_licenses = valid_licenses['license_name'].value_counts().head(3)

# Print the license names in order, comma-separated
print(", ".join(top_licenses.index))


mit, other, apache-2.0


4. Which company do the majority of these developers work at?
Company (cleaned up as explained above)

In [23]:
import pandas as pd

# Load the cleaned user data
users_df = pd.read_csv('users.csv')

# Count occurrences of each company
company_counts = users_df['company'].value_counts()

# Get the most common company
most_common_company = company_counts.idxmax()
most_common_count = company_counts.max()

print(f"The majority of developers work at: {most_common_company} (Count: {most_common_count})")


The majority of developers work at: ATLASSIAN (Count: 19)


***5***. Which programming language is most popular among these users?
Language

In [24]:
import pandas as pd

# Load the repository data
repos_df = pd.read_csv('repositories.csv')

# Count occurrences of each programming language
language_counts = repos_df['language'].value_counts()

# Get the most popular language
most_popular_language = language_counts.idxmax()
most_popular_count = language_counts.max()

print(f"The most popular programming language among these users is: {most_popular_language} (Count: {most_popular_count})")


The most popular programming language among these users is: JavaScript (Count: 6135)


6. Which programming language is the second most popular among users who joined after 2020?
Language

In [25]:
import pandas as pd

# Load the user data
users_df = pd.read_csv('users.csv')

# Load the repository data
repos_df = pd.read_csv('repositories.csv')

# Convert the 'created_at' column to datetime
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Filter users who joined after 2020
users_after_2020 = users_df[users_df['created_at'] > '2020-01-01']

# Get logins of filtered users
logins_after_2020 = users_after_2020['login'].tolist()

# Filter repositories for those users
filtered_repos = repos_df[repos_df['login'].isin(logins_after_2020)]

# Count occurrences of each programming language
language_counts = filtered_repos['language'].value_counts()

# Get the second most popular language
second_most_popular_language = language_counts.index[1] if len(language_counts) > 1 else None
second_most_popular_count = language_counts.iloc[1] if len(language_counts) > 1 else 0

print(f"The second most popular programming language among users who joined after 2020 is: {second_most_popular_language} (Count: {second_most_popular_count})")


The second most popular programming language among users who joined after 2020 is: TypeScript (Count: 22)


7. Which language has the highest average number of stars per repository?
Language

In [26]:
import pandas as pd

# Load the repository data
repos_df = pd.read_csv('repositories.csv')

# Group by language and calculate the average number of stars
average_stars = repos_df.groupby('language')['stargazers_count'].mean()

# Identify the language with the highest average number of stars
highest_avg_stars_language = average_stars.idxmax()
highest_avg_stars_value = average_stars.max()

print(f"The language with the highest average number of stars per repository is: {highest_avg_stars_language} (Average Stars: {highest_avg_stars_value:.2f})")


The language with the highest average number of stars per repository is: Mermaid (Average Stars: 505.00)


8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.
User login

In [27]:
import pandas as pd

# Load the user data
users_df = pd.read_csv('users.csv')

# Calculate leader strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Get the top 5 users by leader strength
top_leaders = users_df.nlargest(5, 'leader_strength')

# List the logins in order
top_leader_logins = top_leaders['login'].tolist()
top_leader_logins_str = ', '.join(top_leader_logins)

print(f"The top 5 users in terms of leader strength are: {top_leader_logins_str}")


The top 5 users in terms of leader strength are: brendangregg, cornflourblue, Canva, nicknochnack, 0vm


9. What is the correlation between the number of followers and the number of public repositories among users in Sydney?
Correlation between followers and repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [28]:
import pandas as pd

# Load the user data
users_df = pd.read_csv('users.csv')

# Calculate the correlation between followers and public repositories
correlation = users_df['followers'].corr(users_df['public_repos'])

# Print the correlation rounded to three decimal places
print(f"The correlation between the number of followers and the number of public repositories is: {correlation:.3f}")


The correlation between the number of followers and the number of public repositories is: 0.035


10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.
Regression slope of followers on repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [29]:
import pandas as pd
import statsmodels.api as sm

# Load the user data
users_df = pd.read_csv('users.csv')

# Define the independent variable (public_repos) and dependent variable (followers)
X = users_df['public_repos']
y = users_df['followers']

# Add a constant to the independent variable
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the slope for the public_repos variable
slope = model.params['public_repos']

# Print the slope rounded to three decimal places
print(f"The regression slope of followers on public repositories is: {slope:.3f}")


The regression slope of followers on public repositories is: 0.068


11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?
Correlation between projects and wiki enabled (to 3 decimal places, e.g. 0.123 or -0.123)

In [44]:
import pandas as pd

# Load the repositories data
repos_df = pd.read_csv('repositories.csv')

# Create binary columns for projects and wiki enabled (1 for True, 0 for False)
repos_df['projects_enabled'] = repos_df['has_projects'].astype(int)
repos_df['wiki_enabled'] = repos_df['has_wiki'].astype(int)

# Calculate the correlation
correlation = repos_df['projects_enabled'].corr(repos_df['wiki_enabled'])

# Print the correlation rounded to three decimal places
print(f"Correlation between projects and wiki enabled: {correlation:.3f}")


Correlation between projects and wiki enabled: 0.220


12. Do hireable users follow more people than those who are not hireable?
Average of following per user for hireable=true minus the average following for the rest (to 3 decimal places, e.g. 12.345 or -12.345)

In [46]:
import pandas as pd

# Load the users data
users_df = pd.read_csv('users.csv')

# Calculate the average following for hireable users
hireable_avg = users_df[users_df['hireable'] == True]['following'].mean()

# Calculate the average following for non-hireable users
non_hireable_avg = users_df[users_df['hireable'] == False]['following'].mean()

# Calculate the difference
difference = hireable_avg - non_hireable_avg

# Print the result rounded to three decimal places
print(f"Average following difference: {difference:.3f}")


Average following difference: nan


13. Some developers write long bios. Does that help them get more followers? What's the impact of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)
Regression slope of followers on bio word count (to 3 decimal places, e.g. 12.345 or -12.345)

In [43]:
import pandas as pd
import statsmodels.api as sm

# Load the user data
users_df = pd.read_csv('users.csv')

# Filter out users without bios
users_with_bios = users_df[users_df['bio'].notna()]

# Calculate the length of each bio in words
users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split().str.len()

# Define the dependent variable (followers) and independent variable (bio word count)
X = users_with_bios['bio_word_count']
y = users_with_bios['followers']

# Add a constant to the independent variable for the intercept
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the slope (coefficient for bio_word_count)
slope = model.params['bio_word_count']

# Print the regression slope rounded to three decimal places
print(f"The regression slope of followers on bio word count is: {slope:.3f}")


The regression slope of followers on bio word count is: -10.884


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split().str.len()


14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated
Users login

In [35]:
import pandas as pd

# Load the repositories data
repos_df = pd.read_csv('repositories.csv')

# Convert 'created_at' to datetime
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter for weekend days (Saturday and Sunday)
repos_df['day_of_week'] = repos_df['created_at'].dt.dayofweek
weekend_repos = repos_df[repos_df['day_of_week'].isin([5, 6])]  # 5 = Saturday, 6 = Sunday

# Count repositories created by each user
top_weekend_users = weekend_repos['login'].value_counts().head(5)

# Get the logins of the top users in order
top_users_logins = ', '.join(top_weekend_users.index)

print(f"Top 5 users who created the most repositories on weekends: {top_users_logins}")


Top 5 users who created the most repositories on weekends: johndpope, mvandermeulen, timgates42, mikeyhodl, pinkforest


15. Do people who are hireable share their email addresses more often?
[fraction of users with email when hireable=true] minus [fraction of users with email for the rest] (to 3 decimal places, e.g. 0.123 or -0.123)

In [41]:
import pandas as pd

# Load the users data
users_df = pd.read_csv('users.csv')

# Calculate the fraction of hireable users with an email
hireable_with_email = users_df[users_df['hireable'] == True]['email'].notna().mean()

# Calculate the fraction of non-hireable users with an email
non_hireable_with_email = users_df[users_df['hireable'] != True]['email'].notna().mean()

# Calculate the difference
email_difference = hireable_with_email - non_hireable_with_email

# Print the result rounded to three decimal places
print(f"Fraction of users with email difference: {email_difference:.3f}")


Fraction of users with email difference: 0.044


16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)
Most common surname(s)

In [37]:
import pandas as pd
from collections import Counter

# Load the users data
users_df = pd.read_csv('users.csv')

# Extract surnames from the 'name' column
surnames = []
for name in users_df['name'].dropna():  # Ignore missing names
    surname = name.strip().split()[-1]  # Take the last word as surname
    surnames.append(surname)

# Count occurrences of each surname
surname_counts = Counter(surnames)

# Find the maximum count
max_count = max(surname_counts.values())

# Get the most common surnames (in case of a tie)
most_common_surnames = [surname for surname, count in surname_counts.items() if count == max_count]

# Sort them alphabetically
most_common_surnames.sort()

# Convert to a comma-separated string
result = ', '.join(most_common_surnames)

print(f"Most common surname(s): {result}")


Most common surname(s): Wu, Zhang
