**Generate Git Hub Token**

In [29]:
# Replace YOUR_GITHUB_TOKEN_HERE with the actual token you generated
%env GITHUB_TOKEN=


env: GITHUB_TOKEN=


In [30]:
import os

# Fetch the GitHub token from environment variables
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')

# Verify if the token is loaded correctly (optional)
if not GITHUB_TOKEN:
    raise ValueError("GitHub token not found. Please set the GITHUB_TOKEN environment variable.")
else:
    print("GitHub token is loaded.")


ValueError: GitHub token not found. Please set the GITHUB_TOKEN environment variable.

In [None]:
import requests
import pandas as pd
import os
import time

# Fetch the GitHub token from environment variables
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')

if not GITHUB_TOKEN:
    raise ValueError("GitHub token not found. Set the GITHUB_TOKEN environment variable.")

HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'}
USER_SEARCH_URL = 'https://api.github.com/search/users'
REPO_URL_TEMPLATE = 'https://api.github.com/users/{}/repos'

# Function to fetch all users in Dublin with over 50 followers, handling pagination
def get_users_in_dublin(min_followers=50):
    """
    Fetch GitHub users in Dublin with more than the specified number of followers,
    handling pagination to retrieve up to 500 users.
    """
    users = []
    params = {
        'q': f'location:Dublin followers:>{min_followers}',
        'per_page': 100,
        'page': 1
    }

    while True:
        response = requests.get(USER_SEARCH_URL, headers=HEADERS, params=params)

        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.json().get('message', 'Unknown error')}")
            break

        user_data = response.json()
        items = user_data.get('items', [])

        if not items:
            break

        for user in items:
            user_detail = requests.get(user['url'], headers=HEADERS).json()
            users.append({
                'login': user_detail.get('login', ''),
                'name': user_detail.get('name', ''),
                'company': user_detail.get('company', '').strip('@ ').upper() if user_detail.get('company') else '',
                'location': user_detail.get('location', ''),
                'email': user_detail.get('email', ''),
                'hireable': str(user_detail.get('hireable', False)).lower(),
                'bio': user_detail.get('bio', ''),
                'public_repos': user_detail.get('public_repos', 0),
                'followers': user_detail.get('followers', 0),
                'following': user_detail.get('following', 0),
                'created_at': user_detail.get('created_at', '')
            })

        params['page'] += 1
        print(f"Page {params['page'] - 1} fetched, total users so far: {len(users)}")
        time.sleep(1)

    return users

# Function to fetch all repositories for each user, with no limit
def get_repositories(user_logins):
    """
    Fetch all repositories for each user, handling pagination to retrieve every available repository.
    """
    repos = []
    for login in user_logins:
        page = 1
        while True:
            repo_url = f"{REPO_URL_TEMPLATE.format(login)}?per_page=100&page={page}"
            response = requests.get(repo_url, headers=HEADERS)

            if response.status_code != 200:
                print(f"Error fetching repositories for {login}: {response.status_code}")
                break

            user_repos = response.json()

            if not user_repos:
                break

            for repo in user_repos:
                license_info = repo.get('license')
                license_name = license_info.get('key', '') if license_info else ''

                repos.append({
                    'login': login,
                    'full_name': repo.get('full_name', ''),
                    'created_at': repo.get('created_at', ''),
                    'stargazers_count': repo.get('stargazers_count', 0),
                    'watchers_count': repo.get('watchers_count', 0),
                    'language': repo.get('language', ''),
                    'has_projects': str(repo.get('has_projects', False)).lower(),
                    'has_wiki': str(repo.get('has_wiki', False)).lower(),
                    'license_name': license_name
                })

            page += 1
            print(f"Page {page} of repositories for {login} fetched.")
            time.sleep(1)

    return repos

# Main function to execute both user and repository data fetching and saving to CSV
def main():
    # Step 1: Fetch all users in Dublin with over 50 followers
    print("Fetching users in Dublin with over 50 followers...")
    users = get_users_in_dublin()
    user_logins = [user['login'] for user in users]

    # Step 2: Fetch repositories for each user
    print("Fetching repositories for each user...")
    repos = get_repositories(user_logins)

    # Step 3: Save user data to users.csv
    users_df = pd.DataFrame(users)
    users_df.to_csv('users.csv', index=False)
    print("User data saved to users.csv")

    # Step 4: Save repository data to repositories.csv
    repos_df = pd.DataFrame(repos)
    repos_df.to_csv('repositories.csv', index=False)
    print("Repository data saved to repositories.csv")

if __name__ == '__main__':
    main()


Fetching users in Dublin with over 50 followers...
Page 1 fetched, total users so far: 100
Page 2 fetched, total users so far: 200
Page 3 fetched, total users so far: 300
Page 4 fetched, total users so far: 400
Page 5 fetched, total users so far: 477
Fetching repositories for each user...
Page 2 of repositories for orta fetched.
Page 3 of repositories for orta fetched.
Page 4 of repositories for orta fetched.
Page 5 of repositories for orta fetched.
Page 6 of repositories for orta fetched.
Page 7 of repositories for orta fetched.
Page 8 of repositories for orta fetched.
Page 9 of repositories for orta fetched.
Page 10 of repositories for orta fetched.
Page 11 of repositories for orta fetched.
Page 2 of repositories for jeromeetienne fetched.
Page 3 of repositories for jeromeetienne fetched.
Page 4 of repositories for jeromeetienne fetched.
Page 5 of repositories for jeromeetienne fetched.
Page 2 of repositories for jonataslaw fetched.
Page 3 of repositories for jonataslaw fetched.
Page

In [None]:
import pandas as pd

def preview_data(users_file, repos_file):
    # Load the CSV files into DataFrames
    users_df = pd.read_csv(users_file)
    repos_df = pd.read_csv(repos_file)

    # Display information about users.csv
    print("Preview of users.csv:")
    print(users_df.head())  # Show first 5 rows
    print("\nData types in users.csv:")
    print(users_df.dtypes)  # Show column data types
    print("\nNumber of rows and columns in users.csv:", users_df.shape)
    print("=" * 40)  # Separator for readability

    # Display information about repositories.csv
    print("Preview of repositories.csv:")
    print(repos_df.head())  # Show first 5 rows
    print("\nData types in repositories.csv:")
    print(repos_df.dtypes)  # Show column data types
    print("\nNumber of rows and columns in repositories.csv:", repos_df.shape)
    print("=" * 40)

    # Check for missing values in each file
    print("Missing values in users.csv:")
    print(users_df.isnull().sum())
    print("\nMissing values in repositories.csv:")
    print(repos_df.isnull().sum())

if __name__ == '__main__':
    users_file = 'users.csv'  # Path to your users.csv file
    repos_file = 'repositories.csv'  # Path to your repositories.csv file
    preview_data(users_file, repos_file)


Preview of users.csv:
                  login                    name  \
0                  orta             Orta Therox   
1         jeromeetienne          Jerome Etienne   
2            jonataslaw            Jonny Borges   
3  steventroughtonsmith  Steven Troughton-Smith   
4                  axic        Alex Beregszaszi   

                                   company  \
0                                      NaN   
1                   MAKING WEBAR A REALITY   
2                                     IRIS   
3                    HIGH CAFFEINE CONTENT   
4  ETHEREUM @IPSILON @SPEARBIT @ETHEREUMJS   

                                       location                     email  \
0  Huddersfield / NYC / Dublin / Rio de Janeiro               git@orta.io   
1                               Dublin, Ireland  jerome.etienne@gmail.com   
2                               Dublin, Ireland                       NaN   
3                               Dublin, Ireland                       NaN   
4        

Q1: Who are the top 5 users in Dublin with the highest number of followers? List their login in order, comma-separated.
Users

In [None]:
import pandas as pd

# Load the users data from the CSV file
users_df = pd.read_csv('users.csv')

# Filter users based in Dublin
dublin_users = users_df[users_df['location'].str.contains('Dublin', na=False)]

# Sort by number of followers in descending order and select the top 5
top_dublin_users = dublin_users.sort_values(by='followers', ascending=False).head(5)

# Get the login names of the top 5 users
top_logins = top_dublin_users['login'].tolist()

# Join the logins into a comma-separated string
result = ','.join(top_logins)

# Output the result
print("Top 5 users in Dublin with the highest number of followers:")
print(result)


Top 5 users in Dublin with the highest number of followers:
orta,jeromeetienne,jonataslaw,steventroughtonsmith,axic


Q2: Who are the 5 earliest registered GitHub users in Dublin? List their login in ascending order of created_at, comma-separated.
Users

In [None]:
import csv
from datetime import datetime

# Define the list to store users from Dublin
users_in_dublin = []

# Read the CSV file with UTF-8 encoding
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        location = row['location'].strip().lower()
        # Check if the user is from Dublin
        if 'dublin' in location:
            users_in_dublin.append({
                'login': row['login'],
                'created_at': datetime.strptime(row['created_at'], '%Y-%m-%dT%H:%M:%SZ')
            })

# Sort users based on created_at in ascending order
sorted_users = sorted(users_in_dublin, key=lambda x: x['created_at'])

# Extract the top 5 earliest user logins
top_5_earliest_logins = [user['login'] for user in sorted_users[:5]]

# Print the result as a comma-separated list
print(','.join(top_5_earliest_logins))


paulca,adrian,GavinJoyce,amir,ciaranlee


Q3: What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.
Licenses

In [None]:
import csv
from collections import Counter

# List to store license names
licenses = []

# Open the CSV file with UTF-8 encoding
with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # Get license_name, ignoring any missing or empty entries
        license_name = row.get('license_name', '').strip()
        if license_name:
            licenses.append(license_name)

# Count the occurrence of each license
license_counts = Counter(licenses)

# Find the 3 most common licenses
top_3_licenses = [license for license, _ in license_counts.most_common(3)]

# Print the result as a comma-separated list
print(','.join(top_3_licenses))


mit,apache-2.0,other


Q4: Which company do the majority of these developers work at?
Company (cleaned up as explained above)

In [9]:
import csv
from collections import Counter

# List to store company names
companies = []

# Open the CSV file with UTF-8 encoding
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # Get and clean up the company field (ignore empty values)
        company = row.get('company', '').strip()
        if company:
            companies.append(company)

# Count occurrences of each company
company_counts = Counter(companies)

# Identify the company with the highest count (most employees)
most_common_company = company_counts.most_common(1)

# Print the result
if most_common_company:
    print(most_common_company[0][0])  # Print only the company name
else:
    print("No company data found.")


MICROSOFT


Q5: Which programming language is most popular among these users?
Language

In [10]:
import csv
from collections import Counter

# List to store programming languages
languages = []

# Open the CSV file with UTF-8 encoding
with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # Get and clean up the language field (ignore empty values)
        language = row.get('language', '').strip()
        if language:
            languages.append(language)

# Count occurrences of each language
language_counts = Counter(languages)

# Find the most common language
most_common_language = language_counts.most_common(1)

# Print the result
if most_common_language:
    print(most_common_language[0][0])  # Print only the language name
else:
    print("No language data found.")


JavaScript


Q6: Which programming language is the second most popular among users who joined after 2020?
Language

In [12]:
import csv
from collections import Counter
from datetime import datetime

# List to store programming languages of users who joined after 2020
languages = []

# Open the CSV file with UTF-8 encoding
with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    # Iterate through each row in the CSV
    for row in reader:
        # Parse the created_at field and check the join date
        created_at = row.get('created_at', '').strip()
        if created_at:
            user_join_date = datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%SZ")

            # Check if the user joined after 2020
            if user_join_date.year > 2020:
                # Get and clean the language field
                language = row.get('language', '').strip()
                if language:
                    languages.append(language)

# Count occurrences of each language
language_counts = Counter(languages)

# Find the two most common languages
most_common_languages = language_counts.most_common(2)

# Print the second most common language
if len(most_common_languages) >= 2:
    print(most_common_languages[1][0])  # Second most common language
else:
    print("Not enough language data found.")


Python


Q7: Which language has the highest average number of stars per repository?
Language

In [13]:
import csv
from collections import defaultdict

# Dictionary to store total stars and repository count per language
language_stats = defaultdict(lambda: {'stars': 0, 'repos': 0})

# Read the CSV file with UTF-8 encoding
with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    for row in reader:
        # Get the language and stargazers_count fields
        language = row.get('language', '').strip()
        stars = row.get('stargazers_count', '0').strip()

        # Only process if language and stars are available and stars are numeric
        if language and stars.isdigit():
            language_stats[language]['stars'] += int(stars)
            language_stats[language]['repos'] += 1

# Calculate the average stars per repository for each language
average_stars_per_language = {
    language: stats['stars'] / stats['repos']
    for language, stats in language_stats.items()
    if stats['repos'] > 0
}

# Find the language with the highest average stars
if average_stars_per_language:
    most_popular_language = max(average_stars_per_language, key=average_stars_per_language.get)
    print(most_popular_language)
else:
    print("No language data found.")


MDX


Q8: Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.
User login

In [14]:
import csv

# List to store users and their leader strength
leader_strengths = []

# Read the CSV file with UTF-8 encoding
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    for row in reader:
        # Get followers and following counts, converting them to integers
        followers = int(row.get('followers', '0').strip())
        following = int(row.get('following', '0').strip())

        # Calculate leader strength
        leader_strength = followers / (1 + following)

        # Store the user's login and their leader strength
        leader_strengths.append((row.get('login', '').strip(), leader_strength))

# Sort users by leader strength in descending order
leader_strengths.sort(key=lambda x: x[1], reverse=True)

# Get the top 5 users
top_5_leaders = [login for login, strength in leader_strengths[:5]]

# Print the result as a comma-separated list
print(','.join(top_5_leaders))


flaviohenriquealmeida,zalando,AnikSarker,wix,CardinalHealth


Q9: What is the correlation between the number of followers and the number of public repositories among users in Dublin?
Correlation between followers and repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [15]:
import csv
import numpy as np

# Lists to store the followers and public repos of users from Dublin
followers = []
public_repos = []

# Open the users.csv file and read data
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    for row in reader:
        # Filter for users in Dublin
        location = row.get('location', '').strip().lower()
        if "dublin" in location:
            # Get followers and public repositories values
            try:
                followers_count = int(row['followers'])
                public_repos_count = int(row['public_repos'])

                # Append the valid values to the lists
                followers.append(followers_count)
                public_repos.append(public_repos_count)
            except ValueError:
                # Skip rows with invalid numerical values
                continue

# Ensure there is data to compute correlation
if len(followers) > 1 and len(public_repos) > 1:
    # Compute Pearson correlation coefficient
    correlation_matrix = np.corrcoef(followers, public_repos)
    correlation = correlation_matrix[0, 1]
    # Output correlation rounded to 3 decimal places
    print(f"{correlation:.3f}")
else:
    print("Insufficient data for correlation calculation.")


0.556


Q10: Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.
Regression slope of followers on repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [16]:
import csv
import numpy as np

# Lists to store the followers and public repos of users
followers = []
public_repos = []

# Open the users.csv file and read data
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    for row in reader:
        # Get followers and public repositories values
        try:
            followers_count = int(row['followers'])
            public_repos_count = int(row['public_repos'])

            # Append the valid values to the lists
            followers.append(followers_count)
            public_repos.append(public_repos_count)
        except ValueError:
            # Skip rows with invalid numerical values
            continue

# Ensure there is data for regression
if len(followers) > 1 and len(public_repos) > 1:
    # Perform linear regression: followers ~ public_repos
    slope, intercept = np.polyfit(public_repos, followers, 1)

    # Output the slope rounded to 3 decimal places
    print(f"{slope:.3f}")
else:
    print("Insufficient data for regression.")


2.834


Q11: Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?
Correlation between projects and wiki enabled (to 3 decimal places, e.g. 0.123 or -0.123)

In [17]:
import pandas as pd
import numpy as np

def analyze_repo_features(csv_file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)

    # Convert 'has_projects' and 'has_wiki' columns to boolean if they are strings
    if df['has_projects'].dtype == 'object':
        df['has_projects'] = df['has_projects'].map({'true': True, 'false': False})
    if df['has_wiki'].dtype == 'object':
        df['has_wiki'] = df['has_wiki'].map({'true': True, 'false': False})

    # Calculate the correlation between projects and wiki features
    correlation = df['has_projects'].corr(df['has_wiki'])

    # Calculate additional statistics
    stats = {
        'total_repos': len(df),
        'projects_enabled': df['has_projects'].sum(),
        'wiki_enabled': df['has_wiki'].sum(),
        'both_enabled': ((df['has_projects']) & (df['has_wiki'])).sum(),
        'neither_enabled': ((~df['has_projects']) & (~df['has_wiki'])).sum()
    }

    return round(correlation, 3), stats

# Run the analysis function
correlation, stats = analyze_repo_features('repositories.csv')
print(f"Correlation coefficient: {correlation}")
print("\nAdditional Statistics:")
for key, value in stats.items():
    print(f"{key}: {value}")


Correlation coefficient: 0.315

Additional Statistics:
total_repos: 29251
projects_enabled: 28697
wiki_enabled: 25045
both_enabled: 25011
neither_enabled: 520


Q12: Do hireable users follow more people than those who are not hireable?
Average of following per user for hireable=true minus the average following for the rest (to 3 decimal places, e.g. 12.345 or -12.345)

In [18]:
import pandas as pd

def analyze_following_difference(users_csv_path='users.csv'):
    # Read the data
    df = pd.read_csv(users_csv_path)

    # Calculate average following for hireable users
    hireable_following = df[df['hireable'] == True]['following'].mean()

    # Calculate average following for non-hireable users
    non_hireable_following = df[df['hireable'] != True]['following'].mean()

    # Calculate the difference rounded to 3 decimal places
    difference = round(hireable_following - non_hireable_following, 3)

    # Print debug information
    print(f"Number of hireable users: {len(df[df['hireable'] == True])}")
    print(f"Number of non-hireable users: {len(df[df['hireable'] != True])}")
    print(f"Average following for hireable users: {hireable_following:.3f}")
    print(f"Average following for non-hireable users: {non_hireable_following:.3f}")

    return difference

# Calculate the difference
result = analyze_following_difference()
print(f"\nDifference in average following: {result:.3f}")


Number of hireable users: 0
Number of non-hireable users: 477
Average following for hireable users: nan
Average following for non-hireable users: 83.631

Difference in average following: nan


Q13: Some developers write long bios. Does that help them get more followers? What's the impact of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)
Regression slope of followers on bio word count (to 3 decimal places, e.g. 12.345 or -12.345)

In [19]:
import pandas as pd
import statsmodels.api as sm

def analyze_bio_impact_on_followers(csv_file='users.csv'):
    # Load the CSV into a DataFrame
    df = pd.read_csv(csv_file)

    # Filter out users without bios
    df = df[df['bio'].notnull()]

    # Calculate the length of each bio in words (splitting by whitespace)
    df['bio_word_count'] = df['bio'].str.split().str.len()

    # Prepare the independent variable (X) and dependent variable (y)
    X = df['bio_word_count']
    y = df['followers']  # Adjust the column name as per your dataset if needed

    # Add a constant to the independent variable (for the intercept)
    X = sm.add_constant(X)

    # Fit the regression model
    model = sm.OLS(y, X).fit()

    # Get the slope (coefficient of the bio_word_count)
    slope = model.params['bio_word_count']

    # Print the regression slope rounded to three decimal places
    print(f"\nRegression slope of followers on bio word count: {slope:.3f}")

# Run the analysis
analyze_bio_impact_on_followers()



Regression slope of followers on bio word count: 7.550


Q14: Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated
Users login

In [20]:
import csv
from collections import Counter
from datetime import datetime

# Counter to store the number of repositories created by each user on weekends
weekend_repo_counts = Counter()

# Open the repositories.csv file and read data
with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    for row in reader:
        created_at = row.get('created_at', '')
        user_login = row.get('login', '')  # Get the user's login

        if created_at and user_login:  # Ensure both fields are present
            # Convert created_at string to a datetime object (in UTC)
            created_date = datetime.fromisoformat(created_at[:-1])  # Remove 'Z' and convert

            # Check if the day is Saturday (5) or Sunday (6)
            if created_date.weekday() in [5, 6]:
                weekend_repo_counts[user_login] += 1  # Increment the count for the user

# Get the top 5 users who created the most repositories on weekends
top_users = weekend_repo_counts.most_common(5)

# Extract the logins of the top users
top_logins = [user[0] for user in top_users]

# Output the top users' logins as a comma-separated string
print(','.join(top_logins))


orta,joshuacassidy,No9,wafuwafu13,lmammino


Q15: Do people who are hireable share their email addresses more often?
[fraction of users with email when hireable=true] minus [fraction of users with email for the rest] (to 3 decimal places, e.g. 0.123 or -0.123)

In [21]:
import pandas as pd

def analyze_email_sharing(users_csv_path='users.csv'):
    # Read the complete CSV file
    df = pd.read_csv(users_csv_path)

    # Convert email column to boolean (True if email exists, False if NaN or empty)
    df['has_email'] = df['email'].notna() & (df['email'] != '')

    # Calculate the fraction of hireable users who share their email
    hireable_mask = df['hireable'] == True
    if hireable_mask.any():
        hireable_email_fraction = df[hireable_mask]['has_email'].mean()
    else:
        hireable_email_fraction = 0

    # Calculate the fraction of non-hireable users who share their email
    non_hireable_mask = df['hireable'] != True
    if non_hireable_mask.any():
        non_hireable_email_fraction = df[non_hireable_mask]['has_email'].mean()
    else:
        non_hireable_email_fraction = 0

    # Calculate difference and round to 3 decimal places
    difference = round(hireable_email_fraction - non_hireable_email_fraction, 3)

    # Print debug information
    print(f"Total users: {len(df)}")
    print(f"Hireable users with email: {df[hireable_mask]['has_email'].sum()}/{hireable_mask.sum()}")
    print(f"Non-hireable users with email: {df[non_hireable_mask]['has_email'].sum()}/{non_hireable_mask.sum()}")
    print(f"Hireable fraction: {hireable_email_fraction:.3f}")
    print(f"Non-hireable fraction: {non_hireable_email_fraction:.3f}")

    return difference

# Read and analyze the complete dataset
result = analyze_email_sharing()
print(f"\nDifference in email sharing fraction: {result:.3f}")


Total users: 477
Hireable users with email: 0/0
Non-hireable users with email: 236/477
Hireable fraction: 0.000
Non-hireable fraction: 0.495

Difference in email sharing fraction: -0.495


Q16: Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)
Most common surname(s)

In [23]:
import csv
from collections import Counter

# Counter to store surname frequencies
surname_counter = Counter()

# Open the users.csv file and read data
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    for row in reader:
        name = row.get('name', '').strip()
        if name:  # Ignore missing names
            # Split the name by whitespace and get the last word as the surname
            surname = name.split()[-1]
            surname_counter[surname] += 1

# Find the maximum frequency of surnames
if surname_counter:
    max_count = max(surname_counter.values())
    # Get all surnames with the maximum frequency
    most_common_surnames = [surname for surname, count in surname_counter.items() if count == max_count]
    # Sort surnames alphabetically
    most_common_surnames.sort()
    # Output the result
    print(f"{','.join(most_common_surnames)}: {max_count}")
else:
    print("No names found.")


Chen,Kenny,O'Sullivan,Quinn: 3
