In [4]:
!pip install ucimlrepo


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [9]:
from ucimlrepo import fetch_ucirepo

adult = fetch_ucirepo(id=2)

# dataframes
df = adult.data.features

# if you want the salary (target) column
df['salary'] = adult.data.targets


In [10]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

def calculate_demographic_data(print_data=True):
    # fetch dataset from UCI repo
    adult = fetch_ucirepo(id=2)

    # get dataframes
    df = adult.data.features
    df['salary'] = adult.data.targets

    # Q1: How many people of each race are represented?
    race_count = df['race'].value_counts()

    # Q2: What is the average age of men?
    average_age_men = round(df[df['sex'] == 'Male']['age'].mean(), 1)

    # Q3: What is the percentage of people with a Bachelor's degree?
    percentage_bachelors = round(
        (df['education'] == 'Bachelors').mean() * 100, 1
    )

    # advanced education filter
    advanced_edu = ['Bachelors', 'Masters', 'Doctorate']
    higher_edu = df[df['education'].isin(advanced_edu)]
    lower_edu = df[~df['education'].isin(advanced_edu)]

    # Q4: % higher ed making >50K
    higher_edu_rich = round(
        (higher_edu['salary'] == '>50K').mean() * 100, 1
    )

    # Q5: % lower ed making >50K
    lower_edu_rich = round(
        (lower_edu['salary'] == '>50K').mean() * 100, 1
    )

    # Q6: What is the minimum hours per week?
    min_work_hours = df['hours-per-week'].min()

    # Q7: % min workers who earn >50K
    min_workers = df[df['hours-per-week'] == min_work_hours]
    rich_min_workers = round(
        (min_workers['salary'] == '>50K').mean() * 100, 1
    )

    # Q8: Which country has the highest % of rich?
    rich_by_country = (
        df[df['salary'] == '>50K']['native-country']
        .value_counts(normalize=True) * 100
    )
    highest_earning_country = rich_by_country.idxmax()
    highest_earning_country_percentage = round(rich_by_country.max(), 1)

    # Q9: What is the most popular occupation for rich people in India?
    top_IN_occupation = (
        df[(df['native-country'] == 'India') & (df['salary'] == '>50K')]
        ['occupation']
        .value_counts()
        .idxmax()
    )

    # build the result dictionary
    result = {
        'race_count': race_count,
        'average_age_men': average_age_men,
        'percentage_bachelors': percentage_bachelors,
        'higher_education_rich': higher_edu_rich,
        'lower_education_rich': lower_edu_rich,
        'min_work_hours': min_work_hours,
        'rich_percentage_min_workers': rich_min_workers,
        'highest_earning_country': highest_earning_country,
        'highest_earning_country_percentage': highest_earning_country_percentage,
        'top_IN_occupation': top_IN_occupation
    }

    # optionally print while testing
    if print_data:
        for k, v in result.items():
            print(f"{k}:\n{v}\n")

    return result


In [12]:
from demographic_data_analyzer import calculate_demographic_data
calculate_demographic_data()


race_count:
race
White                 41762
Black                  4685
Asian-Pac-Islander     1519
Amer-Indian-Eskimo      470
Other                   406
Name: count, dtype: int64

average_age_men:
39.5

percentage_bachelors:
16.4

higher_education_rich:
30.9

lower_education_rich:
11.6

min_work_hours:
1

rich_percentage_min_workers:
7.4

highest_earning_country:
United-States

highest_earning_country_percentage:
91.5

top_IN_occupation:
Prof-specialty



{'race_count': race
 White                 41762
 Black                  4685
 Asian-Pac-Islander     1519
 Amer-Indian-Eskimo      470
 Other                   406
 Name: count, dtype: int64,
 'average_age_men': np.float64(39.5),
 'percentage_bachelors': np.float64(16.4),
 'higher_education_rich': np.float64(30.9),
 'lower_education_rich': np.float64(11.6),
 'min_work_hours': np.int64(1),
 'rich_percentage_min_workers': np.float64(7.4),
 'highest_earning_country': 'United-States',
 'highest_earning_country_percentage': np.float64(91.5),
 'top_IN_occupation': 'Prof-specialty'}