In [1]:
import sys, pandas as pd, numpy as np
# Record the interpreter and library versions this notebook was executed with,
# so the outputs below can be tied to a concrete environment.
print(f"{sys.version}")
print(f"pandas: {pd.__version__}")
print(f"numpy: {np.__version__}")


3.13.7 (tags/v3.13.7:bcee1c3, Aug 14 2025, 14:15:11) [MSC v.1944 64 bit (AMD64)]
pandas: 2.3.2
numpy: 2.3.3


In [3]:

# Imports
import pandas as pd
import numpy as np
# Show up to 100 columns when a frame is displayed.
pd.set_option("display.max_columns", 100)

# Load dataset (train + test).
# Column names are passed explicitly via `names=`, so pandas does not take
# them from the first row of either file.
cols = [
    'age','workclass','fnlwgt','education','education-num','marital-status',
    'occupation','relationship','race','sex','capital-gain','capital-loss',
    'hours-per-week','native-country','income'
]

# "?" is parsed as a missing value. skipinitialspace drops the blank after each
# comma so that "?" and the category labels are matched without leading spaces.
train = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    names=cols, na_values=["?"], skipinitialspace=True
)
# skiprows=1 discards the first line of adult.test
# (presumably a non-data banner line -- confirm against the raw file).
test = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
    names=cols, na_values=["?"], skiprows=1, skipinitialspace=True
)

# Remove "." from the test labels so they compare equal to the train labels
# (the test file apparently writes e.g. ">50K." -- confirm against the raw file).
# The .str accessor is applied to the column as loaded: missing labels stay NaN
# and remain visible to isna(), rather than being cast to the string "nan".
test["income"] = test["income"].str.replace(".", "", regex=False).str.strip()

# Stack both splits into one frame with a fresh 0..n-1 index.
data = pd.concat([train, test], ignore_index=True)

# Q1: number of columns in the combined frame.
num_columns = data.shape[1]

# Q2: count of missing values per column, largest first.
missing_per_col = data.isna().sum().sort_values(ascending=False)

# Q3: distinct values of `race`.
# The list and the count come from the same NaN-free array, so the number
# reported can never disagree with the values shown next to it.
race_values = data["race"].dropna().unique()
unique_race = len(race_values)

# Q4: median of weekly working hours.
median_hours = data["hours-per-week"].median()

# Q5: sex distribution among rows whose income label is exactly ">50K".
# dropna=False keeps a NaN bucket in the counts if `sex` is ever missing.
gt50 = data[data["income"] == ">50K"]
sex_counts_gt50 = gt50["sex"].value_counts(dropna=False)

# Q6: impute each column with its most frequent value.
# Collect the first mode of every column; a column with no non-missing value
# has no mode, gets no entry here, and is therefore left unchanged.
mode_fill = {}
for name, series in data.items():
    modes = series.mode(dropna=True)
    if len(modes) > 0:
        mode_fill[name] = modes.iloc[0]

# fillna with a dict fills column by column and returns a new frame,
# so `data` itself is not modified.
data_filled = data.fillna(value=mode_fill)
remaining_missing = data_filled.isna().sum().sum()

# Report the answers to Q1-Q6 in order.
print(f"1) Number of columns: {num_columns}")

print("\n2) Missing values per column:")
# Only columns that actually contain missing values are listed.
print(missing_per_col.loc[missing_per_col.gt(0)])

print(f"\n3) Number of unique values in race: {unique_race}")
print(f"   Values: {race_values}")

print(f"\n4) Median hours-per-week: {median_hours}")

print("\n5) Gender distribution for income >50K:")
print(sex_counts_gt50)
# idxmax() returns the index label (the sex) with the highest count.
print(f"   Answer: More {sex_counts_gt50.idxmax()}")

print(f"\n6) After filling with mode, total missing values: {remaining_missing}")
print("   Other options: mean/median for numbers, or 'Unknown' for categories.")



1) Number of columns: 15

2) Missing values per column:
occupation        2809
workclass         2799
native-country     857
dtype: int64

3) Number of unique values in race: 5
   Values: ['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']

4) Median hours-per-week: 40.0

5) Gender distribution for income >50K:
sex
Male      9918
Female    1769
Name: count, dtype: int64
   Answer: More Male

6) After filling with mode, total missing values: 0
   Other options: mean/median for numbers, or 'Unknown' for categories.
