In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Apply seaborn's default theme with fonts scaled to 1.5x, so that titles and
# axis labels in the figures drawn later in the notebook are easier to read.
sns.set(font_scale=1.5)

In [None]:
# Column names assigned to the raw CSV files when they are loaded; the order
# here must match the order of the fields in those files.
col_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "target"
]

# Region membership lists used by country_to_categorical(). Entries are
# compared against the stripped, lower-cased "native-country" values, so they
# must stay lower-case. Spellings such as "trinadad&tobago", "columbia",
# "holand-netherlands" and "hong" presumably mirror the raw data and should
# not be "corrected" here -- TODO confirm against the data files.
# NOTE(review): any country absent from every list is reported as "unknown".
# The UCI Adult documentation also appears to list "South" and
# "Outlying-US(Guam-USVI-etc)"; confirm that leaving them unmapped is intended.
NORTH_AMERICA = ["united-states", "mexico", "canada"]
CENTRAL_AMERICA = [
    "honduras",
    "haiti",
    "guatemala",
    "nicaragua",
    "jamaica",
    "cuba",
    "el-salvador",
    "puerto-rico",
    "dominican-republic"
]
SOUTH_AMERICA = ["trinadad&tobago", "ecuador", "peru", "columbia"]

EUROPE = [
    "holand-netherlands",
    "hungary",
    "scotland",
    "yugoslavia",
    "ireland",
    "france",
    "greece",
    "portugal",
    "poland",
    "italy",
    "england",
    "germany"
]

ASIA = [
    "laos",
    "cambodia",
    "hong",
    "thailand",
    "taiwan",
    "japan",
    "china",
    "india",
    "philippines",
    "iran",
    "vietnam"
]

In [None]:
# Read both splits with the shared column names, tag every row with the split
# it came from in a "set" column, and stack them into one frame with a fresh
# 0..n-1 index. Chaining avoids keeping the per-split frames alive afterwards.
df = pd.concat(
    [
        pd.read_csv("../data/adult.data", names=col_names).assign(set="train"),
        pd.read_csv("../data/adult.test", names=col_names).assign(set="test"),
    ]
).reset_index(drop=True)

In [None]:
def age_to_categorical(age):
    """Map an age in years to an ordinal age-bucket code.

    Buckets are delimited by exclusive upper bounds: code 0 is below 12,
    code 1 is below 17, and so on up to code 7, which is below 80. Ages of 80
    and above are treated as upper outliers and share the final code 8.

    Args:
        age: Numeric age.

    Returns:
        int: Bucket code in the range 0-8.
    """
    upper_bounds = (12, 17, 24, 34, 44, 54, 64, 80)

    for bucket, bound in enumerate(upper_bounds):
        if age < bound:
            return bucket

    # Not below any bound: everything from 80 upwards lands in the last bucket.
    return len(upper_bounds)


def sex_to_categorical(sex):
    """Encode a sex label as a binary flag: 1 for male, 0 for anything else.

    The comparison ignores surrounding whitespace and letter case, so a raw
    label such as " Male" is recognised even when the caller has not
    normalised the column first. An exact match against "male" alone would
    silently encode such labels as 0.

    Args:
        sex: The label to encode. Non-string values (e.g. NaN) yield 0.

    Returns:
        int: 1 if the label is "male", otherwise 0.
    """
    if isinstance(sex, str) and sex.strip().lower() == "male":
        return 1
    return 0


def hours_per_week_to_categorical(hours):
    """Map weekly working hours to an ordinal bucket code.

    Code 0 covers anything strictly below 20 hours. Codes 1-4 cover the
    ranges ending (inclusively) at 30, 40, 50 and 60 hours, and code 5
    covers everything above 60.

    Args:
        hours: Numeric number of hours worked per week.

    Returns:
        int: Bucket code in the range 0-5.
    """
    # The first boundary is exclusive, unlike the inclusive ones that follow.
    if hours < 20:
        return 0

    for bucket, inclusive_upper in enumerate((30, 40, 50, 60), start=1):
        if hours <= inclusive_upper:
            return bucket

    return 5


def country_to_categorical(country):
    """Map a country name to the label of the region it belongs to.

    The name is looked up in the module-level region lists, checked in the
    order North America, Central America, South America, Europe, Asia.

    Args:
        country: Country name, expected to be already stripped and
            lower-cased to match the entries of the region lists.

    Returns:
        str: "north-america", "central-america", "south-america", "europe"
        or "asia"; "unknown" when the name is in none of the lists.
    """
    regions = (
        ("north-america", NORTH_AMERICA),
        ("central-america", CENTRAL_AMERICA),
        ("south-america", SOUTH_AMERICA),
        ("europe", EUROPE),
        ("asia", ASIA),
    )

    for label, members in regions:
        if country in members:
            return label

    return "unknown"

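The dataset contains no null (`NaN`) values. Note, however, that this check does not detect missing entries encoded as the string "?", which the `occupation` and `workclass` cells below recode as "unknown".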

In [None]:
# Count the NaN entries in each column and show them as a one-column table.
df.isna().sum().to_frame(name="Number of Null Values")

In [None]:
# Drop the textual "education" column and keep only its numeric counterpart,
# renamed to "education_level".
df = (
    df
    .drop(columns=["education"])
    .rename(columns={"education-num": "education_level"})
)

As the boxplot below shows, the vast majority of the individuals are less than 80 years old. Hence, we'll group everyone aged 80 or older into the same category.

In [None]:
# Boxplot of the raw ages (drawn before the column is bucketed).
# NOTE(review): the title says "Removing Outliers", but no rows are filtered
# out before plotting -- confirm the intended wording.
ages = df["age"].to_numpy()
plt.boxplot(ages)
plt.xticks([1], ["Ages"])
plt.title("Age Distribution - Removing Outliers")
plt.show()

In [None]:
# Replace each raw age with its bucket code from age_to_categorical().
# Not idempotent: running this cell a second time would re-bucket the codes
# themselves (all of which are below 12) and collapse every row to 0.
df["age"] = df.age.map(age_to_categorical)

As we can see, a large share of the data (>80%) concerns white people. Hence, we can already expect the machine learning models trained on it to perform poorly on the remaining races due to underrepresentation.

In [None]:
# Share of each race, sorted ascending so the largest bar is drawn at the top.
race = df["race"].value_counts(normalize=True).sort_values()

plt.barh(race.index, race.values)
plt.xlabel("Percentage")
plt.ylabel("Race")
plt.title("Race Distribution")
plt.show()

Transform race into categorical 1-hot representation

In [None]:
# Normalise the labels, then add one 0/1 indicator column ("race-<label>")
# per distinct value and drop the original text column.
df["race"] = df["race"].str.lower().str.strip()

for label in df["race"].unique():
    df[f"race-{label}"] = (df["race"] == label).astype("int64")

df = df.drop(columns="race")

The percentage of males is twice as large as that of females. We can also expect some kind of imbalance when predicting for that class.

In [None]:
# Pie chart of the absolute row count per sex, annotated with percentages.
sex = df["sex"].value_counts()

plt.pie(sex.to_numpy(), labels=sex.index, autopct="%.2f")
plt.title("Sex Distribution")
plt.show()

In [None]:
# Replace each sex label with the binary flag from sex_to_categorical().
# Not idempotent: running this cell a second time would feed the 0/1 flags
# back into the encoder, which maps every non-"male" value to 0.
df["sex"] = df.sex.map(sex_to_categorical)

Transform marital status into categorical 1-hot representation

In [None]:
# Normalise the labels, then add one 0/1 indicator column per distinct
# marital status and drop the original text column.
# NOTE(review): unlike the other one-hot cells in this notebook, the new
# columns are named after the bare status value, without any prefix.
df["marital-status"] = df["marital-status"].str.lower().str.strip()

for status in df["marital-status"].unique():
    df[status] = (df["marital-status"] == status).astype("int64")

df = df.drop(columns="marital-status")

Disregarding outliers, we see that the bulk of the data lies between [30, 60], with about 50% of the data under 40 work hours a week

In [None]:
# Boxplot of the raw weekly working hours (drawn before they are bucketed).
hours = df.loc[:, "hours-per-week"]

plt.boxplot(hours)
plt.xticks([1], ["Work Hours per Week"])
plt.title("Work Hours per Week Distribution")
plt.show()

In [None]:
# Rename the column first (its position in the frame is unchanged), then
# replace the raw hours with their bucket codes.
df = df.rename(columns={"hours-per-week": "work-hours-per-week"})
df["work-hours-per-week"] = df["work-hours-per-week"].map(
    hours_per_week_to_categorical
)

In [None]:
# Trim and lower-case the labels, then recode the "?" placeholder (an exact,
# whole-value match) as "unknown".
df["occupation"] = (
    df["occupation"]
    .str.strip()
    .str.lower()
    .replace("?", "unknown")
)

In [None]:
# Share of each occupation, sorted ascending so the largest bar is on top.
occupation = df["occupation"].value_counts(normalize=True).sort_values()

plt.barh(occupation.index, occupation.values)
plt.title("Occupation Distribution")
plt.show()

In [None]:
# Add one 0/1 indicator column ("occupation-<label>") per distinct occupation
# and drop the original text column.
for occupation in df["occupation"].unique():
    df[f"occupation-{occupation}"] = (
        df["occupation"] == occupation
    ).astype("int64")

df = df.drop(columns="occupation")

Convert capital gain and capital loss into binary flags (1 if the value is positive, 0 otherwise) in their respective columns.

In [None]:
# Collapse each amount into a binary flag: 1 when strictly positive, else 0.
for amount_col in ("capital-gain", "capital-loss"):
    df[amount_col] = (df[amount_col] > 0).astype("int64")

In [None]:
# Trim and lower-case the country names, then replace each one with the label
# of its region (names in none of the region lists become "unknown").
df["native-country"] = (
    df["native-country"]
    .str.strip()
    .str.lower()
    .map(country_to_categorical)
)

As we can see below, more than 90% of the data regards people from North America

In [None]:
# Fraction of rows per region, shown as a one-column table.
df["native-country"].value_counts(normalize=True).to_frame(name="Percentage")

Transform native country into categorical 1-hot representation

In [None]:
# Add one 0/1 indicator column ("from-<region>") per distinct region and drop
# the original column.
for nationality in df["native-country"].unique():
    df[f"from-{nationality}"] = (
        df["native-country"] == nationality
    ).astype("int64")

df = df.drop(columns="native-country")

In [None]:
# Trim and lower-case the labels, then recode the "?" placeholder (an exact,
# whole-value match) as "unknown".
df["workclass"] = (
    df["workclass"]
    .str.strip()
    .str.lower()
    .replace("?", "unknown")
)

In [None]:
# Fraction of rows per work class, shown as a one-column table.
df.workclass.value_counts(normalize=True).to_frame("Percentage")

In [None]:
# Add one 0/1 indicator column ("workclass-<label>") per distinct work class
# and drop the original text column.
for workclass in df["workclass"].unique():
    df[f"workclass-{workclass}"] = (
        df["workclass"] == workclass
    ).astype("int64")

df = df.drop(columns="workclass")

In [None]:
# Trim surrounding whitespace and lower-case the relationship labels.
df["relationship"] = df["relationship"].str.strip().str.lower()

In [None]:
# Fraction of rows per relationship label, shown as a one-column table.
df.relationship.value_counts(normalize=True).to_frame("Percentage")

In [None]:
# Add one 0/1 indicator column ("relationship-<label>") per distinct value and
# drop the original text column.
for relationship in df["relationship"].unique():
    df[f"relationship-{relationship}"] = (
        df["relationship"] == relationship
    ).astype("int64")

df = df.drop(columns="relationship")

The target consists in whether a person makes more than US$ 50K a year or not

In [None]:
# Encode the income label as 1 for ">50K" and 0 for anything else.
#
# Literal dots are removed first, so that a label written as ">50K." compares
# equal to ">50K". regex=False is essential here: on pandas versions where
# str.replace() treats the pattern as a regular expression by default, "."
# matches every character, which would blank out all labels and silently turn
# every target into 0.
df["target"] = (
    df["target"]
    .str.replace(".", "", regex=False)
    .str.strip()
    .eq(">50K")
    .astype("int64")
)

In [None]:
# Pie chart of the class balance; a truthy (1) target is the ">50K" class.
targets = df["target"].value_counts(normalize=True)

labels = []
for is_high_income in targets.index:
    labels.append("More than 50K" if is_high_income else "Less than 50K")

plt.pie(targets.to_numpy(), labels=labels, autopct="%.2f")
plt.title("Targets Distribution")
plt.show()

In [None]:
# Display the fully transformed frame as this cell's output.
df

In [None]:
# Persist the processed train and test rows together in one CSV, without the
# index; the "set" column records which split each row came from.
df.to_csv("../data/adult_whole_data.csv", index=False)