### Importing Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats

### Loading Data

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# Fetch dataset id=2 from the UCI ML repository (presumably the Adult
# dataset, going by the variable name).
adult = fetch_ucirepo(id=2)

# Features and targets come back as separate pandas DataFrames; join them
# column-wise into one frame for exploration.
X, y = adult.data.features, adult.data.targets
data = pd.concat([X, y], axis="columns")

In [None]:
adult.variables

### EDA

In [None]:
data.info()

#### Exploring income encoding

In [None]:
# The classes are imbalanced, and the income column still needs preprocessing.
# Consider scoring with F1 instead of accuracy.
# Tally the rows for each raw income label through a helper "count" column.
(
    y.assign(count=lambda frame: frame["income"])
    .groupby(by="income")
    .count()
)

#### Exploring missing and unknown values

In [None]:
# All people with a missing workclass also have a missing occupation
data[data["workclass"].isna()]["occupation"].unique()

In [None]:
# People with a missing occupation either have an unknown workclass or never worked
data[data["occupation"].isna()]["workclass"].unique()

In [None]:
# What is the ? value
data["workclass"].unique()

In [None]:
# People with an unknown workclass, occupation, or native-country are marked with ?
data[data["workclass"] == "?"]

In [None]:
# Which columns have ? values
# Compare through pandas instead of `"?" in np.array(...)`: the NumPy
# membership test compares a string against the numeric columns' arrays, which
# (depending on the NumPy version) can emit an "elementwise comparison failed"
# warning. A pandas `==` against a numeric column simply yields all False.
for col in data.columns:
    if (data[col] == "?").any():
        print(col)

#### Exploring how many rows impacted if drop missing / ? values

In [None]:
data[data["workclass"] == "?"]["occupation"].unique()

In [None]:
data[data["workclass"] == "?"]["native-country"].unique()

In [None]:
data[data["occupation"] == "?"]["workclass"].unique()

In [None]:
data[data["occupation"] == "?"]["native-country"].unique()

In [None]:
data[data["native-country"] == "?"]["workclass"].unique()

In [None]:
data[data["native-country"] == "?"]["occupation"].unique()

In [None]:
# Rows that remain after dropping every row with a NaN or a "?" value.
# A row is kept only if it is free of "?" in ALL three columns, so the
# conditions are combined with & (with | a row would be removed only when all
# three columns are "?" at the same time).
# Original observation: dropping doesn't impact much -- compare the row count
# shown here with data.info() to confirm.
no_unknown = (
    (data["workclass"] != "?")
    & (data["occupation"] != "?")
    & (data["native-country"] != "?")
)
data[no_unknown].dropna()

#### Exploring possibly correlated features

In [None]:
# education-num can be used to encode education
data.groupby(["education", "education-num"])[["age"]].count()

#### Exploring the number of classes in each categorical variable and their distributions

In [None]:
# Number of classes in each categorical variable
# Everything except the numeric columns and the income target is treated as
# categorical.
categorical = data.drop(
    columns=[
        "age",
        "fnlwgt",
        "education-num",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "income",
    ]
)
# Map each remaining column to the length of its unique() array.
categorical_count = {
    name: len(categorical[name].unique()) for name in categorical.columns
}

categorical_count

In [None]:
# Function to get count of classes in each feature
def get_distr(col):
    """Return the number of rows in each class of ``col``, smallest first.

    Groups the module-level ``data`` frame by ``col`` and counts the non-null
    ``age`` entries in each group. The result is a one-column DataFrame (the
    column is named ``age``) indexed by the classes of ``col`` and sorted in
    ascending order of that count.
    """
    per_class = data.groupby(col)[["age"]].count()
    return per_class.sort_values(by="age")

In [None]:
get_distr("workclass")

In [None]:
get_distr("education")

In [None]:
get_distr("marital-status")

In [None]:
get_distr("occupation")

In [None]:
get_distr("relationship")

In [None]:
get_distr("race")

In [None]:
get_distr("sex")

In [None]:
get_distr("native-country")

#### Exploring capital gain and capital loss

In [None]:
data["capital-gain"].describe()

In [None]:
data["capital-loss"].describe()

### Preprocessing

1. Either replace NaN with ? so that both represent a single unknown category, or drop all rows containing missing or ? values
2. Preprocess income to be a binary value
3. Drop education column since education-num encodes that

In [None]:
# Encode income as binary: 1 if the label is ">50K", 0 if it is "<=50K".
# The previous version tested for a leading "<", which gave the opposite
# encoding (1 for the <=50K class). The first character alone identifies the
# class, so no splitting on "." is needed before the check.
# NOTE: run this cell once -- afterwards income is numeric and the .str
# accessor no longer applies.
data = data.assign(income = data["income"].str.startswith(">").astype(int))