### Importing Packages

In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats

### Loading Data

In [14]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
data = pd.concat([X, y], axis = 1)

In [8]:
adult.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,age,Feature,Integer,Age,,,no
1,workclass,Feature,Categorical,Income,"Private, Self-emp-not-inc, Self-emp-inc, Feder...",,yes
2,fnlwgt,Feature,Integer,,,,no
3,education,Feature,Categorical,Education Level,"Bachelors, Some-college, 11th, HS-grad, Prof-...",,no
4,education-num,Feature,Integer,Education Level,,,no
5,marital-status,Feature,Categorical,Other,"Married-civ-spouse, Divorced, Never-married, S...",,no
6,occupation,Feature,Categorical,Other,"Tech-support, Craft-repair, Other-service, Sal...",,yes
7,relationship,Feature,Categorical,Other,"Wife, Own-child, Husband, Not-in-family, Other...",,no
8,race,Feature,Categorical,Race,"White, Asian-Pac-Islander, Amer-Indian-Eskimo,...",,no
9,sex,Feature,Binary,Sex,"Female, Male.",,no


### EDA

In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


#### Exploring income encoding

In [47]:
# Class imbalance, preprocessing for income column needed
# Consider using f1 score instead of accuracy 
y.assign(count = y["income"]).groupby("income").count()

Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
<=50K,24720
<=50K.,12435
>50K,7841
>50K.,3846


#### Exploring missing and unknown values

In [21]:
# All people with missing workclass also has missing occupation
data[data["workclass"].isna()]["occupation"].unique()

array([nan], dtype=object)

In [48]:
# People with missing occupation either had unknown occupation or never-worked
data[data["occupation"].isna()]["workclass"].unique()

array([nan, 'Never-worked'], dtype=object)

In [35]:
# What is the ? value
data["workclass"].unique()

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked',
       nan], dtype=object)

In [36]:
# People with unknown workclass, occupation, and native-country is marked with ?
data[data["workclass"] == "?"]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
27,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K
61,32,?,293936,7th-8th,4,Married-spouse-absent,?,Not-in-family,White,Male,0,0,40,?,<=50K
69,25,?,200681,Some-college,10,Never-married,?,Own-child,White,Male,0,0,40,United-States,<=50K
77,67,?,212759,10th,6,Married-civ-spouse,?,Husband,White,Male,0,0,2,United-States,<=50K
106,17,?,304873,10th,6,Never-married,?,Own-child,White,Female,34095,0,32,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,35,?,320084,Bachelors,13,Married-civ-spouse,?,Wife,White,Female,0,0,55,United-States,>50K
32531,30,?,33811,Bachelors,13,Never-married,?,Not-in-family,Asian-Pac-Islander,Female,0,0,99,United-States,<=50K
32539,71,?,287372,Doctorate,16,Married-civ-spouse,?,Husband,White,Male,0,0,10,United-States,>50K
32541,41,?,202822,HS-grad,9,Separated,?,Not-in-family,Black,Female,0,0,32,United-States,<=50K


In [64]:
# Which columns have ? values
for col in data.columns:
    if "?" in np.array(data[col]):
        print(col)

workclass
occupation
native-country


  if "?" in np.array(data[col]):


#### Exploring how many rows impacted if drop missing / ? values

In [50]:
data[data["workclass"] == "?"]["occupation"].unique()

array(['?'], dtype=object)

In [51]:
data[data["workclass"] == "?"]["native-country"].unique()

array(['South', '?', 'United-States', 'Italy', 'Canada', 'China',
       'Jamaica', 'Haiti', 'Honduras', 'Germany', 'Philippines', 'Mexico',
       'El-Salvador', 'Nicaragua', 'Iran', 'Poland', 'England', 'Taiwan',
       'Portugal', 'Trinadad&Tobago', 'Guatemala', 'Japan', 'Vietnam',
       'Columbia', 'Hong', 'Cuba', 'Laos', 'Ecuador', 'France',
       'Puerto-Rico', 'Dominican-Republic', 'Peru', 'Cambodia',
       'Thailand', 'Scotland'], dtype=object)

In [52]:
data[data["occupation"] == "?"]["workclass"].unique()

array(['?', 'Never-worked'], dtype=object)

In [53]:
data[data["occupation"] == "?"]["native-country"].unique()

array(['South', '?', 'United-States', 'Italy', 'Canada', 'China',
       'Jamaica', 'Haiti', 'Honduras', 'Germany', 'Philippines', 'Mexico',
       'El-Salvador', 'Nicaragua', 'Iran', 'Poland', 'England', 'Taiwan',
       'Portugal', 'Trinadad&Tobago', 'Guatemala', 'Japan', 'Vietnam',
       'Columbia', 'Hong', 'Cuba', 'Laos', 'Ecuador', 'France',
       'Puerto-Rico', 'Dominican-Republic', 'Peru', 'Cambodia',
       'Thailand', 'Scotland'], dtype=object)

In [54]:
data[data["native-country"] == "?"]["workclass"].unique()

array(['Private', '?', 'State-gov', 'Self-emp-not-inc', 'Self-emp-inc',
       'Local-gov', 'Federal-gov'], dtype=object)

In [55]:
data[data["native-country"] == "?"]["occupation"].unique()

array(['Craft-repair', 'Sales', 'Other-service', '?', 'Adm-clerical',
       'Exec-managerial', 'Prof-specialty', 'Machine-op-inspct',
       'Transport-moving', 'Handlers-cleaners', 'Priv-house-serv',
       'Farming-fishing', 'Tech-support', 'Protective-serv'], dtype=object)

In [74]:
# Dropping doesn't impact much
data[(data["workclass"] != "?") | (data["occupation"] != "?") | (data["native-country"] != "?")].dropna()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K.
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


#### Exploring possibly correlated features

In [46]:
# education-num can be used to encode education
data.groupby(["education", "education-num"])[["age"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,age
education,education-num,Unnamed: 2_level_1
10th,6,1389
11th,7,1812
12th,8,657
1st-4th,2,247
5th-6th,3,509
7th-8th,4,955
9th,5,756
Assoc-acdm,12,1601
Assoc-voc,11,2061
Bachelors,13,8025


#### Exploring how many of each categorical variable there are and distribution

In [103]:
# Number of classes in each categorical cariable
categorical = data.drop(columns = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", 
                                   "hours-per-week", "income"])
categorical_count = {}
for col in categorical.columns:
    categorical_count[col] = len(categorical[col].unique())
    
categorical_count

{'workclass': 10,
 'education': 16,
 'marital-status': 7,
 'occupation': 16,
 'relationship': 6,
 'race': 5,
 'sex': 2,
 'native-country': 43}

In [107]:
# Function to get count of classes in each feature
def get_distr(col):
    return data.groupby(col)[["age"]].count().sort_values("age")

In [108]:
get_distr("workclass")

Unnamed: 0_level_0,age
workclass,Unnamed: 1_level_1
Never-worked,10
Without-pay,21
Federal-gov,1432
Self-emp-inc,1695
?,1836
State-gov,1981
Local-gov,3136
Self-emp-not-inc,3862
Private,33906


In [109]:
get_distr("education")

Unnamed: 0_level_0,age
education,Unnamed: 1_level_1
Preschool,83
1st-4th,247
5th-6th,509
Doctorate,594
12th,657
9th,756
Prof-school,834
7th-8th,955
10th,1389
Assoc-acdm,1601


In [110]:
get_distr("marital-status")

Unnamed: 0_level_0,age
marital-status,Unnamed: 1_level_1
Married-AF-spouse,37
Married-spouse-absent,628
Widowed,1518
Separated,1530
Divorced,6633
Never-married,16117
Married-civ-spouse,22379


In [111]:
get_distr("occupation")

Unnamed: 0_level_0,age
occupation,Unnamed: 1_level_1
Armed-Forces,15
Priv-house-serv,242
Protective-serv,983
Tech-support,1446
Farming-fishing,1490
?,1843
Handlers-cleaners,2072
Transport-moving,2355
Machine-op-inspct,3022
Other-service,4923


In [112]:
get_distr("relationship")

Unnamed: 0_level_0,age
relationship,Unnamed: 1_level_1
Other-relative,1506
Wife,2331
Unmarried,5125
Own-child,7581
Not-in-family,12583
Husband,19716


In [113]:
get_distr("race")

Unnamed: 0_level_0,age
race,Unnamed: 1_level_1
Other,406
Amer-Indian-Eskimo,470
Asian-Pac-Islander,1519
Black,4685
White,41762


In [114]:
get_distr("sex")

Unnamed: 0_level_0,age
sex,Unnamed: 1_level_1
Female,16192
Male,32650


In [115]:
get_distr("native-country")

Unnamed: 0_level_0,age
native-country,Unnamed: 1_level_1
Holand-Netherlands,1
Hungary,19
Honduras,20
Scotland,21
Laos,23
Outlying-US(Guam-USVI-etc),23
Yugoslavia,23
Trinadad&Tobago,27
Cambodia,28
Thailand,30


#### Exploring capital gain and capital loss

In [136]:
data["capital-gain"].describe()

count    48842.000000
mean      1079.067626
std       7452.019058
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      99999.000000
Name: capital-gain, dtype: float64

In [137]:
data["capital-loss"].describe()

count    48842.000000
mean        87.502314
std        403.004552
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max       4356.000000
Name: capital-loss, dtype: float64

### Preprocessing

1. Replace nan with ? to represent unknown category or drop all missing and ? values
2. Preprocess income to be a binary value
3. Drop education column since education-num encodes that

In [85]:
# Encodes if you made over 50k as 1 and under 50k as 0
data = data.assign(income = data["income"].str.split(".").str[0].apply(lambda x: 1 if x[0] == "<" else 0))