# Adult Dataset

Link: https://archive.ics.uci.edu/dataset/2/adult

Prediction task: determine whether a person's income is over 50K a year

Minority class: ">50K"  
Majority class: "<=50K"


Summary:
- split into train and test
- 48842 rows
- 14 features
- missing values are represented with "?"


In [1]:
import pandas as pd
import numpy as np
import json
import os

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [20]:
data_path = "../data/raw/adult"
test_path = f"{data_path}/adult.test"
info_path = "../data/info/adult.json"
# Cleaned copy of adult.test that can be parsed the same way as adult.data.
test_save_path = f"{data_path}/test.data"

# Load the train split; the raw file has no header row.
train_df = pd.read_csv(f"{data_path}/adult.data", header=None, skipinitialspace=True)

# adult.test cannot be parsed directly: its first line is not a data record and
# the records carry a trailing ".". Create the cleaned copy once and reuse it.
if not os.path.exists(test_save_path):
    # Write to a temporary file and rename it afterwards, so an interrupted run
    # can never leave a truncated test.data that later runs would silently reuse.
    tmp_save_path = f"{test_save_path}.tmp"
    with open(test_path, "r") as raw_file, open(tmp_save_path, "w") as clean_file:
        next(raw_file, None)  # skip the non-data first line
        for line in raw_file:
            # Drop the line ending (and stray trailing whitespace) first, then
            # the trailing "." so only the end of the record is touched.
            save_line = line.rstrip().rstrip(".")
            if save_line:  # blank lines carry no record
                clean_file.write(f"{save_line}\n")
    os.replace(tmp_save_path, test_save_path)

# Load the test split from the cleaned copy.
test_df = pd.read_csv(test_save_path, header=None, skipinitialspace=True)

# Column names are stored separately in the JSON info file.
with open(info_path, "r") as f:
    info = json.load(f)

# Assigning (instead of passing names= to read_csv) fails loudly if the number
# of names does not match the number of parsed columns.
train_df.columns = info["column_names"]
test_df.columns = info["column_names"]

In [21]:
# Preview the train split with the column names applied.
train_df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [22]:
# Preview the test split with the column names applied.
test_df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
16277,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


## Summary

In [28]:
# Row counts and label distribution of both splits.
label_col = "income"
n_train_rows, n_test_rows = len(train_df), len(test_df)
train_label_counts = train_df[label_col].value_counts()

sum_rows = n_train_rows + n_test_rows
# ">50K" is the minority label, "<=50K" the majority label; a label that does
# not occur counts as 0.
train_min_num = int(train_label_counts.get(">50K", 0))
train_maj_num = int(train_label_counts.get("<=50K", 0))

print("shape train:", train_df.shape)
print("shape test:", test_df.shape)
print("Sum rows:", sum_rows)
print()
# Share of minority-class rows in the train split, in percent.
print("Class Imbalance train:", train_min_num * 100 / n_train_rows)
print("Value counts label train:")
print(train_label_counts)
print()
print("Value counts label test:")
print(test_df[label_col].value_counts())

shape train: (32561, 15)
shape test: (16281, 15)
Sum rows: 48842

Class Imbalance train: 24.080955744602438
Value counts label train:
income
<=50K    24720
>50K      7841
Name: count, dtype: int64

Value counts label test:
income
<=50K    12435
>50K      3846
Name: count, dtype: int64


### How many rows do I need to generate?
(1) The input into the conditional generator is balanced 50/50  
(2) No conditional generator -> only need to generate the difference


In [38]:
# Case (1): the conditional generator is fed a balanced (50/50) input.
majority_amount = 24720
min_amount = 7841
# The majority class gets downsampled to the minority size, so the generator
# input holds twice the minority amount.
input_gen = 2 * min_amount
# Number of minority rows missing to match the majority class.
reach_50 = majority_amount - min_amount
# The generated data is expected to be 50/50 as well (the training data is
# 50/50), so only half of it is minority data: generate twice the gap.
to_gen = 2 * reach_50
print(to_gen)
# Sanity check: half of the generated rows plus the real minority rows must
# equal the majority count.
print(to_gen / 2 + min_amount == float(majority_amount))


33758
True


In [39]:
# Case (2): no conditional generator, so only the gap between the majority
# (24720) and minority (7841) train counts has to be generated.
print(24720 - 7841)


16879


In [3]:
# Load the processed train_min.csv file.
# NOTE(review): presumably the minority-class (">50K") rows of the train split — confirm.
df_train_min = pd.read_csv("../data/processed/adult/train_min.csv")

In [7]:
# Mean of the education.num column in df_train_min.
df_train_min["education.num"].mean()

11.611656676444332

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) of education.num.
df_train_min["education.num"].describe()

count    7841.000000
mean       11.611657
std         2.385129
min         2.000000
25%        10.000000
50%        12.000000
75%        13.000000
max        16.000000
Name: education.num, dtype: float64