In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
import numpy as np

import warnings
warnings.filterwarnings("ignore")

data = pd.read_csv("adult.csv")
for column in data.columns:
    data[column].replace(' ?', None, inplace=True)
pro_data = data.dropna()
pro_data = pro_data.drop('fnlwgt', axis=1)

target_col = 'income-class'
non_num_cols = [column for column in pro_data.drop([target_col], axis = 1).columns if pro_data[column].dtypes == "O"]
num_cols = [column for column in pro_data.drop([target_col], axis = 1).columns if column not in non_num_cols]

num_data = pro_data[num_cols]
cat_data = pro_data[non_num_cols]

In [2]:
num_cols

['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

In [3]:
cat_data

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


In [4]:
encoder = OneHotEncoder(sparse = True)
enc_data = encoder.fit_transform(cat_data)
enc_df = pd.DataFrame.sparse.from_spmatrix(enc_data, columns = encoder.get_feature_names(cat_data.columns))
enc_df

Unnamed: 0,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 10th,education_ 11th,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32557,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32558,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32559,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [5]:
binarizer = LabelBinarizer()
target_data = binarizer.fit_transform(pro_data[target_col])
target_data = target_data.reshape(1, -1)[0]
enc_df['income'] = target_data

In [6]:
fin_data = pd.concat([num_data.reset_index(), enc_df.reset_index()], axis = 1, join = 'inner').drop(['index'], axis = 1)
fin_data.to_csv("income.csv", index = False)

In [7]:
import json

num_cols = [col for col in num_cols]
one_hot_cols = list(enc_df.columns.drop('income'))

col_dict = {"continuous": num_cols, "discrete": one_hot_cols}

form = json.dumps(col_dict)
f = open("col_dict.json", "w")
f.write(form)
f.close()

In [8]:
fin_data.columns

Index(['age', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'workclass_ Federal-gov', 'workclass_ Local-gov',
       'workclass_ Never-worked', 'workclass_ Private',
       'workclass_ Self-emp-inc',
       ...
       'native-country_ Puerto-Rico', 'native-country_ Scotland',
       'native-country_ South', 'native-country_ Taiwan',
       'native-country_ Thailand', 'native-country_ Trinadad&Tobago',
       'native-country_ United-States', 'native-country_ Vietnam',
       'native-country_ Yugoslavia', 'income'],
      dtype='object', length=105)

In [9]:
with open('col_dict.json') as file:
    col_dict = json.load(file)
file.close()
col_dict

{'continuous': ['age',
  'education-num',
  'capital-gain',
  'capital-loss',
  'hours-per-week'],
 'discrete': ['workclass_ Federal-gov',
  'workclass_ Local-gov',
  'workclass_ Never-worked',
  'workclass_ Private',
  'workclass_ Self-emp-inc',
  'workclass_ Self-emp-not-inc',
  'workclass_ State-gov',
  'workclass_ Without-pay',
  'education_ 10th',
  'education_ 11th',
  'education_ 12th',
  'education_ 1st-4th',
  'education_ 5th-6th',
  'education_ 7th-8th',
  'education_ 9th',
  'education_ Assoc-acdm',
  'education_ Assoc-voc',
  'education_ Bachelors',
  'education_ Doctorate',
  'education_ HS-grad',
  'education_ Masters',
  'education_ Preschool',
  'education_ Prof-school',
  'education_ Some-college',
  'marital-status_ Divorced',
  'marital-status_ Married-AF-spouse',
  'marital-status_ Married-civ-spouse',
  'marital-status_ Married-spouse-absent',
  'marital-status_ Never-married',
  'marital-status_ Separated',
  'marital-status_ Widowed',
  'occupation_ Adm-clerical'