In [2]:
# Import required libraries
import numpy as np
import pandas as pd
import warnings
import logging

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from pickle import dump
from pickle import load


In [8]:
# Load the intermediate census data; the path is relative to the
# notebook's current working directory.
df = pd.read_csv('../data/census_intermediate.csv')

In [4]:
# Separate the 'salary' target column from the remaining columns.
X = df.drop(columns='salary')
y = df['salary']

In [5]:
# Hold out 20% of the rows as a test set; random_state=1 pins the shuffle
# so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [6]:
# Display the held-out target labels produced by the split.
y_test

9646     <=50K
709      <=50K
7385      >50K
16671    <=50K
21932    <=50K
         ...  
5889      >50K
25723    <=50K
29514    <=50K
1600     <=50K
639       >50K
Name: salary, Length: 6513, dtype: object

In [7]:
def process_data(data_df):
    """Encode and scale a census frame, returning features and target.

    All transformations are applied to a copy of ``data_df``, so the
    caller's frame is left unchanged and the function can safely be called
    more than once on the same DataFrame.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain a 'salary' column, an 'education' column and every
        column named in ``LE_Features`` below.

    Returns
    -------
    X
        Whatever the persisted scaler's ``transform`` returns for the frame
        without 'salary' and 'education', after each ``LE_Features`` column
        has been replaced by its persisted encoder's output.
        (Presumably a NumPy array from a fitted RobustScaler - confirm
        against the code that wrote '../model/robust_scaler.pkl'.)
    y : pandas.Series
        The 'salary' column with '<=50K' replaced by 0 and '>50K' by 1.

    Raises
    ------
    KeyError
        If a required column is missing from ``data_df``.
    OSError
        If an encoder or scaler pickle cannot be opened (for example
        FileNotFoundError when the '../model' artefacts are absent).
    """
    LE_Features = ['workclass', 'marital-status', 'occupation',
                   'relationship', 'race', 'sex', 'native-country']

    # Work on a copy: assigning columns on the caller's frame would leave it
    # already-encoded, so a second call would hand encoded values to the
    # encoders instead of the raw categories.
    data_df = data_df.copy()
    data_df['salary'] = data_df['salary'].replace({'<=50K': 0, '>50K': 1})

    # NOTE(security): pickle can execute arbitrary code while loading. Only
    # load encoder/scaler files that this project produced itself.
    for feature in LE_Features:
        filepath = '../model/{}_encoder.pkl'.format(feature)
        # Context manager guarantees the file handle is closed.
        with open(filepath, 'rb') as encoder_file:
            le = load(encoder_file)
        data_df[feature] = le.transform(data_df[feature])

    y = data_df['salary']
    # 'education' is dropped together with the target before scaling.
    X = data_df.drop(['salary', 'education'], axis=1)

    filepath = '../model/robust_scaler.pkl'
    with open(filepath, 'rb') as scaler_file:
        scaler = load(scaler_file)

    X = scaler.transform(X)

    return X, y

In [9]:
# Run the full frame through process_data; this rebinds X and y to the
# processed features and target.
X, y = process_data(df)

In [10]:
# Display the processed feature matrix returned by process_data.
X

array([[ 0.1       ,  3.        , -0.84274115, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.65      ,  2.        , -0.7944218 , ...,  0.        ,
        -5.4       ,  0.        ],
       [ 0.05      ,  0.        ,  0.30900203, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.05      ,  0.        , -0.22243577, ...,  0.        ,
         0.        ,  0.        ],
       [-0.75      ,  0.        ,  0.19096774, ...,  0.        ,
        -4.        ,  0.        ],
       [ 0.75      ,  1.        ,  0.91168899, ...,  0.        ,
         0.        ,  0.        ]])

In [11]:
# Display the target returned by process_data.
y

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: salary, Length: 32561, dtype: int64