# Model Building

In [42]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
from pickle import dump

In [3]:
df = pd.read_csv('../data/census_intermediate.csv')

In [4]:
df.head(5)

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
from sklearn.preprocessing import LabelEncoder

LE_Features = ['workclass', 'marital-status', 'occupation', 
               'relationship', 'race','sex', 'native-country']

for feature in LE_Features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    
    mapping = dict(zip(le.classes_, range(len(le.classes_))))
    print(feature)
    print(mapping)
    filepath = '../model/{}_encoder.pkl'.format(feature)
    dump(le, open(filepath, 'wb'))
    print('Saved encoder to model/', '\n')

workclass
{'?': 0, 'Federal-gov': 1, 'Local-gov': 2, 'Never-worked': 3, 'Private': 4, 'Self-emp-inc': 5, 'Self-emp-not-inc': 6, 'State-gov': 7, 'Without-pay': 8}
Saved encoder to model/ 

marital-status
{'Divorced': 0, 'Married-AF-spouse': 1, 'Married-civ-spouse': 2, 'Married-spouse-absent': 3, 'Never-married': 4, 'Separated': 5, 'Widowed': 6}
Saved encoder to model/ 

occupation
{'?': 0, 'Adm-clerical': 1, 'Armed-Forces': 2, 'Craft-repair': 3, 'Exec-managerial': 4, 'Farming-fishing': 5, 'Handlers-cleaners': 6, 'Machine-op-inspct': 7, 'Other-service': 8, 'Priv-house-serv': 9, 'Prof-specialty': 10, 'Protective-serv': 11, 'Sales': 12, 'Tech-support': 13, 'Transport-moving': 14}
Saved encoder to model/ 

relationship
{'Husband': 0, 'Not-in-family': 1, 'Other-relative': 2, 'Own-child': 3, 'Unmarried': 4, 'Wife': 5}
Saved encoder to model/ 

race
{'Amer-Indian-Eskimo': 0, 'Asian-Pac-Islander': 1, 'Black': 2, 'Other': 3, 'White': 4}
Saved encoder to model/ 

sex
{' Female': 0, ' Male': 1}
Sa

In [6]:
df.head()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,7,77516,Bachelors,13,4,1,1,4,1,2174,0,40,39,<=50K
1,50,6,83311,Bachelors,13,2,4,0,4,1,0,0,13,39,<=50K
2,38,4,215646,HS-grad,9,0,6,1,4,1,0,0,40,39,<=50K
3,53,4,234721,11th,7,2,6,0,2,1,0,0,40,39,<=50K
4,28,4,338409,Bachelors,13,2,10,5,2,0,0,0,40,5,<=50K


In [7]:
df.drop('education', axis = 1, inplace = True)

In [8]:
df.head()

Unnamed: 0,age,workclass,fnlgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,7,77516,13,4,1,1,4,1,2174,0,40,39,<=50K
1,50,6,83311,13,2,4,0,4,1,0,0,13,39,<=50K
2,38,4,215646,9,0,6,1,4,1,0,0,40,39,<=50K
3,53,4,234721,7,2,6,0,2,1,0,0,40,39,<=50K
4,28,4,338409,13,2,10,5,2,0,0,0,40,5,<=50K


In [9]:
dict(zip(le.classes_, range(len(le.classes_))))

{'?': 0,
 'Cambodia': 1,
 'Canada': 2,
 'China': 3,
 'Columbia': 4,
 'Cuba': 5,
 'Dominican-Republic': 6,
 'Ecuador': 7,
 'El-Salvador': 8,
 'England': 9,
 'France': 10,
 'Germany': 11,
 'Greece': 12,
 'Guatemala': 13,
 'Haiti': 14,
 'Holand-Netherlands': 15,
 'Honduras': 16,
 'Hong': 17,
 'Hungary': 18,
 'India': 19,
 'Iran': 20,
 'Ireland': 21,
 'Italy': 22,
 'Jamaica': 23,
 'Japan': 24,
 'Laos': 25,
 'Mexico': 26,
 'Nicaragua': 27,
 'Outlying-US(Guam-USVI-etc)': 28,
 'Peru': 29,
 'Philippines': 30,
 'Poland': 31,
 'Portugal': 32,
 'Puerto-Rico': 33,
 'Scotland': 34,
 'South': 35,
 'Taiwan': 36,
 'Thailand': 37,
 'Trinadad&Tobago': 38,
 'United-States': 39,
 'Vietnam': 40,
 'Yugoslavia': 41}

In [10]:
df['salary'] = df['salary'].replace({'<=50K': 0, '>50K':1})

In [11]:
y = df['salary']
X = df.drop('salary', axis = 1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [13]:
scaler = RobustScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
filepath = '../model/robust_scaler.pkl'
dump(scaler, open(filepath, 'wb'))

In [15]:
Log_model = LogisticRegression(max_iter = 1000).fit(X_train, y_train)
y_pred_lr = Log_model.predict(X_test)
accuracy_score(y_test, y_pred_lr)

0.8284968524489482

In [16]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.85      0.94      0.89      5026
           1       0.69      0.45      0.54      1487

    accuracy                           0.83      6513
   macro avg       0.77      0.69      0.72      6513
weighted avg       0.82      0.83      0.81      6513



In [17]:
Log_model = LogisticRegression(max_iter = 1000)

grid_values = {'C':[0.001,0.01, 0.1, 1, 10, 100, 1000, 10000], 'penalty': ['l2']}

cross_validation = StratifiedKFold(n_splits = 5)

grid_log_model = GridSearchCV(Log_model, param_grid = grid_values, scoring = 'accuracy', cv = cross_validation)
grid_log_model.fit(X_train, y_train)

In [18]:
grid_log_model.best_params_

{'C': 10000, 'penalty': 'l2'}

In [19]:
y_pred_lr = grid_log_model.predict(X_test)
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.85      0.94      0.89      5026
           1       0.70      0.45      0.55      1487

    accuracy                           0.83      6513
   macro avg       0.77      0.70      0.72      6513
weighted avg       0.82      0.83      0.82      6513



In [20]:
grid_values3 = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
                'n_estimators' : [2, 3, 5, 10, 15, 20, 30, 50]}

gbm = XGBClassifier()

cross_validation = StratifiedKFold(n_splits = 5)

grid_xgb_model = GridSearchCV(gbm, param_grid = grid_values3,scoring = "accuracy", cv = cross_validation)
grid_xgb_model.fit(X_train, y_train)

print("Best parameters found: ", grid_xgb_model.best_params_)

Best parameters found:  {'max_depth': 6, 'n_estimators': 30}


In [21]:
y_pred_xgb = grid_xgb_model.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92      5026
           1       0.75      0.66      0.70      1487

    accuracy                           0.87      6513
   macro avg       0.83      0.80      0.81      6513
weighted avg       0.87      0.87      0.87      6513



In [22]:
xgb_final = XGBClassifier(max_depth = 6, n_estimators = 30)
xgb_final.fit(X_train, y_train)

y_preds = xgb_final.predict(X_test)
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92      5026
           1       0.75      0.66      0.70      1487

    accuracy                           0.87      6513
   macro avg       0.83      0.80      0.81      6513
weighted avg       0.87      0.87      0.87      6513



In [23]:
filepath = '../model/xgb_model.pkl'
dump(xgb_final, open(filepath, 'wb'))

In [24]:
df = pd.read_csv('../data/census_intermediate.csv')

In [25]:
df['salary'] = df['salary'].replace({'<=50K': 0, '>50K':1})

In [26]:
from pickle import load

In [27]:
LE_Features = ['workclass', 'marital-status', 'occupation', 
               'relationship', 'race','sex', 'native-country']

for feature in LE_Features:
    filepath = '../model/{}_encoder.pkl'.format(feature)
    le = load(open(filepath, 'rb'))
    df[feature] = le.transform(df[feature])
    print('Encoding for {}: ... done'.format(feature))

Encoding for workclass: ... done
Encoding for marital-status: ... done
Encoding for occupation: ... done
Encoding for relationship: ... done
Encoding for race: ... done
Encoding for sex: ... done
Encoding for native-country: ... done


In [28]:
y = df['salary']
X = df.drop(['salary', 'education'], axis = 1)

In [29]:
filepath = '../model/robust_scaler.pkl'
scaler = load(open(filepath, 'rb'))

X = scaler.transform(X)

In [30]:
filepath = '../model/xgb_model.pkl'
model = load(open(filepath, 'rb'))

y_preds = model.predict(X)
print(classification_report(y_preds, y))

              precision    recall  f1-score   support

           0       0.95      0.90      0.92     25886
           1       0.68      0.80      0.74      6675

    accuracy                           0.88     32561
   macro avg       0.81      0.85      0.83     32561
weighted avg       0.89      0.88      0.89     32561



In [31]:
def process_data(data_df):
    data_df['salary'] = data_df['salary'].replace({'<=50K': 0, '>50K':1})
    LE_Features = ['workclass', 'marital-status', 'occupation', 
                   'relationship', 'race','sex', 'native-country']

    for feature in LE_Features:
        filepath = '../model/{}_encoder.pkl'.format(feature)
        le = load(open(filepath, 'rb'))
        data_df[feature] = le.transform(data_df[feature])
        #print('Encoding for {}: ... done'.format(feature))
        
    y = data_df['salary']
    X = data_df.drop(['salary', 'education'], axis = 1)
    
    filepath = '../model/robust_scaler.pkl'
    scaler = load(open(filepath, 'rb'))

    X = scaler.transform(X)
    
    return X,y

In [32]:
df = pd.read_csv('../data/census_intermediate.csv')

In [33]:
df['salary'] = df['salary'].replace({'<=50K': 0, '>50K':1})

In [34]:
y = df['salary']
X = df.drop('salary', axis = 1)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [36]:
LE_Features = ['workclass', 'education', 'marital-status', 'occupation', 
               'relationship', 'race','sex', 'native-country']

for feature in LE_Features:
    print(set(df[feature]))
    print(len(set(df[feature])))
    print(set(X_test[feature]))
    print(len(set(X_test[feature])))
    print('\n')

{'?', 'State-gov', 'Federal-gov', 'Private', 'Self-emp-inc', 'Local-gov', 'Self-emp-not-inc', 'Without-pay', 'Never-worked'}
9
{'?', 'State-gov', 'Federal-gov', 'Private', 'Self-emp-inc', 'Local-gov', 'Self-emp-not-inc', 'Without-pay', 'Never-worked'}
9


{'12th', 'Bachelors', '1st-4th', '9th', 'HS-grad', '10th', 'Masters', 'Some-college', '5th-6th', 'Assoc-acdm', 'Assoc-voc', 'Doctorate', 'Prof-school', '7th-8th', 'Preschool', '11th'}
16
{'12th', 'Bachelors', '1st-4th', 'HS-grad', '9th', '10th', 'Some-college', 'Masters', '5th-6th', 'Assoc-acdm', 'Assoc-voc', 'Doctorate', 'Prof-school', '7th-8th', 'Preschool', '11th'}
16


{'Separated', 'Divorced', 'Never-married', 'Widowed', 'Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'}
7
{'Separated', 'Divorced', 'Never-married', 'Widowed', 'Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'}
7


{'Prof-specialty', '?', 'Protective-serv', 'Craft-repair', 'Transport-moving', 'Armed-Forces', 'Farming-fishing', 'Oth

In [37]:
data = X_test.join(y_test)

In [38]:
data

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
9646,62,Self-emp-not-inc,26911,7th-8th,4,Widowed,Other-service,Not-in-family,White,Female,0,0,66,United-States,0
709,18,Private,208103,11th,7,Never-married,Other-service,Other-relative,White,Male,0,0,25,United-States,0
7385,25,Private,102476,Bachelors,13,Never-married,Farming-fishing,Own-child,White,Male,27828,0,50,United-States,1
16671,33,Private,511517,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,0
21932,36,Private,292570,11th,7,Never-married,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5889,39,Private,146091,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,20,United-States,1
25723,17,Private,347322,10th,6,Never-married,Sales,Own-child,White,Female,0,0,20,United-States,0
29514,35,Private,290226,HS-grad,9,Never-married,Transport-moving,Own-child,White,Male,0,0,40,United-States,0
1600,30,Private,27207,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,45,United-States,0


In [46]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [39]:
pd.options.mode.chained_assignment = None

In [53]:
cols = ['Feature', 'Slice', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
slice_performance_df = pd.DataFrame(columns = cols)

In [54]:
filepath = '../model/xgb_model.pkl'
model = load(open(filepath, 'rb'))

Cat_Features = ['workclass', 'education', 'marital-status', 'occupation', 
               'relationship', 'race','sex', 'native-country']

for feature in Cat_Features:
    for slc in data[feature].unique():
        df_slice = data[data[feature] == slc]
        X_slice, y_slice = process_data(df_slice)
        y_pred_slice = model.predict(X_slice)
        acc = accuracy_score(y_pred_slice, y_slice)
        prec = precision_score(y_pred_slice, y_slice, zero_division = 0)
        rec = recall_score(y_pred_slice, y_slice, zero_division = 0)
        f1 = f1_score(y_pred_slice, y_slice, zero_division = 0)
        
        row_data = {'Feature':feature, 'Slice':slc, 'Accuracy':acc,
                    'Precision':prec, 'Recall':rec, 'F1 Score':f1}
        slice_performance_df = slice_performance_df.append(row_data,
                                                           ignore_index = True)


In [56]:
slice_performance_df.to_csv('Performance_on_slices.csv', index = False)