# Model Building

In [77]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [98]:
from pickle import dump

In [108]:
df = pd.read_csv('../data/census_intermediate.csv')

In [109]:
df.head(5)

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [110]:
from sklearn.preprocessing import LabelEncoder

LE_Features = ['workclass', 'marital-status', 'occupation', 
               'relationship', 'race','sex', 'native-country']

for feature in LE_Features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    
    mapping = dict(zip(le.classes_, range(len(le.classes_))))
    print(feature)
    print(mapping)
    filepath = '../model/{}_encoder.pkl'.format(feature)
    dump(le, open(filepath, 'wb'))
    print('Saved encoder to model/', '\n')

workclass
{'?': 0, 'Federal-gov': 1, 'Local-gov': 2, 'Never-worked': 3, 'Private': 4, 'Self-emp-inc': 5, 'Self-emp-not-inc': 6, 'State-gov': 7, 'Without-pay': 8}
Saved encoder to model/ 

marital-status
{'Divorced': 0, 'Married-AF-spouse': 1, 'Married-civ-spouse': 2, 'Married-spouse-absent': 3, 'Never-married': 4, 'Separated': 5, 'Widowed': 6}
Saved encoder to model/ 

occupation
{'?': 0, 'Adm-clerical': 1, 'Armed-Forces': 2, 'Craft-repair': 3, 'Exec-managerial': 4, 'Farming-fishing': 5, 'Handlers-cleaners': 6, 'Machine-op-inspct': 7, 'Other-service': 8, 'Priv-house-serv': 9, 'Prof-specialty': 10, 'Protective-serv': 11, 'Sales': 12, 'Tech-support': 13, 'Transport-moving': 14}
Saved encoder to model/ 

relationship
{'Husband': 0, 'Not-in-family': 1, 'Other-relative': 2, 'Own-child': 3, 'Unmarried': 4, 'Wife': 5}
Saved encoder to model/ 

race
{'Amer-Indian-Eskimo': 0, 'Asian-Pac-Islander': 1, 'Black': 2, 'Other': 3, 'White': 4}
Saved encoder to model/ 

sex
{' Female': 0, ' Male': 1}
Sa

In [81]:
df.head()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,7,77516,Bachelors,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,Bachelors,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,HS-grad,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,11th,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,Bachelors,13,2,10,5,2,0,0,0,40,5,0


In [82]:
df.drop('education', axis = 1, inplace = True)

In [83]:
df.head()

Unnamed: 0,age,workclass,fnlgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,7,77516,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,13,2,10,5,2,0,0,0,40,5,0


In [84]:
dict(zip(le.classes_, range(len(le.classes_))))

{'<=50K': 0, '>50K': 1}

In [85]:
y = df['salary']
X = df.drop('salary', axis = 1)

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [87]:
scaler = RobustScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [112]:
filepath = '../model/robust_scaler.pkl'
dump(scaler, open(filepath, 'wb'))

In [88]:
Log_model = LogisticRegression(max_iter = 1000).fit(X_train, y_train)
y_pred_lr = Log_model.predict(X_test)
accuracy_score(y_test, y_pred_lr)

0.8284968524489482

In [89]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.85      0.94      0.89      5026
           1       0.69      0.45      0.54      1487

    accuracy                           0.83      6513
   macro avg       0.77      0.69      0.72      6513
weighted avg       0.82      0.83      0.81      6513



In [90]:
Log_model = LogisticRegression(max_iter = 1000)

grid_values = {'C':[0.001,0.01, 0.1, 1, 10, 100, 1000, 10000], 'penalty': ['l2']}

cross_validation = StratifiedKFold(n_splits = 5)

grid_log_model = GridSearchCV(Log_model, param_grid = grid_values, scoring = 'accuracy', cv = cross_validation)
grid_log_model.fit(X_train, y_train)

In [91]:
grid_log_model.best_params_

{'C': 10000, 'penalty': 'l2'}

In [92]:
y_pred_lr = grid_log_model.predict(X_test)
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.85      0.94      0.89      5026
           1       0.70      0.45      0.55      1487

    accuracy                           0.83      6513
   macro avg       0.77      0.70      0.72      6513
weighted avg       0.82      0.83      0.82      6513



In [93]:
grid_values3 = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
                'n_estimators' : [2, 3, 5, 10, 15, 20, 30, 50]}

gbm = XGBClassifier()

cross_validation = StratifiedKFold(n_splits = 5)

grid_xgb_model = GridSearchCV(gbm, param_grid = grid_values3,scoring = "accuracy", cv = cross_validation)
grid_xgb_model.fit(X_train, y_train)

print("Best parameters found: ", grid_xgb_model.best_params_)

Best parameters found:  {'max_depth': 6, 'n_estimators': 30}


In [94]:
y_pred_xgb = grid_xgb_model.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92      5026
           1       0.75      0.66      0.70      1487

    accuracy                           0.87      6513
   macro avg       0.83      0.80      0.81      6513
weighted avg       0.87      0.87      0.87      6513



In [113]:
xgb_final = XGBClassifier(max_depth = 6, n_estimators = 30)
xgb_final.fit(X_train, y_train)

y_preds = xgb_final.predict(X_test)
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92      5026
           1       0.75      0.66      0.70      1487

    accuracy                           0.87      6513
   macro avg       0.83      0.80      0.81      6513
weighted avg       0.87      0.87      0.87      6513



In [114]:
filepath = '../model/xgb_model.pkl'
dump(xgb_final, open(filepath, 'wb'))

In [136]:
df = pd.read_csv('../data/census_intermediate.csv')

In [116]:
df['salary'] = df['salary'].replace({'<=50K': 0, '>50K':1})

In [117]:
from pickle import load

In [186]:
LE_Features = ['workclass', 'marital-status', 'occupation', 
               'relationship', 'race','sex', 'native-country']

for feature in LE_Features:
    filepath = '../model/{}_encoder.pkl'.format(feature)
    le = load(open(filepath, 'rb'))
    df[feature] = le.transform(df[feature])
    print('Encoding for {}: ... done'.format(feature))

Encoding for workclass: ... done
Encoding for marital-status: ... done
Encoding for occupation: ... done
Encoding for relationship: ... done
Encoding for race: ... done
Encoding for sex: ... done
Encoding for native-country: ... done


In [187]:
y = df['salary']
X = df.drop(['salary', 'education'], axis = 1)

In [188]:
filepath = '../model/robust_scaler.pkl'
scaler = load(open(filepath, 'rb'))

X = scaler.transform(X)

In [189]:
filepath = '../model/xgb_model.pkl'
model = load(open(filepath, 'rb'))

y_preds = model.predict(X)
print(classification_report(y_preds, y))

TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=[0 1] and y_pred=['<=50K' '>50K']. Make sure that the predictions provided by the classifier coincides with the true labels.

In [177]:
def process_data(data_df):
    data_df['salary'] = data_df['salary'].replace({'<=50K': 0, '>50K':1})
    LE_Features = ['workclass', 'marital-status', 'occupation', 
                   'relationship', 'race','sex', 'native-country']

    for feature in LE_Features:
        filepath = '../model/{}_encoder.pkl'.format(feature)
        le = load(open(filepath, 'rb'))
        data_df[feature] = le.transform(data_df[feature])
        #print('Encoding for {}: ... done'.format(feature))
        
    y = data_df['salary']
    X = data_df.drop(['salary', 'education'], axis = 1)
    
    filepath = '../model/robust_scaler.pkl'
    scaler = load(open(filepath, 'rb'))

    X = scaler.transform(X)
    
    return X,y

In [191]:
df = pd.read_csv('../data/census_intermediate.csv')

In [163]:
df['salary'] = df['salary'].replace({'<=50K': 0, '>50K':1})

In [192]:
y = df['salary']
X = df.drop('salary', axis = 1)

In [193]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [194]:
LE_Features = ['workclass', 'education', 'marital-status', 'occupation', 
               'relationship', 'race','sex', 'native-country']

for feature in LE_Features:
    print(set(df[feature]))
    print(len(set(df[feature])))
    print(set(X_test[feature]))
    print(len(set(X_test[feature])))
    print('\n')

{'Self-emp-inc', 'State-gov', 'Without-pay', 'Self-emp-not-inc', '?', 'Private', 'Local-gov', 'Never-worked', 'Federal-gov'}
9
{'Self-emp-inc', 'State-gov', 'Without-pay', 'Self-emp-not-inc', '?', 'Private', 'Local-gov', 'Never-worked', 'Federal-gov'}
9


{'Bachelors', 'Prof-school', 'Some-college', '9th', '7th-8th', '10th', '5th-6th', '1st-4th', 'Assoc-acdm', 'Doctorate', 'Masters', 'HS-grad', '11th', 'Preschool', 'Assoc-voc', '12th'}
16
{'Bachelors', 'Prof-school', 'Some-college', '9th', '7th-8th', '10th', '5th-6th', '1st-4th', 'Assoc-acdm', 'Doctorate', 'Masters', 'HS-grad', '11th', 'Preschool', 'Assoc-voc', '12th'}
16


{'Married-spouse-absent', 'Widowed', 'Married-civ-spouse', 'Never-married', 'Divorced', 'Married-AF-spouse', 'Separated'}
7
{'Married-spouse-absent', 'Widowed', 'Married-civ-spouse', 'Never-married', 'Divorced', 'Married-AF-spouse', 'Separated'}
7


{'Prof-specialty', 'Other-service', 'Tech-support', 'Adm-clerical', 'Sales', 'Handlers-cleaners', 'Farming-fishing', '

In [195]:
data = X_test.join(y_test)

In [196]:
data

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
9646,62,Self-emp-not-inc,26911,7th-8th,4,Widowed,Other-service,Not-in-family,White,Female,0,0,66,United-States,<=50K
709,18,Private,208103,11th,7,Never-married,Other-service,Other-relative,White,Male,0,0,25,United-States,<=50K
7385,25,Private,102476,Bachelors,13,Never-married,Farming-fishing,Own-child,White,Male,27828,0,50,United-States,>50K
16671,33,Private,511517,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
21932,36,Private,292570,11th,7,Never-married,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5889,39,Private,146091,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,20,United-States,>50K
25723,17,Private,347322,10th,6,Never-married,Sales,Own-child,White,Female,0,0,20,United-States,<=50K
29514,35,Private,290226,HS-grad,9,Never-married,Transport-moving,Own-child,White,Male,0,0,40,United-States,<=50K
1600,30,Private,27207,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,45,United-States,<=50K


In [198]:
pd.options.mode.chained_assignment = None

In [199]:
filepath = '../model/xgb_model.pkl'
model = load(open(filepath, 'rb'))

Cat_Features = ['workclass', 'education', 'marital-status', 'occupation', 
               'relationship', 'race','sex', 'native-country']

for feature in Cat_Features:
    for slc in data[feature].unique():
        df_slice = data[data[feature] == slc]
        X_slice, y_slice = process_data(df_slice)
        y_pred_slice = model.predict(X_slice)
        print("{}:{}".format(feature, slc))
        print(accuracy_score(y_pred_slice, y_slice))
        print('\n')


workclass:Self-emp-not-inc
0.7963709677419355


workclass:Private
0.8850599781897491


workclass:Local-gov
0.8378378378378378


workclass:Federal-gov
0.8549222797927462


workclass:Self-emp-inc
0.7714285714285715


workclass:?
0.9105691056910569


workclass:State-gov
0.875


workclass:Without-pay
1.0


workclass:Never-worked
1.0


education:7th-8th
0.948905109489051


education:11th
0.9578059071729957


education:Bachelors
0.8101145038167938


education:HS-grad
0.887535145267104


education:Some-college
0.882312925170068


education:Masters
0.7633136094674556


education:Assoc-voc
0.846441947565543


education:Doctorate
0.84


education:5th-6th
0.9848484848484849


education:10th
0.9441340782122905


education:12th
0.9591836734693877


education:Assoc-acdm
0.8577981651376146


education:Prof-school
0.8113207547169812


education:9th
0.9479166666666666


education:1st-4th
0.9736842105263158


education:Preschool
1.0


marital-status:Widowed
0.9395348837209302


marital-status:Never-marr