In [None]:
"""
Notebook to test and try machine learning model.
author: Ung Van Tuan
Date: June 29th 2023
"""

In [39]:
# Import the necessary libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier
)
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    fbeta_score,
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    classification_report
)
from sklearn.metrics import precision_recall_fscore_support as score

import seaborn as sns
sns.set()

In [40]:
# Load the data into a dataframe
df = pd.read_csv('data/clean_census.csv')
# Display the ten first rows from the dataframe
df.head(10)

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,5,37,Private,284582,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,6,49,Private,160187,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,7,52,Self-emp-not-inc,209642,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,8,31,Private,45781,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,9,42,Private,159449,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [41]:
def process_data(
        X, categorical_features=[], label=None, training=True, encoder=None, lb=None
):

    if label is not None:
        y = X[label]
        X = X.drop([label], axis=1)
    else:
        y = np.array([])

    X_categorical = X[categorical_features].values
    X_continuous = X.drop(*[categorical_features], axis=1)

    if training is True:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        lb = LabelBinarizer()
        X_categorical = encoder.fit_transform(X_categorical)
        y = lb.fit_transform(y.values).ravel()
    else:
        X_categorical = encoder.transform(X_categorical)
        try:
            y = lb.fit_transform(y.values).ravel()
        # Catch the case where y is None because we're doing inference.
        except AttributeError as error:
            print("Error occur: ", error)

    X = np.concatenate([X_continuous, X_categorical], axis=1)
    return X, y, encoder, lb

In [42]:
# Get the categorical feature except the column salary
categorical_features = list(df.select_dtypes(['object', 'category']).columns)[:-1]

# Show the columns
categorical_features

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [43]:
# Split the dataset into train and tét
train, test = train_test_split(df, shuffle=True, stratify=None, test_size=0.20, random_state=42)

In [44]:
# Create the OneHotEncoder and LabelBinarizer() objects
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
binarizer = LabelBinarizer()

In [45]:
# Get the metrics from the trained model
def compute_model_metrics(y, preds):
    fbeta = fbeta_score(y, preds, beta=1, zero_division=1)
    precision = precision_score(y, preds, zero_division=1)
    recall = recall_score(y, preds, zero_division=1)
    return precision, recall, fbeta

In [46]:
# Optional: implement hyperparameter tunning.
def train_model(X_train, y_train, models):
    for key in models.keys():
        models[key].fit(X_train, y_train)
    return models

In [47]:
# Get the processed train data
X_train, y_train, encoder, lb = process_data(train, categorical_features=categorical_features, label="salary", training=True)

In [48]:
# Create a dictionary for different models
models = {}
models['Logistic Regression'] = LogisticRegression()
models['Extrat Classfier'] = ExtraTreesClassifier(n_estimators=50)
models['Support Vector Machines'] = LinearSVC()
models['Gradient Boosting'] = GradientBoostingClassifier(n_estimators=333, learning_rate=0.8, max_depth=5, random_state=0)
models['Decision Trees'] = DecisionTreeClassifier()
models['Random Forest'] = RandomForestClassifier()
models['XGB Classifier'] = XGBClassifier(objective='binary:logistic', eta=0.3, max_depth= 5, eval_metric = 'aucpr')
models['Naive Bayes'] = GaussianNB()
models['K-Nearest Neighbor'] = KNeighborsClassifier()

In [49]:
# Train the model
trained_models = train_model(X_train, y_train, models)



In [50]:
# Get the result metrics into a dataframe
def df_model_results(trained_models, X_data, y_data):
    fbeta, precision, recall= {}, {}, {}
    for key in trained_models.keys():
        predictions = trained_models[key].predict(X_data)

        fbeta[key] = fbeta_score(y_data, predictions, beta=1, zero_division=1)
        precision[key] = precision_score(predictions, y_data)
        recall[key] = recall_score(predictions, y_data)

    df_model = pd.DataFrame(index=models.keys(), columns=['fbeta', 'precision', 'recall'])
    df_model['fbeta'] = fbeta.values()
    df_model['precision'] = precision.values()
    df_model['recall'] = recall.values()

    return df_model

In [51]:
# Check the result of the model
df_train_results = df_model_results(trained_models, X_train, y_train)
df_train_results

Unnamed: 0,fbeta,precision,recall
Logistic Regression,0.381335,0.25882,0.724092
Extrat Classfier,1.0,1.0,1.0
Support Vector Machines,0.356181,0.241661,0.676999
Gradient Boosting,0.986041,0.979795,0.992366
Decision Trees,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0
XGB Classifier,0.770244,0.712316,0.83843
Naive Bayes,0.418564,0.306607,0.65931
K-Nearest Neighbor,0.480656,0.347659,0.778456


In [52]:
# Get the test processed data
X_test, y_test, encoder, lb = process_data(test, categorical_features=categorical_features, label="salary", training=False, encoder=encoder, lb=binarizer)

In [53]:
# Get the result metrics for the test dta into a dataframe
df_test_results = df_model_results(trained_models, X_test, y_test)
df_test_results

Unnamed: 0,fbeta,precision,recall
Logistic Regression,0.405901,0.283219,0.716088
Extrat Classfier,0.648466,0.600125,0.705279
Support Vector Machines,0.3763,0.259513,0.684211
Gradient Boosting,0.68641,0.650655,0.726323
Decision Trees,0.60941,0.601996,0.617008
Random Forest,0.688569,0.627573,0.762699
XGB Classifier,0.720244,0.662508,0.789004
Naive Bayes,0.436333,0.328135,0.65099
K-Nearest Neighbor,0.349398,0.253275,0.563107


In [54]:
# Import the necessary libraries
from bayes_opt import BayesianOptimization
import xgboost as xgb

In [55]:
def bo_tune_xgb(max_depth, gamma, eta):
    params = {
        'objective': 'binary:logistic',
        'max_depth': int(max_depth),
        'eta': eta,
        'eval_metric': 'aucpr'
    }

    #Cross validating with the specified parameters in 5 folds and 70 iterations
    cv_result = xgb.cv(params, training_xgb_matrix, num_boost_round=70, nfold=5)
    #Return the resul
    cv_result = cv_result['train-aucpr-mean'].iloc[-1]
    return 1.0 * cv_result

In [56]:
# Instantiate a BayesianOptimization
xgb_bo = BayesianOptimization(
    bo_tune_xgb, {
        'max_depth': (3, 7),
        'gamma': (0, 1),
        'eta': (0.01, 0.4),
    }
)

In [57]:
# Group the train data into a xgb.DMatrix
training_xgb_matrix = xgb.DMatrix(X_train, label=y_train)
test_xgb_matrix = xgb.DMatrix(X_test, label=y_test)

In [58]:
# Run the optimization
xgb_bo.maximize(n_iter=6, init_points=8)


|   iter    |  target   |    eta    |   gamma   | max_depth |
-------------------------------------------------------------
| [0m1        [0m | [0m0.8452   [0m | [0m0.1869   [0m | [0m0.9652   [0m | [0m4.432    [0m |
| [95m2        [0m | [95m0.9106   [0m | [95m0.3786   [0m | [95m0.4149   [0m | [95m6.661    [0m |
| [0m3        [0m | [0m0.831    [0m | [0m0.05232  [0m | [0m0.06874  [0m | [0m6.202    [0m |
| [0m4        [0m | [0m0.8619   [0m | [0m0.119    [0m | [0m0.6762   [0m | [0m6.994    [0m |
| [0m5        [0m | [0m0.7861   [0m | [0m0.02774  [0m | [0m0.4576   [0m | [0m4.14     [0m |
| [0m6        [0m | [0m0.8441   [0m | [0m0.3629   [0m | [0m0.9114   [0m | [0m3.305    [0m |
| [0m7        [0m | [0m0.8597   [0m | [0m0.1783   [0m | [0m0.5746   [0m | [0m5.99     [0m |
| [0m8        [0m | [0m0.8281   [0m | [0m0.1991   [0m | [0m0.4917   [0m | [0m3.658    [0m |
| [0m9        [0m | [0m0.8418   [0m | [0m0.3222   

In [59]:
# Show the best hyperparameters
params = xgb_bo.max['params']
print(params)

{'eta': 0.4, 'gamma': 0.0, 'max_depth': 7.0}


In [60]:
# Retrain the model with the best hyperparameter
params = {
    'objective': 'binary:logistic',
    'eta': round(params['eta'], 1),
    'max_depth': round(params['max_depth']),
    'gamma': round(params['gamma']),
    'eval_metric': 'aucpr',
}

# Create a list of xgb.DMatrix
watch_list = [
    (test_xgb_matrix, 'eval'),
    (training_xgb_matrix, 'train')
]

# Train the model with the selected hyperparameters
xgb_model = xgb.train(params,
                      training_xgb_matrix,
                      num_boost_round=999,
                      evals=watch_list,
                      early_stopping_rounds=20)

[0]	eval-aucpr:0.77340	train-aucpr:0.77031
[1]	eval-aucpr:0.79360	train-aucpr:0.79579
[2]	eval-aucpr:0.80517	train-aucpr:0.81203
[3]	eval-aucpr:0.80748	train-aucpr:0.81581
[4]	eval-aucpr:0.81247	train-aucpr:0.82120
[5]	eval-aucpr:0.81456	train-aucpr:0.82496
[6]	eval-aucpr:0.82135	train-aucpr:0.83151
[7]	eval-aucpr:0.82306	train-aucpr:0.83587
[8]	eval-aucpr:0.82393	train-aucpr:0.83892
[9]	eval-aucpr:0.82563	train-aucpr:0.84064
[10]	eval-aucpr:0.82604	train-aucpr:0.84465
[11]	eval-aucpr:0.82502	train-aucpr:0.84764
[12]	eval-aucpr:0.82588	train-aucpr:0.85003
[13]	eval-aucpr:0.82665	train-aucpr:0.85238
[14]	eval-aucpr:0.82806	train-aucpr:0.85442
[15]	eval-aucpr:0.82741	train-aucpr:0.85921
[16]	eval-aucpr:0.82959	train-aucpr:0.86283
[17]	eval-aucpr:0.82968	train-aucpr:0.86436
[18]	eval-aucpr:0.83044	train-aucpr:0.86506
[19]	eval-aucpr:0.83142	train-aucpr:0.86697
[20]	eval-aucpr:0.83093	train-aucpr:0.86883
[21]	eval-aucpr:0.83093	train-aucpr:0.87010
[22]	eval-aucpr:0.83102	train-aucpr:0.8722

In [61]:
import pickle
pickle.dump(xgb_model, open("model/xgb_model.pkl", "wb"))

In [62]:
def classify_type(y_pred, y_label):
    rs = 'TP' if y_pred == 1 and y_label == 1 else 'FP' if y_pred == 1 and y_label == 0 else 'TN' if y_pred == 0 and y_label == 0 else 'FN'
    return rs

def evaluation(threshold):
    test_evaluation = test.copy()
    predictions = xgb_model.predict(test_xgb_matrix)
    test_evaluation['label'] = test_evaluation.apply(lambda x: 0 if x['salary'] == "<=50k" else 1, axis=1)
    test_evaluation['predicted_score'] = predictions
    test_evaluation['predicted_label'] = test_evaluation.apply(lambda x: 1 if x['predicted_score'] >= threshold else 0, axis = 1)
    test_evaluation['type'] = test_evaluation.apply(lambda x: classify_type(x['predicted_label'], x['label']), axis = 1)
    y_predict = test_evaluation['predicted_label'].tolist()
    precision, recall, fscore, support = score(y_test, y_predict)
    accuracy = accuracy_score(y_test, y_predict)

    return round(recall[1], 2), round(precision[1], 2), round(fscore[1], 2), round(accuracy, 2), y_test, y_predict, support

In [63]:
# Get the evaluation result
threshold = 0.5
recall, precision, fscore, accuracy, y_test, y_predict, support = evaluation(threshold)

In [64]:
# Print the evaluation result
print('recall: {}' . format(recall))
print('precision: {}' .format(precision))
print('fscore: {}' .format(fscore))
print('support: {}' .format(support))
print('accuracy: {}' .format(accuracy))

recall: 0.64
precision: 0.75
fscore: 0.69
support: [4905 1603]
accuracy: 0.86


In [65]:
# Print the classification report result
print(classification_report(y_test, y_predict, target_names=['0', '1']))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91      4905
           1       0.75      0.64      0.69      1603

    accuracy                           0.86      6508
   macro avg       0.82      0.78      0.80      6508
weighted avg       0.85      0.86      0.85      6508



In [66]:
# Show the distribution of the salary
test.groupby(['salary']).agg(person_count=("salary", "count")).reset_index()

Unnamed: 0,salary,person_count
0,<=50K,4905
1,>50K,1603


In [67]:
def compute_model_metrics(y, preds):
    fscore = f1_score(y, preds, zero_division=1)
    precision = precision_score(y, preds, zero_division=1)
    recall = recall_score(y, preds, zero_division=1)
    return precision, recall, fscore

In [68]:
def evaluation(data, predictions, y_test_threshold):
    data['label'] = y_test
    data['predicted_score'] = predictions
    data['predicted_label'] = data.apply(lambda x: 1 if x['predicted_score'] >= threshold else 0, axis = 1)
    data['type'] = data.apply(lambda x: classify_type(x['predicted_label'], x['label']), axis = 1)
    y_predict = data['predicted_label'].tolist()
    precision, recall, fscore, support = score(y_test, y_predict)
    accuracy = accuracy_score(y_test, y_predict)
    cl_report = classification_report(y_test, data['predicted_label'].values, target_names=['0', '1'])
    #logging.info(f"Classification report:\n{cl_report}")

    return round(recall[1], 2), round(precision[1], 2), \
           round(fscore[1], 2), round(accuracy, 2), y_test, y_predict, data

In [69]:
def compute_slices(df, feature, y, preds, threshold):
    slice_options = df[feature].unique().tolist()
    perf_df = pd.DataFrame(index=slice_options, columns=['feature', 'n_samples', 'precision', 'recall', 'fscore'])
    for option in slice_options:
        slice_mask = df[feature]==option

        slice_y = y[slice_mask]
        slice_preds = preds[slice_mask]
        precision, recall, fscore = compute_model_metrics(slice_y, slice_preds)

        perf_df.loc[option, 'feature'] = feature
        perf_df.loc[option, 'n_samples'] = len(slice_y)
        perf_df.loc[option, 'precision'] = precision
        perf_df.loc[option, 'recall'] = recall
        perf_df.loc[option, 'fscore'] = fscore

    # reorder columns in performance dataframe
    perf_df.reset_index(names='feature value', inplace=True)
    colList = list(perf_df.columns)
    colList[0], colList[1] =  colList[1], colList[0]
    perf_df = perf_df[colList]
    return perf_df


In [70]:
preds = xgb_model.predict(test_xgb_matrix)

In [71]:
np.array(y_predict)

array([0, 0, 0, ..., 0, 0, 1])

In [72]:
threshold = 0.50

In [73]:
for feature in enumerate(categorical_features):
    print(feature[1])

workclass
education
marital-status
occupation
relationship
race
sex
native-country


In [74]:
for feature in enumerate(categorical_features):
    performance_df = compute_slices(test, feature[1], y_test, np.array(y_predict), threshold)
    if feature[0] == 0:
        performance_df.to_csv("slice_pred.csv",  mode='a', index=False)
    else:
        performance_df.to_csv("slice_pred.csv",  mode='a', header=False, index=False)

In [75]:
X_train.shape[1] == X_test.shape[1]

True

In [76]:
import os
import pickle

In [77]:
# Test the model exist or not
model_path = "./model/xgb_models.pkl"
if os.path.isfile(model_path):
    model = pickle.load(open(model_path, 'rb'))
else:
    print("File does not exists")

File does not exists


In [4]:
import os

home_dir = os.path.expanduser("~")
print(home_dir)

/Users/jumet


In [5]:
current_dir = os.getcwd()
print(current_dir)

/Users/jumet/DE/Udacity/Cencus_Project/Census_Bureau_Prediction/starter


In [9]:
project_dir = os.path.dirname(os.path.abspath("model.py"))
print(project_dir)

/Users/jumet/DE/Udacity/Cencus_Project/Census_Bureau_Prediction/starter


In [10]:
absolute_path = '/eda/'
absolute_dir = os.path.dirname(absolute_path)
print(absolute_dir)  # Output: '/'

/eda
