In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the cleaned dataset from the project's data folder
df = pd.read_csv(filepath_or_buffer='data/cleaned_adult-all.csv')

In [4]:
# Encode the target column as integers: '>50K' -> 1, '<=50K' -> 0.
# The dtype guard makes the cell safe to re-run: mapping the already-encoded
# 0/1 values through the label dictionary would otherwise turn them all into NaN.
if not pd.api.types.is_numeric_dtype(df['gross_income']):
    _encoded_income = df['gross_income'].map({'>50K':1,'<=50K':0})
    # Series.map yields NaN for every value missing from the dictionary, so
    # fail loudly here rather than passing NaN targets on to the classifiers.
    if _encoded_income.isna().any():
        _unexpected = df.loc[_encoded_income.isna(),'gross_income'].unique().tolist()
        raise ValueError(f'Unexpected gross_income labels: {_unexpected}')
    df['gross_income'] = _encoded_income.astype('int64')


In [5]:
# Class balance of the target column: number of rows per label
df.loc[:,'gross_income'].value_counts()


gross_income
0    37109
1    11681
Name: count, dtype: int64

In [6]:
# Target vector
y = df['gross_income']
# Feature matrix: everything except the target and the two columns excluded from training
x = df.drop(['gross_income','fnlwgt','education_num'],axis=1)

In [7]:
#import libraries
import json
import os

import joblib
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV,KFold,cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures,OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

KeyboardInterrupt: 

In [None]:
#define the metrics
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
def metrics(y_pred,y_test):
    """Compute precision, recall, F1 and accuracy for a set of predictions.

    Parameters
    ----------
    y_pred : array-like
        Labels predicted by the model.
    y_test : array-like
        Ground-truth labels for the same samples.

    Returns
    -------
    tuple
        (PrecisionScore, RecallScore, F1Score, AccuracyScore), in that order.
    """
    # scikit-learn metric functions take (y_true, y_pred). Passing the
    # predictions first swaps the roles of the two arrays, which exchanges
    # the reported precision and recall values.
    PrecisionScore = precision_score(y_test,y_pred)
    RecallScore = recall_score(y_test,y_pred)
    F1Score = f1_score(y_test,y_pred)
    AccuracyScore = accuracy_score(y_test,y_pred)
    return PrecisionScore,RecallScore,F1Score,AccuracyScore

In [None]:
# Split features and target into train/test sets (test_size=0.2, seeded for repeatability)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

# Persist the feature splits as parquet files, without the index
for _features,_features_path in (
    (x_train,'data/x_train.parquet'),
    (x_test,'data/x_test.parquet'),
):
    _features.to_parquet(_features_path,index=False)

# The targets are Series, so wrap each one in a single-column frame before writing
for _target,_column,_target_path in (
    (y_train,'y_train','data/y_train.parquet'),
    (y_test,'y_test','data/y_test.parquet'),
):
    _target.to_frame(_column).to_parquet(_target_path,index=False)

In [None]:
# Column groups: each group is routed to its own preprocessing branch below.
low_card_cols = ['sex','relationship','race']
high_card_cols = ['education','occupation','workclass','native_country','marital_status']
numeric_cols = list(x_train.select_dtypes(include='number').columns)  # every numeric column of the training frame

# Numeric branch: standardise, then expand with polynomial features (no bias column)
num_pipe = Pipeline([
    ('scaler',StandardScaler()),
    ('poly',PolynomialFeatures(include_bias=False)),
])

# Low-cardinality branch: one-hot encode, with handle_unknown='ignore'
low_card_pipe = Pipeline([
    ('onehot',OneHotEncoder(handle_unknown='ignore')),
])

# High-cardinality branch: fill missing values with the most frequent value
# of the column, then target-encode the result
high_card_pipe = Pipeline([
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('target',TargetEncoder()),
])

# Apply each branch to its own column group
preprocessor = ColumnTransformer([
    ('num',num_pipe,numeric_cols),
    ('low_card',low_card_pipe,low_card_cols),
    ('high_card',high_card_pipe,high_card_cols),
])

# Candidate models and the hyper-parameter grid searched for each one.
# Grid keys follow the pipeline naming used further down: 'preprocessor__...'
# addresses a step inside the ColumnTransformer, 'classifier__...' addresses
# the estimator itself.
# Every estimator that accepts a seed is given random_state=42 (the same seed
# used for the data split and the CV folds) so repeated runs are comparable.
model_and_grid_params = {
    #Logistic Regression
    'Logistic Regression' : {
        'model': LogisticRegression(penalty='l2',solver='lbfgs',n_jobs=-1,verbose=2),
        'params' : {
            "preprocessor__num__poly__degree" : [1,2],
            "classifier__C": [0.1,1.0,10.0],
            "classifier__max_iter" : [1000,2000,3000]
        }
    },
    #Decision Trees
    'Decision Trees' : {
        'model' : DecisionTreeClassifier(random_state=42),
        'params': {
            "preprocessor__num__poly__degree" : [1,2],
            'classifier__max_depth' : [5,10,None],
            'classifier__min_samples_split' : [100,150,200]
        }
    },
    #Random Forest classifier
    'Random Forest Classifier' : {
        'model' : RandomForestClassifier(random_state=42),
        'params' : {
            "preprocessor__num__poly__degree" : [1,2],
            'classifier__n_estimators' : [80,100,120],
            'classifier__max_depth' : [5,10,None],
            'classifier__min_samples_split' : [100,150,200]
        }
    },
    #XGB Classifier
    'XGB Classifier' : {
        'model' : XGBClassifier(objective='binary:logistic',verbosity=1,random_state=42),
        'params' : {
            "preprocessor__num__poly__degree" : [1,2],
            'classifier__n_estimators' : [80,100,120],
            'classifier__learning_rate' : [0.1,0.5,1.0],
            'classifier__max_depth' : [3,5,8],
            'classifier__reg_lambda' : [0.1,0.5,1,10]
        }
    }
}

# Cross validation: 5 shuffled folds over the training data, seeded for repeatability
cv = KFold(n_splits=5,shuffle=True,random_state=42)

# Make sure the output directory exists before anything is written into it
os.makedirs('models',exist_ok=True)

results = {}
for name,model_grid in model_and_grid_params.items():
    print(f'Training {name}. This May Take A While...')
    # Preprocessing and classifier are wrapped in one pipeline so that the
    # preprocessing steps are re-fitted inside every cross-validation fold.
    pipe = Pipeline(steps=[
        ('preprocessor',preprocessor),
        ('classifier',model_grid['model'])
    ])

    model = GridSearchCV(
        estimator=pipe,
        param_grid=model_grid['params'],
        cv = cv,
        refit= True,
        scoring='f1_macro',
        n_jobs=-1,
        verbose=2,
        return_train_score=True
    )

    #Training
    model.fit(x_train,y_train)

    #Prediction on the held-out test set, using the refitted best estimator
    y_pred = model.predict(x_test)

    PrecisionScore,RecallScore,F1Score,AccuracyScore = metrics(y_pred,y_test)

    # store results in a dictionary
    # 'Best score' is the cross-validated 'f1_macro' score of the best
    # parameter combination; the remaining scores are measured on the test set.
    results[name] = {
        'Best estimator' : model.best_estimator_,
        'Best score' : model.best_score_,
        'Best params' : model.best_params_,
        'Best model' : model.best_estimator_,
        'Precision_score' : PrecisionScore,
        'recall_score' : RecallScore,
        'f1_score' : F1Score,
        'accuracy_score' : AccuracyScore
    }

    print('*'*50)

    # Save best model with joblib.
    # The path is built once, using double quotes for the f-string: reusing the
    # enclosing quote character inside a replacement field is a syntax error
    # on Python versions before 3.12.
    model_path = f"models/{name.replace(' ','_')}_best_model.pkl"
    joblib.dump(model.best_estimator_,model_path)
    print(f'Saved {name} best model to {model_path}')

# Save summary of all results.
# The fitted pipelines stored under 'Best estimator' / 'Best model' cannot be
# serialised by json (they are already saved as .pkl files above), so they
# are left out of the JSON summary; `results` itself keeps them.
estimator_keys = ('Best estimator','Best model')
results_summary = {
    model_name : {key : value for key,value in model_result.items() if key not in estimator_keys}
    for model_name,model_result in results.items()
}
with open('models/model_results.json','w') as file:
    json.dump(results_summary,file,indent=4)

# Print a short report for every trained model: the label shown on screen is
# paired with the key under which the value is stored in `results`.
_report_fields = (
    ('Best CV score : ','Best score'),
    ('Best Params : ','Best params'),
    ('Precision Score','Precision_score'),
    ('Recall Score: ','recall_score'),
    ('f1_score : ','f1_score'),
    ('Accuracy Score : ','accuracy_score'),
)
for name,result in results.items():
    print(f'Model Name : {name}')
    for _label,_key in _report_fields:
        print(_label,result[_key])
    print('-'*50)