In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, RocCurveDisplay
import matplotlib.pyplot as plt

In [2]:
# Allow up to 500 rows to be rendered before pandas truncates a displayed frame.
pd.options.display.max_rows = 500

In [3]:
# Column labels for the CSV, in file order. They are passed through `names`,
# so pandas uses them as the header instead of reading one from the file.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Load the raw table from the lecture 3 data directory (path is relative to
# the notebook's working directory).
df = pd.read_csv('../lecture_03/lecture_03_data.csv', names=column_names)

In [4]:
# Columns routed through the numeric preprocessing branch.
num_vars = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Columns routed through the categorical (one-hot) preprocessing branch.
cat_vars = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Name of the label column to predict.
target = 'income'

In [5]:
# Feature matrix: numeric columns first, then categorical ones.
X = df[num_vars + cat_vars]

# Encode the label as 0/1 with an explicit mapping. `.map` is used instead of
# `.replace` because `.replace` on an object column relies on pandas silently
# downcasting the result to int, which is deprecated (FutureWarning in
# pandas 2.2); without the downcast the labels stay object dtype.
# The keys keep the leading space exactly as the values appear in the file.
income_codes = {' <=50K': 0, ' >50K': 1}
y_mapped = df[target].map(income_codes)

# `.map` turns any label missing from `income_codes` into NaN; fail loudly
# rather than passing a partially encoded target to the model.
if y_mapped.isna().any():
    unexpected = df.loc[y_mapped.isna(), target].unique().tolist()
    raise ValueError(f"Unexpected values in {target!r} column: {unexpected}")
y = y_mapped.astype('int64')

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [6]:
# Numeric branch: fill missing values with the column median.
numeric_tx = Pipeline([('imputer', SimpleImputer(strategy='median'))])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes categories unseen during
# fit encode as all zeros instead of raising at transform time.
cat_tx = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [7]:
# Send each column group through its own sub-pipeline and concatenate the
# results: (step name, transformer, columns it applies to).
column_routes = [
    ('numeric', numeric_tx, num_vars),
    ('categorical', cat_tx, cat_vars),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [25]:
# Alternative grid kept for reference (tunes learning_rate, so it is not
# applicable to a random forest):
# param_grid = {'learning_rate': [0.05, 0.01], 'n_estimators': [10, 800]}

# Grid searched for the random forest: two leaf sizes at a fixed 500 trees.
param_grid = {
    'min_samples_leaf': [1, 5],
    'n_estimators': [500],
}

In [26]:
# Base estimator for the grid search. The fixed seed makes the forest (and
# therefore the CV scores) reproducible across Restart & Run All.
rf = RandomForestClassifier(random_state=4)

# Alternative base estimator; if enabled, pass it to GridSearchCV below and
# switch to a grid that matches its parameters.
# gbm = GradientBoostingClassifier()

# Search `param_grid` with cross-validation, ranking candidates by ROC AUC.
# The search must wrap `rf`: `gbm` is not assigned anywhere in this notebook
# (its definition above is commented out), so referencing it raised NameError
# on a fresh kernel.
clf = GridSearchCV(rf, param_grid, scoring='roc_auc')

In [27]:
# Chain preprocessing with the grid-searched classifier.
# NOTE(review): the preprocessor is fitted once on all of X_train before
# GridSearchCV makes its folds, so imputation/encoding statistics are shared
# across CV folds — confirm this is acceptable for the reported CV scores.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', clf),
])

# Fit on the training split, then get per-class probabilities for the test split.
model_pipeline = pipeline.fit(X_train, y_train)
preds = model_pipeline.predict_proba(X_test)

In [28]:
# ROC AUC on the held-out split, scored with the second probability column
# (the probability of the class encoded as 1).
test_auc = roc_auc_score(y_test, preds[:, 1])
print(test_auc)

0.923163616316264


In [29]:
# Tabulate the grid search's per-candidate timings and fold scores; the bare
# name on the last line makes the notebook render the frame.
cv_results = pd.DataFrame(clf.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,13.519662,0.341735,0.044872,0.00077,1,500,"{'min_samples_leaf': 1, 'n_estimators': 500}",0.925805,0.926654,0.924298,0.932721,0.933085,0.928513,0.003665,2
1,13.340169,0.341372,0.044371,0.001228,5,500,"{'min_samples_leaf': 5, 'n_estimators': 500}",0.92623,0.927232,0.924555,0.933845,0.933226,0.929018,0.003792,1
