In [1]:
# importing necessarily libraries for the binary classification task

# libraries imported for data processing and analysis
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

# libraries imported for learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import pipeline

# libraries imported for performance metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [6]:
# load 'Name-Gender' data and names into pandas dataframe

# load data by using read_csv from .data file
df = pd.read_csv("datasets/Name_Gender/name_gender.csv")

# clean data
# replace string label classifiers into binary values
df = df.replace(to_replace="M", value=1)
df = df.replace(to_replace="F", value=0)
# drop all samples with NaN entries
df = df.dropna()

# move binary classifier(label) column to the end
# hold column
classifier = df['Gender']
# drop column from dataframe
df.drop(columns=['Gender', 'Name'], inplace=True)
# reinsert into dataframe at the end
df['Gender'] = classifier

df

Unnamed: 0,Count,Probability,Gender
0,5304407,1.451679e-02,1
1,5260831,1.439753e-02,1
2,4970386,1.360266e-02,1
3,4579950,1.253414e-02,1
4,4226608,1.156713e-02,1
...,...,...,...
147264,1,2.736740e-09,1
147265,1,2.736740e-09,1
147266,1,2.736740e-09,1
147267,1,2.736740e-09,1


In [7]:
# pre-declared values/arrays/functions to be used once inside the trial loop
# C values for logistic regression regularization in range of 10(-8) to 10(4)
Cvals = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
# K values for k-nearest neighbors in range of 1 to 105 in steps of 4
Kvals = np.linspace(1, 105, num=26, dtype=int).tolist()
# max feature values for random forest similar to CNM06
max_features = [1, 2, 4, 6, 8, 12, 16, 20]
# max depth values for decision trees (shallower = better)
max_depths = np.linspace(1, 5, num=5, dtype=int).tolist()
# array of performance metrics
scoring = ['accuracy', 'f1_micro', 'roc_auc_ovr']

# build parameter grids to be passed into GridSearchCV
logreg_pgrid = {'classifier__penalty': ['l1','l2'], 'classifier__C': Cvals, 'classifier__max_iter': [5000]}
knn_pgrid = {'classifier__weights': ['distance'], 'classifier__n_neighbors': Kvals}
rforest_pgrid = {'classifier__n_estimators': [1024], 'classifier__max_features': max_features}
dtree_pgrid = {'classifier__max_depth': max_depths}

# arrays + dictionaries to store scores
score_dict = [{}, {}, {}, {}, {}]

# loop through this entire trial FIVE (5) times
for i in range(5):
    # slice the dataframe to not include the binary classifier (label)
    # last column is the label (income>50K)
    X, y = df.iloc[:,:-1], df.iloc[:,-1]

    # randomly pick 5000 samples with replacement for training set
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)

    # make pipeline for each algorithms to condense model call
    logreg = pipeline.Pipeline([('scale', StandardScaler()), ('classifier', LogisticRegression(n_jobs=-1))])
    knn = pipeline.Pipeline([('scale', StandardScaler()), ('classifier', KNeighborsClassifier(n_jobs=-1))])
    rforest = pipeline.Pipeline([('scale', StandardScaler()), ('classifier', RandomForestClassifier(n_jobs=-1))])
    dtree = pipeline.Pipeline([('scale', StandardScaler()), ('classifier', DecisionTreeClassifier())])

    # 5-fold cross validation using Stratified KFold
    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    # GridSearchCV classifier for each algorithm
    logreg_clf = GridSearchCV(estimator=logreg, param_grid=logreg_pgrid, scoring=scoring, 
                                n_jobs=-1, cv=k_fold, verbose=2, refit=False)
    knn_clf = GridSearchCV(estimator=knn, param_grid=knn_pgrid, scoring=scoring, 
                                n_jobs=-1, cv=k_fold, verbose=2, refit=False)
    rforest_clf = GridSearchCV(estimator=rforest, param_grid=rforest_pgrid, scoring=scoring, 
                                n_jobs=-1, cv=k_fold, verbose=2, refit=False)
    dtree_clf = GridSearchCV(estimator=dtree, param_grid=dtree_pgrid, scoring=scoring, 
                                n_jobs=-1, cv=k_fold, verbose=2, refit=False)



    # for each classifier
    for clf, clf_name in zip([logreg_clf, knn_clf, rforest_clf, dtree_clf], 
                ['LogReg', 'KNN', 'Ran_For', 'Dec_Tree']):
        # fit to training data of 5000 samples
        clf.fit(X_train, y_train)

        # get parameters for each scoring metric's best
        best_acc_param = clf.cv_results_['params'][ np.argmin(clf.cv_results_['rank_test_accuracy']) ]
        best_f1_param = clf.cv_results_['params'][ np.argmin(clf.cv_results_['rank_test_f1_micro']) ]
        best_roc_param = clf.cv_results_['params'][ np.argmin(clf.cv_results_['rank_test_roc_auc_ovr']) ]

        # get pipeline based on current classifier
        if (clf_name == 'LogReg'):
            pipe = logreg
        elif (clf_name == 'KNN'):
            pipe = knn
        elif (clf_name == 'Ran_For'):
            pipe = rforest
        elif (clf_name == 'Dec_Tree'):
            pipe = dtree

        # set pipeline parameters to the parameters for best accuracy
        pipe.set_params(**best_acc_param)
        # fit classifier with training data and new parameters for scoring metric
        pipe.fit(X_train, y_train)
        # get predictions for both training and testing data
        y_train_pred = pipe.predict(X_train)
        y_test_pred = pipe.predict(X_test)

        # get scores for all metrics from both training and testing data
        acc_train = accuracy_score(y_train, y_train_pred)
        f1_train = f1_score(y_train, y_train_pred)
        roc_auc_train = roc_auc_score(y_train, y_train_pred)

        acc_test = accuracy_score(y_test, y_test_pred)
        f1_test = f1_score(y_test, y_test_pred)
        roc_auc_test = roc_auc_score(y_test, y_test_pred)

        # store all scores into a dictionary for accuracy metric
        acc_dict = {'acc_train': acc_train, 'f1_train': f1_train, 'roc_auc_train': roc_auc_train, 
                    'acc_test': acc_test, 'f1_test': f1_test, 'roc_auc_test': roc_auc_test}

        
        # do ^^^^^ all that for f1 score
        pipe.set_params(**best_f1_param)
        pipe.fit(X_train, y_train)
        y_train_pred = pipe.predict(X_train)
        y_test_pred = pipe.predict(X_test)

        acc_train = accuracy_score(y_train, y_train_pred)
        f1_train = f1_score(y_train, y_train_pred)
        roc_auc_train = roc_auc_score(y_train, y_train_pred)

        acc_test = accuracy_score(y_test, y_test_pred)
        f1_test = f1_score(y_test, y_test_pred)
        roc_auc_test = roc_auc_score(y_test, y_test_pred)

        f1_dict = {'acc_train': acc_train, 'f1_train': f1_train, 'roc_auc_train': roc_auc_train, 
                    'acc_test': acc_test, 'f1_test': f1_test, 'roc_auc_test': roc_auc_test}


        # do ^^^^^ all that for roc_auc score
        pipe.set_params(**best_roc_param)
        pipe.fit(X_train, y_train)
        y_train_pred = pipe.predict(X_train)
        y_test_pred = pipe.predict(X_test)

        acc_train = accuracy_score(y_train, y_train_pred)
        f1_train = f1_score(y_train, y_train_pred)
        roc_auc_train = roc_auc_score(y_train, y_train_pred)

        acc_test = accuracy_score(y_test, y_test_pred)
        f1_test = f1_score(y_test, y_test_pred)
        roc_auc_test = roc_auc_score(y_test, y_test_pred)

        roc_auc_dict = {'acc_train': acc_train, 'f1_train': f1_train, 'roc_auc_train': roc_auc_train, 
                    'acc_test': acc_test, 'f1_test': f1_test, 'roc_auc_test': roc_auc_test}

        # build final dictionary to store all scores from all three models and their best parameters
        score_dict[i][clf_name] = {'acc_dict': acc_dict, 'f1_dict': f1_dict, 'roc_auc_dict': roc_auc_dict}

Fitting 5 folds for each of 26 candidates, totalling 130 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 130 out of 130 | elapsed:    0.3s finished
Fitting 5 folds for each of 26 candidates, totalling 130 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 115 out of 130 | elapsed:    1.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 130 out of 130 | elapsed:    1.2s finished
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   10.7s finished
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend

In [8]:
print(score_dict)

[{'LogReg': {'acc_dict': {'acc_train': 0.6058, 'f1_train': 0.0, 'roc_auc_train': 0.5, 'acc_test': 0.6095495153547154, 'f1_test': 0.0, 'roc_auc_test': 0.5}, 'f1_dict': {'acc_train': 0.6058, 'f1_train': 0.0, 'roc_auc_train': 0.5, 'acc_test': 0.6095495153547154, 'f1_test': 0.0, 'roc_auc_test': 0.5}, 'roc_auc_dict': {'acc_train': 0.6058, 'f1_train': 0.0, 'roc_auc_train': 0.5, 'acc_test': 0.6095495153547154, 'f1_test': 0.0, 'roc_auc_test': 0.5}}, 'KNN': {'acc_dict': {'acc_train': 0.679, 'f1_train': 0.38054805094558086, 'roc_auc_train': 0.6040994050577212, 'acc_test': 0.5827903478621485, 'f1_test': 0.16681639528354855, 'roc_auc_test': 0.49727467092731326}, 'f1_dict': {'acc_train': 0.679, 'f1_train': 0.38054805094558086, 'roc_auc_train': 0.6040994050577212, 'acc_test': 0.5827903478621485, 'f1_test': 0.16681639528354855, 'roc_auc_test': 0.49727467092731326}, 'roc_auc_dict': {'acc_train': 0.679, 'f1_train': 0.38054805094558086, 'roc_auc_train': 0.6040994050577212, 'acc_test': 0.5825232482128925