# Modeling

Time to model our data, we're going to be using a lot of different classifiers and trying to find which ones will be the best

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, f1_score

## Reading in the data

In [7]:
# Reading in data with dummies and Lasso CV
dummy_income = pd.read_csv('./data/train_modeling.csv')

In [8]:
orig_income = pd.read_csv('./data/train_cleaned.csv')

In [9]:
# Selecting Features
features = dummy_income._get_numeric_data().columns
X = dummy_income[features]
y = orig_income['wage']

In [10]:
features

Index(['marital-status_Married-civ-spouse', 'education-num', 'capital-gain',
       'native-country_United-States', 'occupation_Exec-managerial',
       'capital-loss', 'workclass_unknown', 'hours-per-week',
       'workclass_Self-emp-not-inc', 'workclass_Private', 'age',
       'native-country_Mexico', 'native-country_unknown',
       'workclass_Local-gov', 'education_HS-grad', 'occupation_Prof-specialty',
       'workclass_State-gov', 'native-country_South',
       'occupation_Farming-fishing', 'education_Assoc-acdm', 'sex_1'],
      dtype='object')

## Our baseline model

In [11]:
y.value_counts(normalize=True)

0    0.75919
1    0.24081
Name: wage, dtype: float64

Our baseline model has an accuracy of ~76%, this is our score to beat!

In [12]:
def model_score_classification(X, y, models: list):
    # Split data into training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
    
    
    # Creating empty df to add to later
    models_df = pd.DataFrame(columns=['model', 
                                      'parameters', 
                                      'train_accuracy',
                                      'train_f1',
                                      'train_spec',
                                      'train_sens',
                                      'test_accuracy',
                                      'test_f1',
                                      'test_spec',
                                      'test_sens'])
    
    for model in models:
        # Create a pipeline to scale data and pass through model
        pipe = Pipeline([
            ('sc', StandardScaler()),
            ('model', model) # Thanks Lisa Tagliaferri from Digitalocean.com https://www.digitalocean.com/community/tutorials/how-to-use-args-and-kwargs-in-python-3
        ])

        # Fitting the model
        pipe.fit(X_train, y_train)
        
        # Predictions
        y_train_preds = pipe.predict(X_train)
        y_test_preds = pipe.predict(X_test)
        
        # Scoring the models
        train_score = pipe.score(X_train, y_train)
        train_f1 = f1_score(y_train, y_train_preds)
        test_score = pipe.score(X_test, y_test)
        test_f1 = f1_score(y_test, y_test_preds)
        
        # Calculate train specificity and sensitivity
        tn, fn, fp, tp = confusion_matrix(y_train, pipe.predict(X_train)).ravel()
        train_spec = tn / (tn + fp)
        train_sens = tp / (tp + fn)
        
        # Calculate test specificity and sensitivity
        tn, fn, fp, tp = confusion_matrix(y_test, pipe.predict(X_test)).ravel()
        test_spec = tn / (tn + fp)
        test_sens = tp / (tp + fn)
        
        # Returning a dictionary of the information
        model_row = {'model' : type(model).__name__, # Thanks Jonathan from Stack Overflow! https://stackoverflow.com/questions/52763325/how-to-obtain-only-the-name-of-a-models-object-in-scikitlearn
                     'parameters' : model.get_params(),
                     'train_accuracy' : train_score,
                     'train_f1' : train_f1,
                     'train_spec' : train_spec,
                     'train_sens' : train_sens,
                     'test_accuracy': test_score,
                     'test_f1': test_f1,
                     'test_spec' : test_spec,
                     'test_sens' : test_sens}
        
        # Add new row to models_df
        models_df = models_df.append(model_row, ignore_index=True)
        
        print(f'Done with {model}, moving on')
        
    return models_df

In [13]:
classification_models = [LogisticRegression(n_jobs=12),
                        DecisionTreeClassifier(max_depth=6),
                        BaggingClassifier(base_estimator = DecisionTreeClassifier(max_depth=6), 
                                          n_estimators=500, 
                                          n_jobs=12),
                        RandomForestClassifier(max_depth=6, 
                                               n_estimators=1000, 
                                               n_jobs=12, 
                                               random_state=42),
                        AdaBoostClassifier(n_estimators=350, 
                                           random_state=42),
                        VotingClassifier([
                                        ('logreg', LogisticRegression(n_jobs=12)),
                                         ('dt', DecisionTreeClassifier(max_depth=6)),
                                         ('bc', BaggingClassifier(base_estimator = DecisionTreeClassifier(max_depth=6), 
                                                           n_estimators=500, 
                                                           n_jobs=12)),
                                        ('rfc', RandomForestClassifier(max_depth=6, 
                                                               n_estimators=1000, 
                                                               n_jobs=12, 
                                                               random_state=42)),
                                        ('ab', AdaBoostClassifier(n_estimators=500, 
                                                           random_state=42))]),                                        
                        SVC(C=10, random_state=42)
                        ]

In [14]:
gen_model_df = model_score_classification(X, y, classification_models)

Done with LogisticRegression(n_jobs=12), moving on
Done with DecisionTreeClassifier(max_depth=6), moving on
Done with BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=6),
                  n_estimators=500, n_jobs=12), moving on
Done with RandomForestClassifier(max_depth=6, n_estimators=1000, n_jobs=12,
                       random_state=42), moving on
Done with AdaBoostClassifier(n_estimators=350, random_state=42), moving on
Done with VotingClassifier(estimators=[('logreg', LogisticRegression(n_jobs=12)),
                             ('dt', DecisionTreeClassifier(max_depth=6)),
                             ('bc',
                              BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=6),
                                                n_estimators=500, n_jobs=12)),
                             ('rfc',
                              RandomForestClassifier(max_depth=6,
                                                     n_estimators=1000,
      

In [15]:
gen_model_df

Unnamed: 0,model,parameters,train_accuracy,train_f1,train_spec,train_sens,test_accuracy,test_f1,test_spec,test_sens
0,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.847625,0.645855,0.874305,0.733463,0.848913,0.652542,0.877305,0.731013
1,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.855405,0.640976,0.866657,0.797167,0.857266,0.649366,0.869751,0.794682
2,BaggingClassifier,"{'base_estimator__ccp_alpha': 0.0, 'base_estim...",0.857494,0.646987,0.868317,0.801861,0.858494,0.654262,0.871473,0.794461
3,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.855201,0.620112,0.857333,0.842136,0.85788,0.631176,0.860712,0.841121
4,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.866544,0.688462,0.885081,0.786245,0.868198,0.696636,0.889092,0.78123
5,VotingClassifier,"{'estimators': [('logreg', LogisticRegression(...",0.858518,0.642303,0.865372,0.821069,0.860214,0.650706,0.868479,0.816641
6,SVC,"{'C': 10, 'break_ties': False, 'cache_size': 2...",0.868018,0.692256,0.886222,0.789416,0.850141,0.650029,0.874981,0.742464


Looks like all these models perform fairly well before GridSearching, so let's break these up and use `GridSearchCV` to find the best ones!

## Gridsearching through `AdaBoostClassifier`, `SVC`, and `GradientNB`

Thanks Eric Heidbreder!

In [17]:
# Need to split my data!
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [18]:
# Creating AdaBoost Pipeline
pipe_ab = Pipeline([
    ('sc', StandardScaler()),
    ('ab', AdaBoostClassifier()),
])

In [19]:
# Creating SVC Pipeline
pipe_svc = Pipeline([
    ('sc', StandardScaler()),
    ('svc', SVC())
])

In [20]:
# Gaussian Naive Bayes
pipe_gb = Pipeline([
    ('sc', StandardScaler()),
    ('gb', GaussianNB())
])

In [21]:
# Creating GaussianNB params
params_gb = {}

In [22]:
# Creating AdaBoost Params
params_ab = {
    'ab__n_estimators' : [2500, 3000],
    'ab__random_state' : [42]
}

In [23]:
# Creating SVC Params
params_svc = {
    'svc__C': [10],
    'svc__degree': [2]
}

In [24]:
# Creating AdaBoost GridSearch
grid_ab = GridSearchCV(pipe_ab, params_ab, cv=5, verbose=2, n_jobs=-1)

In [25]:
# Creating SVC GridSearch
grid_svc = GridSearchCV(pipe_svc, params_svc, cv=5, verbose=2, n_jobs=-1)

In [26]:
# Creating GaussianNB GridSearch
grid_gb = GridSearchCV(pipe_gb,
                     param_grid = params_gb,
                     cv = 5,
                     verbose = 1,
                     scoring = 'accuracy')

In [27]:
# Commented out so the .csv doesn't get overwritten
model_params = {}
count = 0

In [28]:
# Uncomment if you really want to run this GridSearch again, it will take awhile
grid_ab.fit(X_train, y_train)
grid_svc.fit(X_train, y_train)
grid_gb.fit(X_train, y_train)

# Create a new dictionary entry with the best params used in the GridSearch Pipeline
grid_ab.best_params_['best_params'] = grid_ab.best_params_
grid_svc.best_params_['best_params'] = grid_svc.best_params_
grid_gb.best_params_['best_params'] = grid_gb.best_params_

# Create a new dictionary entry with the model used in the GridSearch Pipeline
grid_ab.best_params_['model'] = grid_ab.estimator[1]
grid_svc.best_params_['model'] = grid_svc.estimator[1]
grid_gb.best_params_['model'] = grid_gb.estimator[1]

# Create a new dictionary entry with the cv score from the GridSearch
grid_ab.best_params_['cv_score'] = grid_ab.best_score_
grid_svc.best_params_['cv_score'] = grid_svc.best_score_
grid_gb.best_params_['cv_score'] = grid_gb.best_score_

# Create a new dictionary entry with the train score from the GridSearch
grid_ab.best_params_['train_score'] = grid_ab.score(X_train, y_train)
grid_svc.best_params_['train_score'] = grid_svc.score(X_train, y_train)
grid_gb.best_params_['train_score'] = grid_gb.score(X_train, y_train)

# Create a new dictionary entry with the test score from the GridSearch
grid_ab.best_params_['test_score'] = grid_ab.score(X_test, y_test)
grid_svc.best_params_['test_score'] = grid_svc.score(X_test, y_test)
grid_gb.best_params_['test_score'] = grid_gb.score(X_test, y_test)

# Add each of these entries to the list
count += 1
model_params[f'model_{count}'] = grid_ab.best_params_
count += 1
model_params[f'model_{count}'] = grid_svc.best_params_
count += 1
model_params[f'model_{count}'] = grid_gb.best_params_

# Create a DataFrame from the dictionary we created above
model_df = pd.DataFrame.from_dict(model_params, orient='index')

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   45.0s remaining:  1.8min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   50.9s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.6s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [29]:
model_df

Unnamed: 0,ab__n_estimators,ab__random_state,best_params,model,cv_score,train_score,test_score,svc__C,svc__degree
model_1,3000.0,42.0,"{'ab__n_estimators': 3000, 'ab__random_state':...",AdaBoostClassifier(),0.866585,0.868796,0.87262,,
model_2,,,"{'svc__C': 10, 'svc__degree': 2, 'best_params'...",SVC(),0.84783,0.868182,0.852844,10.0,2.0
model_3,,,{'best_params': {'best_params': {'best_params'...,GaussianNB(),0.818428,0.819943,0.825574,,


## GridSearching through `LogisticRegression`

Thanks Irene Anibogwu

In [30]:
lr_pipe = Pipeline([ 
    ('sc', StandardScaler()),
    ('lr', LogisticRegression(solver = 'liblinear')),  
])

lr_pipe.fit(X_train, y_train)

print(f'Training Accuracy: {lr_pipe.score(X_train, y_train)}')
print(f'Testing Accuracy: {lr_pipe.score(X_test, y_test)}')

Training Accuracy: 0.8466420966420967
Testing Accuracy: 0.8545633214592802


## GridSearching through `DecisionTreeClassifier` 

Thanks Irene Anibogwu!

In [31]:
# grid search for decision trees to find best estimator and params
gridcv = GridSearchCV(estimator = DecisionTreeClassifier(),
                    param_grid = {'max_depth': [3, 5, 7, 10],
                                  'min_samples_split': [5, 10, 15, 20],
                                  'min_samples_leaf': [2, 3, 4, 5, 6, 7]},
                    cv = 5,
                    verbose = 1,
                    n_jobs=-1)
gridcv.fit(X_train, y_train)
gridcv.best_estimator_
gridcv.best_params_

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 305 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 449 out of 480 | elapsed:    2.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:    2.2s finished


{'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 5}

In [32]:
# Instantiate model w/ best parameters.
dt = DecisionTreeClassifier(max_depth = 7,
                            min_samples_split = 20,
                            min_samples_leaf = 4,
                            random_state = 42)
# Fit model.
dt.fit(X_train, y_train)
# Evaluate model.
print(f'Score on training set: {dt.score(X_train, y_train)}')
print(f'Score on testing set: {dt.score(X_test, y_test)}')

Score on training set: 0.8572481572481573
Score on testing set: 0.8595995577938829


## GridSearching through `RandomForestClassifier`

Thanks Josh Mizraji!

In [33]:
model_params = {}
count = 0

In [34]:
#Scaffolding
params_rf = {
    'n_estimators' : [55,60,70], #number of trees
    'max_features' : [None], 
    'max_depth' : [7,8,9]
}
#Instantiate Gridsearch
gs_rf = GridSearchCV(RandomForestClassifier(), 
                 param_grid=params_rf,
                 cv=5,
                n_jobs=-1)
#Fit
gs_rf.fit(X_train, y_train)
#this takes the best params dictionary and adds a column called score
gs_rf.best_params_['score'] = gs_rf.best_score_
#make a counter
count +=1
#create new column with best params
model_params[f'model_{count}'] = gs_rf.best_params_
#orient sideways
model_df = pd.DataFrame.from_dict(model_params, orient='index')
model_df
#adapted from DSI lesson 

Unnamed: 0,max_depth,max_features,n_estimators,score
model_1,9,,55,0.857985


## GridSearching through `BaggingClassifier`

Thanks Juhee Sung-Schenck!

In [35]:
# Building pipeline for BaggingClassifier
pipe_bag = Pipeline([
    ('sc', StandardScaler()),
    ('bag', BaggingClassifier(base_estimator = DecisionTreeClassifier(max_depth = 6)))
])

params = {
    'bag__n_estimators': [1000],
    'bag__max_samples': [300],
    'bag__max_features': [18]
}

gs_bag = GridSearchCV(pipe_bag,
                      param_grid = params,
                      cv = 3,
                      verbose = 1,
                     n_jobs=-1)

# fit
gs_bag.fit(X_train, y_train)

# score
print(f'Training Score: {gs_bag.score(X_train, y_train)}')
print(f'TestingScore: {gs_bag.score(X_test, y_test)}')

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    5.0s finished


Training Score: 0.8540540540540541
TestingScore: 0.8578798673381648


## Running `VotingClassifier` on our best models as determined by `GridSearchCV`

Thanks Eric Heidbreder!

In [36]:
vc = VotingClassifier([
    ('ab', AdaBoostClassifier(n_estimators=2500, random_state=42)),
    ('bag', BaggingClassifier(n_estimators=2000,
                             max_samples=300,
                             max_features=len(features))),
    ('rf', RandomForestClassifier(max_depth=9,
                                 n_estimators=70)),
    ('dt', DecisionTreeClassifier(max_depth = 7,
                                  min_samples_split = 20,
                                  min_samples_leaf = 4,
                                  random_state = 42)),
], n_jobs=12)

In [37]:
vc.fit(X_train, y_train)

VotingClassifier(estimators=[('ab',
                              AdaBoostClassifier(n_estimators=2500,
                                                 random_state=42)),
                             ('bag',
                              BaggingClassifier(max_features=21,
                                                max_samples=300,
                                                n_estimators=2000)),
                             ('rf',
                              RandomForestClassifier(max_depth=9,
                                                     n_estimators=70)),
                             ('dt',
                              DecisionTreeClassifier(max_depth=7,
                                                     min_samples_leaf=4,
                                                     min_samples_split=20,
                                                     random_state=42))],
                 n_jobs=12)

In [38]:
# Training Score
vc.score(X_train, y_train)

0.864004914004914

In [39]:
# Testing Score
vc.score(X_test, y_test)

0.8638987839331778

## Prepping test data

In [43]:
# Reading in test data
income_test = pd.read_csv('./data/test_data.csv')

In [44]:
# Replace the ? with 'unknown'
income_test.replace(' ?', "unknown", inplace = True )
cat_columns_test = income_test.drop(columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'], axis = 1).columns

# Stripping whitespace from beginning of each value
for column in cat_columns_test:
    income_test[column] = income_test[column].apply(lambda x: x.strip())

### Binarize the `sex` column

In [3]:
income_test['sex'] = np.where(income_test['sex'] == 'Male', 1, 0)

### Converting `fnlwgt` to a sample weight

In [4]:
# Creating a sample weight column, thanks Eric Heidbreder!
income_test['smpl_wgt'] = income_test['fnlwgt'].apply(lambda x: x / income_test['fnlwgt'].sum())

We tested the sample weight out with our models later, and it wasn't very good, so we abandoned this column eventually.

### Creating log_age column

Age benefited from a log transform, which converted it to a more normal distribution

In [5]:
# Log age, thanks Eric Heidbreder!
income_test['log_age'] = np.log(income_test['age'])

### Dummifying Features

In [57]:
# Dummifying Features, thanks Juhee Sung-Schenck!
income_test_d = pd.get_dummies(columns = ['workclass', 'education', 'marital-status', 'occupation', 'sex', 'native-country'], data=income_test, drop_first = True)

## Getting predictions for Testing Data

In [70]:
# Selecting Features
features = [column for column in dummy_income.columns if column in income_test_d.columns] # Selects only the columns that are in income_test
X = dummy_income[features]
y = orig_income['wage']

In [71]:
ab = AdaBoostClassifier(n_estimators=2500, random_state=42)

In [72]:
ab.fit(X, y) # Training the model on the full dataset

AdaBoostClassifier(n_estimators=2500, random_state=42)

In [73]:
preds = ab.predict(income_test_d[features])

In [74]:
preds.shape

(16281,)

In [75]:
preds_df = pd.DataFrame(preds, columns=['wage'])

In [69]:
preds_df.to_csv('./data/submission.csv', index=False)