In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the income dataset; the path is relative to the notebook's working directory.
income = pd.read_csv("../Data/income.csv")

# Preview the first five rows to check the columns and category labels.
# NOTE(review): the preview includes an "Unnamed: 0" column - presumably a row
# index that was saved with the CSV; confirm, and consider index_col=0 if so.
income.head()

Unnamed: 0,age,JobType,EdType,maritalstatus,occupation,relationship,race,gender,capitalgain,capitalloss,hoursperweek,nativecountry,SalStat
0,45,Private,HS-grad,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,28,United-States,"less than or equal to 50,000"
1,24,Federal-gov,HS-grad,Never-married,Armed-Forces,Own-child,White,Male,0,0,40,United-States,"less than or equal to 50,000"
2,44,Private,Some-college,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,"greater than 50,000"
3,27,Private,9th,Never-married,Craft-repair,Other-relative,White,Male,0,0,40,Mexico,"less than or equal to 50,000"
4,20,Private,Some-college,Never-married,Sales,Not-in-family,White,Male,0,0,35,United-States,"less than or equal to 50,000"


In [59]:
def income_model_prep(data, test_size=0.2, random_state=0):
    """Encode the income data and split it into train and test sets.

    Steps:
      1. Build a binary ``target`` from ``SalStat``: 0 for the
         " less than or equal to 50,000" label, 1 for every other value.
      2. Fold two categories into others: occupation " Armed-Forces" -> " ?"
         and JobType " Never-worked" -> " Without-pay".
      3. Drop ``SalStat``, ``nativecountry`` and ``race``.
      4. One-hot encode the remaining non-numeric columns, dropping the first
         level of each (``drop_first=True``).
      5. Split into train and test sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw income data. Must contain the columns ``SalStat``, ``occupation``,
        ``JobType``, ``nativecountry`` and ``race`` (a missing one raises
        ``KeyError``). The input frame is not modified.
    test_size : float or int, default 0.2
        Passed through to ``train_test_split``.
    random_state : int, default 0
        Passed through to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames and target series for the train and test partitions.
    """
    from sklearn.model_selection import train_test_split

    # The category literals are matched with their leading space, the same
    # convention used for the occupation / JobType labels below.
    # NOTE(review): any SalStat value that is not exactly this string (including
    # a missing value) is encoded as 1 - confirm the column has only two labels.
    low_income_label = " less than or equal to 50,000"

    # "nativecountry" is dropped below, so it is not recoded first: any
    # replacement made on it would be discarded before encoding.
    prepared = data.assign(
        target=np.where(data["SalStat"] == low_income_label, 0, 1),
        occupation=data["occupation"].replace({" Armed-Forces": " ?"}),
        JobType=data["JobType"].replace({" Never-worked": " Without-pay"}),
    ).drop(["SalStat", "nativecountry", "race"], axis=1)

    # NOTE(review): every remaining column becomes a feature. If the frame still
    # carries a saved row-index column (e.g. "Unnamed: 0"), it is kept as a
    # feature too - consider removing it before calling this function.
    encoded = pd.get_dummies(prepared, drop_first=True)

    X = encoded.drop("target", axis=1)
    y = encoded["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


# Encode the raw income frame and split it into train/test features and targets.
X_train, X_test, y_train, y_test = income_model_prep(income)

## Assignment 1: Simple Ensemble

1. Use scikit-learn's `VotingClassifier` to create an ensemble of the three models below.
2. Compare the accuracy of hard vs. soft voting in this ensemble.

In [3]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both splits so the test set stays unseen.
std = StandardScaler().fit(X_train)

X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

In [5]:
from sklearn.linear_model import LogisticRegression

# The "saga" solver shuffles the data while optimising, so a fixed
# random_state is needed for the fitted model (and the scores printed below)
# to be reproducible on a fresh Restart & Run All.
logreg = LogisticRegression(
    C=0.1, penalty="l2", solver="saga", max_iter=1000, random_state=0
)

# fit() returns the estimator itself, so `lr` and `logreg` are the same fitted model.
lr = logreg.fit(X_train_std, y_train)

print(f"Train Accuracy: {lr.score(X_train_std, y_train)}")
print(f"Test Accuracy: {lr.score(X_test_std, y_test)}")

Train Accuracy: 0.8522398561488547
Test Accuracy: 0.8567854909318324


In [6]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 25-neighbour classifier on the standardized training features;
# fit() returns the estimator, so construction and fitting can be chained.
knn = KNeighborsClassifier(n_neighbors=25).fit(X_train_std, y_train)

# Mean accuracy on each split.
knn_train_accuracy = knn.score(X_train_std, y_train)
knn_test_accuracy = knn.score(X_test_std, y_test)

print(f"Train Accuracy: {knn_train_accuracy}")
print(f"Test Accuracy: {knn_test_accuracy}")

Train Accuracy: 0.8412164803377374
Test Accuracy: 0.8281738586616635


In [7]:
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree reproducible: scikit-learn
# permutes the candidate features at every split, so ties between equally
# good splits can otherwise be resolved differently from run to run.
dt = DecisionTreeClassifier(
    criterion='gini', max_depth=10, min_samples_leaf=25, random_state=0
)

# Trained on the same standardized features as the other two models so that
# all three can later share a single input matrix.
dt.fit(X_train_std, y_train)

print(f"Train Accuracy: {dt.score(X_train_std, y_train)}")
print(f"Test Accuracy: {dt.score(X_test_std, y_test)}")

Train Accuracy: 0.8614650926432648
Test Accuracy: 0.8610068792995622


## Assignment 2: Random Forest

1. Fit a Random Forest model using the default hyperparameters. Only set a random state.
2. Then, perform hyperparameter tuning for your Random Forest.
3. Compare your tuned model's AUC score with that of your untuned model.

## Assignment 3: Gradient Boosting

1. Fit and tune a gradient boosting machine (GBM).
2. Plot feature importance for your tuned GBM - could we remove any features?
3. Plot the tuned GBM's ROC curve against your tuned Random Forest's and report the AUC for each.