In [1]:
# import libraries
from numpy import mean
from numpy import std
from numpy import hstack
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split

# reading csv files
df =  pd.read_csv('adult.data', sep=",", header=None, skipinitialspace=True)
df2 = pd.read_csv('adult.test', sep=",", header=None, skipinitialspace=True)

# Join the data and test files together
df = pd.concat([df, df2])

# Shuffle the rows
df = df.sample(frac=1).reset_index(drop=True)

# Replace all of ? with None
df = df.replace(['?'], [None])
# Drop all rows with None in them
df = df.dropna(axis=0)

# Check no None values remain
df.isnull().sum()

#Adding column headers to our data 
df.columns = ["Age", "Workclass", "Fnlwgt", "Education", "Education-num", "Marital-status", "Occupation", "Relationship", "Race", "Sex", "Capital-gain", "Capital-loss", "Hours-per-week", "Native-country", "Income"]
# Workclass, Fnlwgt, Race and Native-country are not worth using.
# Education = Education num, so drop Education
df = df.drop(columns=['Workclass', 'Race', 'Fnlwgt', 'Native-country', 'Education'])


In [2]:
#Numerically encoding occupation: Occupaiton is grouped into white collar and blue collar 
occupation_mapping_dict = {
    "Tech-support" : 0,
    "Craft-repair" : 1, 
    "Other-service" : 1, #Wasn't sure about blue or white collar for other services 
    "Sales" : 0, 
    "Exec-managerial" : 0, 
    "Prof-specialty" : 0, 
    "Handlers-cleaners" : 1, 
    "Machine-op-inspct" : 1, 
    "Adm-clerical" : 0, 
    "Farming-fishing" : 1, 
    "Transport-moving" : 1, 
    "Priv-house-serv" : 1, 
    "Protective-serv" : 1, 
    "Armed-Forces" : 1
    }

df["Occupation"] = df["Occupation"].map(occupation_mapping_dict)


#Numerically encoding the sex variable 
sex_mapping_dict = {
    "Male" : 0,
    "Female" : 1
    }

df["Sex"] = df["Sex"].map(sex_mapping_dict)


#Encoding income variable
income_mapping_dict = {
    "<=50K" : 0,
    ">50K" : 1, 
    "<=50K." : 0, 
    ">50K." : 1
    }

df["Income"] = df["Income"].map(income_mapping_dict)


# FOR MODELS
# Group ages into discrete bins for models
bins = [10,20,30,40,50,60,70,80,90]
names = ['0', '1', '2', '3', '4', '5', '6', '7']
df['Age'] = pd.cut(df['Age'], bins, labels = names)



df

Unnamed: 0,Age,Education-num,Marital-status,Occupation,Relationship,Sex,Capital-gain,Capital-loss,Hours-per-week,Income
0,2,14,Never-married,0,Not-in-family,0,6849,0,40,0
1,0,10,Never-married,1,Own-child,0,0,0,20,0
2,2,13,Widowed,1,Not-in-family,1,0,0,30,0
3,2,10,Married-civ-spouse,1,Husband,0,0,0,32,0
4,2,9,Separated,1,Unmarried,1,0,0,40,0
...,...,...,...,...,...,...,...,...,...,...
48836,3,11,Married-civ-spouse,0,Husband,0,0,0,50,0
48838,1,9,Never-married,1,Other-relative,0,0,0,40,0
48839,3,13,Never-married,0,Not-in-family,1,0,1741,40,0
48840,0,5,Never-married,1,Own-child,0,0,0,16,0


In [3]:
#Will now employ one-hot encoding for :  Marital Status and Relationship ; no order in their values
df = pd.get_dummies(df, columns = ['Relationship', 'Marital-status'])

In [4]:
df['Capital-gain'].value_counts()
# 229 values have 99,999 which is ALOT. Is this cap value? or error?

0        41432
15024      498
7688       391
7298       351
99999      229
         ...  
7262         1
1731         1
22040        1
1639         1
2387         1
Name: Capital-gain, Length: 121, dtype: int64

In [5]:
# Will now do feature scaling on Capital-gaine and Capital-loss
col_names = ['Capital-gain', 'Capital-loss']
features = df[col_names]

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df[col_names] = scaler.fit_transform(features.values)
df


Unnamed: 0,Age,Education-num,Occupation,Sex,Capital-gain,Capital-loss,Hours-per-week,Income,Relationship_Husband,Relationship_Not-in-family,...,Relationship_Own-child,Relationship_Unmarried,Relationship_Wife,Marital-status_Divorced,Marital-status_Married-AF-spouse,Marital-status_Married-civ-spouse,Marital-status_Married-spouse-absent,Marital-status_Never-married,Marital-status_Separated,Marital-status_Widowed
0,2,14,0,0,0.765695,-0.218780,40,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0,10,1,0,-0.146733,-0.218780,20,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,2,13,1,1,-0.146733,-0.218780,30,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,2,10,1,0,-0.146733,-0.218780,32,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,2,9,1,1,-0.146733,-0.218780,40,0,0,0,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,3,11,0,0,-0.146733,-0.218780,50,0,1,0,...,0,0,0,0,0,1,0,0,0,0
48838,1,9,1,0,-0.146733,-0.218780,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
48839,3,13,0,1,-0.146733,4.080499,40,0,0,1,...,0,0,0,0,0,0,0,1,0,0
48840,0,5,1,0,-0.146733,-0.218780,16,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [6]:
# Split X and y
X = df.iloc[:, [0,1,2, 3, 4, 5, 6,8,9,10,11,12,13,14,15,16,17,18,19,20]]
y = df.iloc[:, [7]]

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y.values, test_size=0.20)

In [7]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
#Create a Classifier
model=RandomForestClassifier(n_estimators=100)

model.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [8]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [9]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model (does this for each hyperparameter set)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [None]:
rf_random.best_params_

In [None]:
# {'n_estimators': 200,
#  'min_samples_split': 10,
#  'min_samples_leaf': 2,
#  'max_features': 'sqrt',
#  'max_depth': 50,
#  'bootstrap': True}

# {'n_estimators': 583,
#  'min_samples_split': 5,
#  'min_samples_leaf': 4,
#  'max_features': 'auto',
#  'max_depth': 30,
#  'bootstrap': False}

# {'n_estimators': 1700,
#  'min_samples_split': 10,
#  'min_samples_leaf': 2,
#  'max_features': 'log2',
#  'max_depth': 20,
#  'bootstrap': True}
# .856898029134533 

# {'n_estimators': 1000,
#  'min_samples_split': 2,
#  'min_samples_leaf': 4,
#  'max_features': 'sqrt',
#  'max_depth': 30,
#  'bootstrap': False}
# 0.8589158857837853

# {'n_estimators': 1700,
#  'min_samples_split': 10,
#  'min_samples_leaf': 2,
#  'max_features': 'log2',
#  'max_depth': 20,
#  'bootstrap': True}
# 0.8592199463747685

# {'n_estimators': 1000,
#  'min_samples_split': 2,
#  'min_samples_leaf': 4,
#  'max_features': 'sqrt',
#  'max_depth': 30,
#  'bootstrap': False}
# 0.8590264532714156

# {'n_estimators': 600,
#  'min_samples_split': 5,
#  'min_samples_leaf': 4,
#  'max_features': 'auto',
#  'max_depth': 70,
#  'bootstrap': True}
# 0.8591370207590459

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
best_random = rf_random.best_estimator_
train_data_predictions = cross_val_predict(best_random, X_train, y_train.ravel(), cv=3)
# train_data_predictions

# Use train because it already sets a validation set. We will test our model after tuning parameters later on using test sets.

from sklearn.metrics import confusion_matrix
# Show confusion matrix for our classifier’s predictions
confusion_matrix(y_train, train_data_predictions)

In [None]:
print(accuracy_score(y_train, train_data_predictions))

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
train_data_predictions = cross_val_predict(base_model, X_train, y_train.ravel(), cv=3)
# train_data_predictions

# Use train because it already sets a validation set. We will test our model after tuning parameters later on using test sets.

from sklearn.metrics import confusion_matrix
# Show confusion matrix for our classifier’s predictions
confusion_matrix(y_train, train_data_predictions)

In [None]:
print(accuracy_score(y_train, train_data_predictions))

In [None]:
# # This only works for regression!! SO CANNOT USE SINCE WE ARE DOING CLASSIFICATION.
# def evaluate(model, test_features, test_labels):
#     predictions = model.predict(test_features)
#     errors = abs(predictions - test_labels)
#     mape = 100 * np.mean(errors / test_labels)
#     accuracy = 100 - mape
#     print('Model Performance')
#     print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
#     print('Accuracy = {:0.2f}%.'.format(accuracy))
    
#     return accuracy
# base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
# base_model.fit(X_train, y_train)
# base_accuracy = evaluate(base_model, X_train, y_train)

# best_random = rf_random.best_estimator_
# random_accuracy = evaluate(best_random, X_train, y_train)

# print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))


In [None]:
# Email professor, asking if he can give template/format for report and also an example from previous year.

In [8]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [14, 16, 18],
    'max_features': ['auto'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [12, 14, 16],
    'n_estimators': [800, 1000, 1200]
}

# Create a base model
rf = RandomForestClassifier(random_state = 42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, return_train_score=True)


In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [None]:
# param_grid = {
#     'bootstrap': [True, False],
#     'max_depth': [20, 30, 40, 50],
#     'max_features': ['auto'],
#     'min_samples_leaf': [2, 3, 4, 5],
#     'min_samples_split': [2, 5, 10],
#     'n_estimators': [600, 900, 1000, 1700]
# }

# Fitting 3 folds for each of 384 candidates, totalling 1152 fits

# {'bootstrap': True,
#  'max_depth': 20,
#  'max_features': 'auto',
#  'min_samples_leaf': 2,
#  'min_samples_split': 10,
#  'n_estimators': 1700}
# 0.858805318296155






In [None]:
# param_grid = {
#     'bootstrap': [True, False],
#     'max_depth': [20, 30],
#     'max_features': ['auto', 'sqrt'],
#     'min_samples_leaf': [2, 4],
#     'min_samples_split': [2, 10],
#     'n_estimators': [1000, 1700]
# }

# Fitting 3 folds for each of 64 candidates, totalling 192 fits

# {'bootstrap': True,
#  'max_depth': 20,
#  'max_features': 'auto',
#  'min_samples_leaf': 4,
#  'min_samples_split': 10,
#  'n_estimators': 1000}

In [None]:
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [18, 20, 22],
#     'max_features': ['auto'],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [1000]
# }

# Fitting 3 folds for each of 27 candidates, totalling 81 fits

# {'bootstrap': True,
#  'max_depth': 18,
#  'max_features': 'auto',
#  'min_samples_leaf': 3,
#  'min_samples_split': 12,
#  'n_estimators': 1000}

In [None]:
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [14, 16, 18],
#     'max_features': ['auto'],
#     'min_samples_leaf': [1, 2, 3],
#     'min_samples_split': [12, 14, 16],
#     'n_estimators': [800, 1000, 1200]
# }

# Fitting 3 folds for each of 81 candidates, totalling 243 fits

