In [1]:
# import libraries
from numpy import mean
from numpy import std
from numpy import hstack
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split

# reading csv files
df =  pd.read_csv('adult.data', sep=",", header=None, skipinitialspace=True)
df2 = pd.read_csv('adult.test', sep=",", header=None, skipinitialspace=True)

# Join the data and test files together
df = pd.concat([df, df2])

# Shuffle the rows
df = df.sample(frac=1).reset_index(drop=True)

# Replace all of ? with None
df = df.replace(['?'], [None])
# Drop all rows with None in them
df = df.dropna(axis=0)

# Check no None values remain
df.isnull().sum()

#Adding column headers to our data 
df.columns = ["Age", "Workclass", "Fnlwgt", "Education", "Education-num", "Marital-status", "Occupation", "Relationship", "Race", "Sex", "Capital-gain", "Capital-loss", "Hours-per-week", "Native-country", "Income"]
# Workclass, Fnlwgt, Race and Native-country are not worth using.
# Education = Education num, so drop Education
df = df.drop(columns=['Workclass', 'Race', 'Fnlwgt', 'Native-country', 'Education'])


In [2]:
#Numerically encoding occupation: Occupaiton is grouped into white collar and blue collar 
occupation_mapping_dict = {
    "Tech-support" : 0,
    "Craft-repair" : 1, 
    "Other-service" : 1, #Wasn't sure about blue or white collar for other services 
    "Sales" : 0, 
    "Exec-managerial" : 0, 
    "Prof-specialty" : 0, 
    "Handlers-cleaners" : 1, 
    "Machine-op-inspct" : 1, 
    "Adm-clerical" : 0, 
    "Farming-fishing" : 1, 
    "Transport-moving" : 1, 
    "Priv-house-serv" : 1, 
    "Protective-serv" : 1, 
    "Armed-Forces" : 1
    }

df["Occupation"] = df["Occupation"].map(occupation_mapping_dict)


#Numerically encoding the sex variable 
sex_mapping_dict = {
    "Male" : 0,
    "Female" : 1
    }

df["Sex"] = df["Sex"].map(sex_mapping_dict)


#Encoding income variable
income_mapping_dict = {
    "<=50K" : 0,
    ">50K" : 1, 
    "<=50K." : 0, 
    ">50K." : 1
    }

df["Income"] = df["Income"].map(income_mapping_dict)


# FOR MODELS
# Group ages into discrete bins for models
bins = [10,20,30,40,50,60,70,80,90]
names = ['0', '1', '2', '3', '4', '5', '6', '7']
df['Age'] = pd.cut(df['Age'], bins, labels = names)



df

Unnamed: 0,Age,Education-num,Marital-status,Occupation,Relationship,Sex,Capital-gain,Capital-loss,Hours-per-week,Income
0,3,9,Divorced,1,Not-in-family,0,0,0,50,0
1,1,13,Married-civ-spouse,0,Husband,0,0,0,40,0
2,1,9,Never-married,1,Not-in-family,0,0,0,35,0
3,3,10,Divorced,1,Not-in-family,0,0,0,45,0
4,2,11,Never-married,0,Unmarried,1,0,0,55,0
...,...,...,...,...,...,...,...,...,...,...
48837,1,6,Divorced,1,Not-in-family,0,0,0,40,0
48838,1,10,Never-married,0,Own-child,1,0,0,40,0
48839,3,10,Divorced,1,Unmarried,1,0,0,40,0
48840,0,5,Never-married,1,Own-child,0,0,0,28,0


In [3]:
#Will now employ one-hot encoding for :  Marital Status and Relationship ; no order in their values
df = pd.get_dummies(df, columns = ['Relationship', 'Marital-status'])

In [4]:
df['Capital-gain'].value_counts()
# 229 values have 99,999 which is ALOT. Is this cap value? or error?

0        41432
15024      498
7688       391
7298       351
99999      229
         ...  
7262         1
1731         1
22040        1
1639         1
2387         1
Name: Capital-gain, Length: 121, dtype: int64

In [5]:
# Will now do feature scaling on Capital-gaine and Capital-loss
col_names = ['Capital-gain', 'Capital-loss']
features = df[col_names]

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df[col_names] = scaler.fit_transform(features.values)
df


Unnamed: 0,Age,Education-num,Occupation,Sex,Capital-gain,Capital-loss,Hours-per-week,Income,Relationship_Husband,Relationship_Not-in-family,...,Relationship_Own-child,Relationship_Unmarried,Relationship_Wife,Marital-status_Divorced,Marital-status_Married-AF-spouse,Marital-status_Married-civ-spouse,Marital-status_Married-spouse-absent,Marital-status_Never-married,Marital-status_Separated,Marital-status_Widowed
0,3,9,1,0,-0.146733,-0.21878,50,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1,1,13,0,0,-0.146733,-0.21878,40,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,1,9,1,0,-0.146733,-0.21878,35,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,3,10,1,0,-0.146733,-0.21878,45,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,2,11,0,1,-0.146733,-0.21878,55,0,0,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,1,6,1,0,-0.146733,-0.21878,40,0,0,1,...,0,0,0,1,0,0,0,0,0,0
48838,1,10,0,1,-0.146733,-0.21878,40,0,0,0,...,1,0,0,0,0,0,0,1,0,0
48839,3,10,1,1,-0.146733,-0.21878,40,0,0,0,...,0,1,0,1,0,0,0,0,0,0
48840,0,5,1,0,-0.146733,-0.21878,28,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [6]:
# Split X and y
X = df.iloc[:, [0,1,2, 3, 4, 5, 6,8,9,10,11,12,13,14,15,16,17,18,19,20]]
y = df.iloc[:, [7]]

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y.values, test_size=0.20)

In [7]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
#Create a Classifier
# model=RandomForestClassifier(n_estimators=70, min_samples_split=10, min_samples_leaf=2, max_features='log2', max_depth=20, bootstrap=True)
# model=RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=4, max_features='sqrt', max_depth=30, bootstrap=False)
# model=RandomForestClassifier(n_estimators=1700, min_samples_split=10, min_samples_leaf=2, max_features='auto', max_depth=20, bootstrap=True)
# model=RandomForestClassifier(n_estimators=1000, min_samples_split=10, min_samples_leaf=4, max_features='auto', max_depth=20, bootstrap=True)
# model=RandomForestClassifier(n_estimators=1000, min_samples_split=12, min_samples_leaf=3, max_features='auto', max_depth=18, bootstrap=True)
model=RandomForestClassifier(n_estimators=800, min_samples_split=12, min_samples_leaf=1, max_features='auto', max_depth=16, bootstrap=True)


# model=RandomForestClassifier(n_estimators=100)

model.get_params()

# {'n_estimators': 1700,
#  'min_samples_split': 10,
#  'min_samples_leaf': 2,
#  'max_features': 'log2',
#  'max_depth': 20,
#  'bootstrap': True}
# 0.8586947508085248
# 0.8586394670647096


# No hyperparameters:
# 0.8521436271664317
# 0.8525858971169528
# 0.8488542444094314
# 0.8519224921911712


# {'n_estimators': 1000,
#  'min_samples_split': 2,
#  'min_samples_leaf': 4,
#  'max_features': 'sqrt',
#  'max_depth': 30,
#  'bootstrap': False}
# 0.8588882439118777


# {'bootstrap': True,
#  'max_depth': 20,
#  'max_features': 'auto',
#  'min_samples_leaf': 2,
#  'min_samples_split': 10,
#  'n_estimators': 1700}
# 0.858805318296155


# {'bootstrap': True,
#  'max_depth': 20,
#  'max_features': 'auto',
#  'min_samples_leaf': 4,
#  'min_samples_split': 10,
#  'n_estimators': 1000}
# 0.8599662769162728


# {'bootstrap': True,
#  'max_depth': 18,
#  'max_features': 'auto',
#  'min_samples_leaf': 3,
#  'min_samples_split': 12,
#  'n_estimators': 1000}
# 0.8579484202670205


# {'bootstrap': True,
#  'max_depth': 16,
#  'max_features': 'auto',
#  'min_samples_leaf': 1,
#  'min_samples_split': 12,
#  'n_estimators': 800}
# 0.8580037040108356


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 16,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 12,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 800,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [8]:
model.fit(X_train, y_train.ravel())

  model.fit(X_train, y_train)


RandomForestClassifier(max_depth=16, min_samples_split=12, n_estimators=800)

In [9]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
train_data_predictions = cross_val_predict(model, X_train, y_train.ravel(), cv=20)

In [10]:
print(accuracy_score(y_train, train_data_predictions))

0.8580037040108356


In [13]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [None]:
# Therefore, best hyperparameters are:
# {'bootstrap': True,
#  'max_depth': 20,
#  'max_features': 'auto',
#  'min_samples_leaf': 4,
#  'min_samples_split': 10,
#  'n_estimators': 1000}
# These are rhe final hyperparameters we will use in our final evaluation for random forest.