In [2]:
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt


In [3]:
names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
df = pd.read_csv("adult.data", names=names, skipinitialspace=True)
df_test = pd.read_csv("adult.test", names=names, skipinitialspace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
#count and replace the ? entries
for column in df.columns:
    print(column + ": {}".format((df[column] == '?').sum()))
df[df == '?'] = np.nan
df_test[df_test == '?'] = np.nan

age: 0
workclass: 1836
fnlwgt: 0
education: 0
education-num: 0
marital-status: 0
occupation: 1843
relationship: 0
race: 0
sex: 0
capital-gain: 0
capital-loss: 0
hours-per-week: 0
native-country: 583
income: 0


  result = method(y)


In [6]:
#drop rows with missing values
print("old len: {}".format(df.shape[0]))
df = df.dropna()
print("new len: {}".format(df.shape[0]))

df_test = df_test.dropna()

old len: 32561
new len: 30162


In [7]:
#make the income numerical
# print((df["income"] == ' <=50K').sum())
df["income"] = df["income"].map({"<=50K":0, ">50K":1})
df_test["income"] = df_test["income"].map({"<=50K.":0, ">50K.":1})
#drop fnlweight as it is only interesting for the census authorities
df = df.drop(labels="fnlwgt", axis=1)
df_test = df_test.drop(labels="fnlwgt", axis=1)


In [8]:
for column in df.columns:
    print(df[column].value_counts())

36    852
31    851
33    837
34    836
35    828
     ... 
82      7
83      5
88      3
85      3
86      1
Name: age, Length: 72, dtype: int64
Private             22286
Self-emp-not-inc     2499
Local-gov            2067
State-gov            1279
Self-emp-inc         1074
Federal-gov           943
Without-pay            14
Name: workclass, dtype: int64
HS-grad         9840
Some-college    6678
Bachelors       5044
Masters         1627
Assoc-voc       1307
11th            1048
Assoc-acdm      1008
10th             820
7th-8th          557
Prof-school      542
9th              455
12th             377
Doctorate        375
5th-6th          288
1st-4th          151
Preschool         45
Name: education, dtype: int64
9     9840
10    6678
13    5044
14    1627
11    1307
7     1048
12    1008
6      820
4      557
15     542
5      455
8      377
16     375
3      288
2      151
1       45
Name: education-num, dtype: int64
Married-civ-spouse       14065
Never-married             9726
Divo

The cells education and education-num are equal, which means we can drop the non numerical education column.

In [9]:
df = df.drop(labels="education", axis=1)
df_test = df_test.drop(labels="education", axis=1)

In [10]:
# #visualization of each column in frequency
# for column in df.columns:
#     values = df[column].value_counts().sort_index()
#     categories = df[column].unique()
#     ax = values.plot.bar(title=column, figsize=(12,6))
#     ax.set_ylabel("Frequency", fontsize=16)
#     plt.show()

In [11]:
#drop fnlweight as it is only interesting for the census authorities
#df = df.drop(labels="native-country", axis=1)

In [12]:
#split the data for the pca and further processing
# from sklearn.model_selection import train_test_split

train_x = df.drop(labels="income", axis=1)
train_y = df["income"]
test_x = df_test.drop(labels="income", axis=1)
test_y = df_test["income"]

# train_x, test_x, train_y, test_y = train_test_split(df_x, df_y, test_size=2/10, random_state=0)
# print(train_x.shape)
# print(test_x.shape)

In [13]:
#convert the categorical data to numerical
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
features_to_encode = ["workclass", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
for feature in features_to_encode:
    train_x[feature] = le.fit_transform(train_x[feature])
    test_x[feature] = le.fit_transform(test_x[feature])

train_x.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,5,13,4,0,1,4,1,2174,0,40,38
1,50,4,13,2,3,0,4,1,0,0,13,38
2,38,2,9,0,5,1,4,1,0,0,40,38
3,53,2,7,2,5,0,2,1,0,0,40,38
4,28,2,13,2,9,5,2,0,0,0,40,4


In [14]:
#perform pca to see if we can drop even more columns which might be correlated. 
#martial status and relationship seem pretty similar to begin with
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

#use labelencoder to encode the string labels to int


#scales the data to the standard normal, as is required for pca of sklearn
scaler = StandardScaler()
scaler.fit(train_x)
train_x_norm = scaler.transform(train_x)
test_x_norm = scaler.transform(test_x)

pca = PCA(0.75)
pca.fit(train_x_norm)

train_x_pca = pca.transform(train_x_norm)
test_x_pca = pca.transform(test_x_norm)
print(train_x_pca.shape)


(30162, 8)


In [15]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=0, n_estimators = 100, n_jobs = -1, )

model.fit(train_x, train_y)
model.score(test_x, test_y)

0.8474103585657371

In [57]:
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [50] #This should always be best at higher numbers. Using low number so we can test many of the other hyperparams

# criterion = ['gini', 'entropy']
# max_depth = [None, 10, 25, 50]
# min_samples_split = [2, 5, 10, 25]
# min_samples_leaf = [1, 2, 5, 10, 25]
# min_weight_fraction_leaf = [0, 0.1, 0.25, 0.5, 0.75, 0.9]
# max_features = ['auto', 'sqrt', 'log2', None]
# max_leaf_nodes = [None, 1, 2, 5, 10, 25]
# min_impurity_decrease = [0, 0.1, 0.25, 0.5, 0.75, 0.9]
# # min_impurity_split = [] Deprecated 
# bootstrap = [True, False]
# oob_score = [True, False]
# class_weight = [None, 'balanced', 'balanced_subsample']
# ccp_alpha = [0, 0.1, 0.25, 0.5, 0.75, 0.9]
# max_samples = [None, 0.1, 0.25, 0.5, 0.75, 0.9]

# After a few runs we narrowed down the hyper params
criterion = ['gini', 'entropy']
max_depth = [None, 1, 2, 5, 10, 25]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 5, 10]
min_weight_fraction_leaf = [0, 0.1, 0.05]
max_features = ['auto']
max_leaf_nodes = [None, 1, 2, 5, 10, 25]
min_impurity_decrease = [0, 0.05, 0.1]
# min_impurity_split = [] Deprecated 
bootstrap = [True, False]
oob_score = [False]
class_weight = [None, 'balanced', 'balanced_subsample']
ccp_alpha = [0, 0.1, 0.05]
max_samples = [None, 0.1, 0.25, 0.5, 0.75, 0.9]

hyperparams = {
    'n_estimators': n_estimators,
    'criterion': criterion,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'min_weight_fraction_leaf': min_weight_fraction_leaf,
    'max_features': max_features,
    'max_leaf_nodes': max_leaf_nodes,
    'min_impurity_decrease': min_impurity_decrease,
    'bootstrap': bootstrap,
    'oob_score': oob_score,
    'class_weight': class_weight,
    'ccp_alpha': ccp_alpha,
    'max_samples': max_samples
}

base_model = RandomForestClassifier()
optimal_model = RandomizedSearchCV(estimator = base_model, param_distributions = hyperparams, n_iter = 1000, cv = 2, verbose=3, random_state=0, n_jobs = -1)
optimal_model.fit(train_x, train_y)
print(optimal_model.best_params_)

Fitting 2 folds for each of 1000 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   43.9s
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  1.9min finished


{'oob_score': False, 'n_estimators': 50, 'min_weight_fraction_leaf': 0, 'min_samples_split': 10, 'min_samples_leaf': 1, 'min_impurity_decrease': 0, 'max_samples': 0.5, 'max_leaf_nodes': None, 'max_features': 'auto', 'max_depth': None, 'criterion': 'gini', 'class_weight': None, 'ccp_alpha': 0, 'bootstrap': False}


In [58]:
# print(optimal_model.best_estimator_.score(test_x, test_y))
test = RandomForestClassifier(**optimal_model.best_params_)
test.set_params(n_estimators=500)
test.fit(train_x, train_y)
# test.get_params()
test.score(test_x, test_y)

0.8528552456839309

In [67]:
opt_model = RandomForestClassifier(n_estimators = 250, min_samples_split = 2, min_samples_leaf = 4, max_features = 'auto', max_depth = 20, bootstrap = False, n_jobs=-1)
# opt_model.get_params()
opt_model.fit(train_x, train_y)
opt_model.score(test_x, test_y)

0.8606905710491368

In [44]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=80, random_state=0)
ada.fit(train_x, train_y)
ada.score(test_x, test_y)

0.8689243027888446

In [60]:
ada2 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2, max_leaf_nodes=10), n_estimators=300, random_state=0, learning_rate=1)
ada2.fit(train_x, train_y)
ada2.score(test_x, test_y)

0.8639442231075697

In [79]:
ada3 = AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators = 250, min_samples_split = 2, min_samples_leaf = 4, max_features = 'auto', max_depth = 20, bootstrap = False, n_jobs=-1), n_estimators=4, random_state=0, learning_rate=1)
ada3.fit(train_x, train_y)
ada3.score(test_x, test_y)


0.8651394422310758

In [80]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(train_x, train_y)
svm.score(test_x, test_y)

0.7964143426294821

In [108]:
from sklearn.ensemble import GradientBoostingClassifier
GBC = GradientBoostingClassifier(loss="exponential", learning_rate=.2, subsample=1, n_estimators=200)
GBC.fit(train_x, train_y)
GBC.score(test_x, test_y)

0.8704515272244356

In [134]:
from sklearn.ensemble import BaggingClassifier
bag = BaggingClassifier(n_estimators=100, n_jobs=-1, max_samples=0.20, max_features=0.65, oob_score=True)
bag.fit(train_x, train_y)
bag.score(test_x, test_y)

0.8626826029216468