In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import itertools
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from auxiliar_func import *
from plot_func import *

In [2]:
# Column used as the prediction target further down the notebook.
target = 'income_50k'

# Read the raw census table from the working directory.
df = pd.read_csv('Census-Income-KDD.csv')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
df_tr, df_te = train_test_split(
    df,
    random_state=42,
    test_size=0.3,
)

## Finding the best preprocessing parameters

In [None]:
# Preprocessing grid: one candidate value per option.
params = dict(
    scaling=[None],
    imputation=['mode'],
    cat_age=[False],
    target_freq=[0.8],
)

# The preprocessing search below is left disabled; uncomment to re-run it
# and write its results to 'results_rf.csv'.
# rf = RandomForestClassifier(random_state=42, n_estimators=22, max_depth=30, n_jobs=-1, class_weight={0: 1, 1: 1.5})
# results = test_preprocess_params(df_tr, rf, params)

# results.to_csv('results_rf.csv', index=False)

In [3]:
# Preprocess the training split with the chosen options.
df_tr_pre = preprocessing(
    df_tr,
    scaling=None,
    imputation='mode',
    cat_age=False,
    target_freq=0.8,
)

# Preprocess the test split with the same scaling/imputation/cat_age options.
# target_freq is left at its default here and remove_duplicates is switched off
# (presumably so no test rows are resampled or dropped -- confirm in auxiliar_func).
df_te_pre = preprocessing(
    df_te,
    scaling=None,
    imputation='mode',
    cat_age=False,
    remove_duplicates=False,
)

# Give both frames the training frame's columns (left join on the column axis);
# columns that the test frame lacks are created and filled with 0.
df_tr_pre, df_te_pre = df_tr_pre.align(
    df_te_pre,
    join='left',
    axis=1,
    fill_value=0,
)
(df_tr_pre.shape, df_te_pre.shape)

((43975, 474), (59857, 474))

## Tuning a random forest model

In [4]:
# Separate the feature columns from the target column for each split.
X_train = df_tr_pre.drop(columns=target)
y_train = df_tr_pre[target]
X_test = df_te_pre.drop(columns=target)
y_test = df_te_pre[target]

# Hyperparameter grid for the forest: 5 x 5 x 5 = 125 combinations.
# NOTE(review): the recorded best values appear to sit on the edges of this grid
# (largest n_estimators and class weight, smallest max_depth) -- consider widening it.
params = {
    'n_estimators': [22, 25, 30, 35, 40],
    'max_depth': list(range(30, 51, 5)),
    'class_weight': [{0: 1, 1: weight} for weight in (1, 1.5, 2, 2.5, 3)],
}

# NOTE(review): both the forest and the grid search request n_jobs=-1; confirm
# that this nesting does not oversubscribe the available CPU cores.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Exhaustive search over the grid, scored by F1 with 5-fold cross-validation.
grid = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

grid.best_params_

{'class_weight': {0: 1, 1: 3}, 'max_depth': 30, 'n_estimators': 40}