In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import itertools
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from auxiliar_func import *
from plot_func import *

In [5]:
# Name of the label column; a later cell uses it to separate features from the label.
target = 'income_50k'

# Read the full dataset, then hold out 30% of the rows as the test split.
# The seed is fixed so the partition is the same on every run.
df = pd.read_csv('Census-Income-KDD.csv')
df_tr, df_te = train_test_split(
    df,
    random_state=42,
    test_size=0.3,
)

## Finding the best preprocessing parameters

In [6]:
# Candidate values for each preprocessing option, handed to
# test_preprocess_params (presumably every combination is scored -- confirm
# against auxiliar_func).
params = dict(
    scaling=[None],
    imputation=['mode', 'nancat'],
    cat_age=[False, True],
    target_freq=[0.8, 0.9, 1.0],
)

# Fixed random-forest configuration passed to the search.
# NOTE(review): these hyperparameters equal the best ones printed by the
# grid-search cell further down, so this cell was evidently run after it.
rf = RandomForestClassifier(
    n_estimators=65,
    max_depth=25,
    class_weight={0: 1, 1: 2.5},
    random_state=42,
    n_jobs=-1,
)
results = test_preprocess_params(df_tr, rf, params, verbose=0)

# Persist the returned table so it can be inspected without re-running the search.
results.to_csv('results_rf.csv', index=False)

  results = pd.concat([results, pd.DataFrame([list(combination) + list(cross_val_results.values())],
  results = pd.concat([results, pd.DataFrame([list(combination) + list(cross_val_results.values())],
  results = pd.concat([results, pd.DataFrame([list(combination) + list(cross_val_results.values())],
  results = pd.concat([results, pd.DataFrame([list(combination) + list(cross_val_results.values())],
  results = pd.concat([results, pd.DataFrame([list(combination) + list(cross_val_results.values())],
  results = pd.concat([results, pd.DataFrame([list(combination) + list(cross_val_results.values())],
  results = pd.concat([results, pd.DataFrame([list(combination) + list(cross_val_results.values())],
  results = pd.concat([results, pd.DataFrame([list(combination) + list(cross_val_results.values())],
  results = pd.concat([results, pd.DataFrame([list(combination) + list(cross_val_results.values())],
  results = pd.concat([results, pd.DataFrame([list(combination) + list(cross_val_results.va

In [3]:
# Preprocessing options applied identically to both splits.
shared_opts = dict(scaling=None, imputation='mode', cat_age=False)

# The training split additionally receives target_freq=0.8; the test split is
# instead processed with remove_duplicates=False.
df_tr_pre = preprocessing(df_tr, **shared_opts, target_freq=0.8)
df_te_pre = preprocessing(df_te, **shared_opts, remove_duplicates=False)

# Left join on the column axis: both frames end up with exactly the training
# columns, and any column absent from the test frame is created filled with 0.
df_tr_pre, df_te_pre = df_tr_pre.align(
    df_te_pre,
    join='left',
    axis=1,
    fill_value=0,
)

# Cell output: (train shape, test shape).
df_tr_pre.shape, df_te_pre.shape

((43975, 474), (59857, 474))

## Tuning a random forest model

In [7]:
# Separate the label column from the features for each preprocessed split.
y_train = df_tr_pre[target]
X_train = df_tr_pre.drop(columns=target)
y_test = df_te_pre[target]
X_test = df_te_pre.drop(columns=target)

# Hyperparameter grid for the random forest.
# Note: this rebinds `params` and `rf` from the preprocessing-search cell.
params = {
    'n_estimators': list(range(35, 71, 5)),   # 35, 40, ..., 70
    'max_depth': list(range(10, 36, 5)),      # 10, 15, ..., 35
    # Class 0 keeps weight 1; only the weight of class 1 is varied.
    'class_weight': [{0: 1, 1: w} for w in (2, 2.5, 3, 3.5, 4)],
}

# 5-fold cross-validated search scored with F1.
# NOTE(review): both the forest and the search request n_jobs=-1; confirm this
# nesting does not oversubscribe the CPUs on the target machine.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
grid = GridSearchCV(estimator=rf, param_grid=params, cv=5, scoring='f1', n_jobs=-1)
grid.fit(X_train, y_train)

# Best parameter combination followed by its mean cross-validated score.
print(grid.best_params_, grid.best_score_, sep='\n')

{'class_weight': {0: 1, 1: 2.5}, 'max_depth': 25, 'n_estimators': 65}
0.7101832908978478


In [8]:
# Predict on the held-out test split with the fitted grid search and print
# the per-class precision / recall / F1 report.
y_pred = grid.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.94      0.96     56275
           1       0.45      0.72      0.55      3582

    accuracy                           0.93     59857
   macro avg       0.72      0.83      0.76     59857
weighted avg       0.95      0.93      0.94     59857

