# 1. Decision Tree Classifier

In [4]:
########################
### Import libraries ###
########################

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import time
import warnings

# Silence all warnings for the session and start the wall clock that is used
# later to report the total run time.
warnings.filterwarnings('ignore')
start_model = time.time()


#####################
### Load the data ###
#####################

# Column names, in file order, based on the "adult.names" file
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "income",
]

# Parsing options shared by both files: fields are split on a comma followed
# by whitespace, and "?" is read as a missing value.
read_options = {"names": columns, "sep": r',\s+', "na_values": ["?"]}

train_data = pd.read_csv('adult.data', **read_options)
# The first row of the test file is skipped.
test_data = pd.read_csv('adult.test', skiprows=1, **read_options)


###########################
### Preprocess the data ###
###########################

# Discard every row that contains at least one missing value
train_data = train_data.dropna()
test_data = test_data.dropna()

# Strip the period from the test-set income values before they are encoded
test_data['income'] = test_data['income'].str.replace('.', '', regex=False)

# One LabelEncoder per categorical column, keyed by column name
label_encoders = {}

# Fit each encoder on the training column, then reuse the same mapping on the
# matching test column so both sets share one integer coding.
for column in train_data.select_dtypes(include=['object']).columns:
    encoder = label_encoders[column] = LabelEncoder()
    train_data[column] = encoder.fit_transform(train_data[column])
    test_data[column] = encoder.transform(test_data[column])

# Separate the features from the "income" target for both sets
X_train, y_train = train_data.drop(columns="income"), train_data["income"]
X_test, y_test = test_data.drop(columns="income"), test_data["income"]

def decision_tree_classifier(X_train, y_train, X_test, y_test):
    """Tune, fit and evaluate a decision tree, printing a report to stdout.

    A grid search over ``max_depth`` (scored by accuracy) selects the tree
    depth. The winning tree is then evaluated on the test set, and the
    accuracy, classification report, confusion matrix and feature
    importances are printed.

    Args:
        X_train: Training features. Must expose ``.columns`` (used to label
            the feature importances), e.g. a pandas DataFrame.
        y_train: Training target values.
        X_test: Test features with the same columns as ``X_train``.
        y_test: Test target values.

    Returns:
        None. All results are printed.

    Note:
        The total run time is measured from the module-level ``start_model``
        timestamp, which must be set before this function is called.
    """
    from sklearn.metrics import confusion_matrix

    ################################
    ### Select a model and train ###
    ################################

    start_train = time.time()

    # Set up the DecisionTreeClassifier
    clf = DecisionTreeClassifier(random_state=42)

    # Candidate depths 1-4. max_depth has to be a positive integer (or None);
    # a grid starting at 0 only adds a candidate whose fits all fail and are
    # scored NaN, which goes unnoticed while warnings are silenced.
    param_grid = {
        'max_depth': list(range(1, 5))
    }

    # GridSearchCV
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', verbose=2, n_jobs=-1)

    # Fit the model with GridSearchCV
    grid_search.fit(X_train, y_train)
    end_train = time.time()
    print(f"It took {round(end_train - start_train,3)} seconds for the training")

    # Get the best parameters and best score from GridSearchCV
    best_max_depth = grid_search.best_params_['max_depth']
    best_score = grid_search.best_score_

    # With the default refit=True the search has already refitted the best
    # configuration on the full training set, so reuse that estimator rather
    # than training an identical tree a second time.
    best_clf = grid_search.best_estimator_

    # Print the results
    print(f"Best max_depth: {best_max_depth}")
    print(f"Best Cross-Validation Score: {round(best_score,3)}")

    ##################
    ### Prediction ###
    ##################

    # Predict
    predictions = best_clf.predict(X_test)

    #######################
    ### Measure metrics ###
    #######################

    # Evaluate and print
    accuracy = accuracy_score(y_test, predictions)
    clf_report = classification_report(y_test, predictions)
    # labels=[0,1] fixes the row/column order; this presumes the target is
    # encoded as 0/1 by the caller - TODO confirm for other datasets.
    # The result gets its own name so the imported function is not shadowed.
    conf_matrix = confusion_matrix(y_test, predictions, labels=[0,1])

    print(f"Decision Tree Classifier Accuracy: {round(accuracy,3)}\n")
    print(f"Classification Report\n{clf_report}\n")
    print(f"Confusion Matrix\n{conf_matrix}\n")

    ###########################
    ### Feature importances ###
    ###########################

    # Feature importances, most important first
    importances = best_clf.feature_importances_
    feature_importance_list = sorted(
        zip(X_train.columns, importances),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("Feature Importances:")
    for feature, importance in feature_importance_list:
        print(f"{feature}: {round(importance,3)}")

    end_model = time.time()
    # start_model is the module-level timestamp taken before the data load
    print(f"\nIt took {round(end_model - start_model,3)} seconds for the entire model.")

# Run the tune / fit / evaluate pipeline; the report is printed to stdout
decision_tree_classifier(X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
It took 0.342 seconds for the training
Best max_depth: 4
Best Cross-Validation Score: 0.842
Decision Tree Classifier Accuracy: 0.839

Classification Report
              precision    recall  f1-score   support

           0       0.86      0.95      0.90     11360
           1       0.75      0.51      0.61      3700

    accuracy                           0.84     15060
   macro avg       0.81      0.73      0.75     15060
weighted avg       0.83      0.84      0.83     15060


Confusion Matrix
[[10746   614]
 [ 1813  1887]]

Feature Importances:
relationship: 0.491
capital-gain: 0.248
education-num: 0.244
capital-loss: 0.014
age: 0.001
fnlwgt: 0.001
workclass: 0.0
education: 0.0
marital-status: 0.0
occupation: 0.0
race: 0.0
sex: 0.0
hours-per-week: 0.0
native-country: 0.0

It took 0.867 seconds for the entire model.


# 2. Random Forest Classifier

In [5]:
########################
### Import libraries ###
########################

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import time
import warnings

# Silence all warnings for the session and start the wall clock that is used
# later to report the total run time.
warnings.filterwarnings('ignore')
start_model = time.time()

#####################
### Load the data ###
#####################

# Column names, in file order, based on the "adult.names" file
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "income",
]

# Parsing options shared by both files: fields are split on a comma followed
# by whitespace, and "?" is read as a missing value.
read_options = {"names": columns, "sep": r',\s+', "na_values": ["?"]}

train_data = pd.read_csv('adult.data', **read_options)
# The first row of the test file is skipped.
test_data = pd.read_csv('adult.test', skiprows=1, **read_options)


###########################
### Preprocess the data ###
###########################

# Discard every row that contains at least one missing value
train_data = train_data.dropna()
test_data = test_data.dropna()

# Strip the period from the test-set income values before they are encoded
test_data['income'] = test_data['income'].str.replace('.', '', regex=False)

# One LabelEncoder per categorical column, keyed by column name
label_encoders = {}

# Fit each encoder on the training column, then reuse the same mapping on the
# matching test column so both sets share one integer coding.
for column in train_data.select_dtypes(include=['object']).columns:
    encoder = label_encoders[column] = LabelEncoder()
    train_data[column] = encoder.fit_transform(train_data[column])
    test_data[column] = encoder.transform(test_data[column])

# Separate the features from the "income" target for both sets
X_train, y_train = train_data.drop(columns="income"), train_data["income"]
X_test, y_test = test_data.drop(columns="income"), test_data["income"]


def random_forest_classifier(X_train, y_train, X_test, y_test):
    """Tune, fit and evaluate a random forest, printing a report to stdout.

    A grid search over ``n_estimators`` and ``max_depth`` (scored by
    accuracy) selects the forest configuration. The winning forest is then
    evaluated on the test set, and the accuracy, classification report,
    confusion matrix and feature importances are printed.

    Args:
        X_train: Training features. Must expose ``.columns`` (used to label
            the feature importances), e.g. a pandas DataFrame.
        y_train: Training target values.
        X_test: Test features with the same columns as ``X_train``.
        y_test: Test target values.

    Returns:
        None. All results are printed.

    Note:
        The total run time is measured from the module-level ``start_model``
        timestamp, which must be set before this function is called. Calling
        this function also installs a process-wide "ignore" warning filter.
    """
    # NOTE: this filter is global, not scoped to this function.
    warnings.filterwarnings('ignore')
    from sklearn.metrics import confusion_matrix

    ################################
    ### Select a model and train ###
    ################################

    start_train = time.time()
    # Set up the RandomForestClassifier
    clf = RandomForestClassifier(random_state=42)

    # Both parameters must be positive integers. Grids starting at 0 only add
    # candidates whose fits all fail and are scored NaN, which goes unnoticed
    # while warnings are silenced, so start at the first valid value instead.
    param_grid = {
        'n_estimators': list(range(50, 500, 50)),
        'max_depth': list(range(1, 5))
    }

    # Set up the GridSearchCV
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', verbose=2, n_jobs=-1)

    # Fit the model with GridSearchCV to find the best parameter combination
    grid_search.fit(X_train, y_train)

    # Get the best parameters and best score from GridSearchCV
    best_n_estimators = grid_search.best_params_['n_estimators']
    best_max_depth = grid_search.best_params_['max_depth']
    best_score = grid_search.best_score_

    # With the default refit=True the search has already refitted the best
    # configuration on the full training set, so reuse that estimator rather
    # than training an identical forest a second time.
    best_clf = grid_search.best_estimator_

    end_train = time.time()

    print(f"It took {round(end_train - start_train,3)} seconds for the training")

    ##################
    ### Prediction ###
    ##################

    predictions = best_clf.predict(X_test)

    #######################
    ### Measure metrics ###
    #######################

    # Evaluate and print
    accuracy = accuracy_score(y_test, predictions)
    clf_report = classification_report(y_test, predictions)
    # labels=[0,1] fixes the row/column order; this presumes the target is
    # encoded as 0/1 by the caller - TODO confirm for other datasets.
    # The result gets its own name so the imported function is not shadowed.
    conf_matrix = confusion_matrix(y_test, predictions, labels=[0,1])

    print(f"Best n_estimators: {best_n_estimators}")
    print(f"Best max_depth: {best_max_depth}")
    print(f"Best Cross-Validation Score: {round(best_score,3)}")
    print(f"Random Forest Classifier Accuracy: {round(accuracy,3)}\n")
    print(f"Classification Report\n{clf_report}\n")
    print(f"Confusion Matrix\n{conf_matrix}\n")

    ###########################
    ### Feature importances ###
    ###########################

    # Feature importances, most important first
    importances = best_clf.feature_importances_
    feature_importance_list = sorted(
        zip(X_train.columns, importances),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("Feature Importances:")
    for feature, importance in feature_importance_list:
        print(f"{feature}: {round(importance,3)}")

    end_model = time.time()
    # start_model is the module-level timestamp taken before the data load
    print(f"\nIt took {round(end_model - start_model,3)} seconds for the entire model.")


# Run the tune / fit / evaluate pipeline; the report is printed to stdout
random_forest_classifier(X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
It took 44.397 seconds for the training
Best n_estimators: 350
Best max_depth: 4
Best Cross-Validation Score: 0.842
Random Forest Classifier Accuracy: 0.84

Classification Report
              precision    recall  f1-score   support

           0       0.85      0.96      0.90     11360
           1       0.80      0.47      0.59      3700

    accuracy                           0.84     15060
   macro avg       0.82      0.72      0.75     15060
weighted avg       0.84      0.84      0.82     15060


Confusion Matrix
[[10916   444]
 [ 1959  1741]]

Feature Importances:
relationship: 0.262
capital-gain: 0.225
marital-status: 0.161
education-num: 0.15
age: 0.065
hours-per-week: 0.041
capital-loss: 0.033
sex: 0.025
education: 0.023
occupation: 0.011
workclass: 0.002
race: 0.001
fnlwgt: 0.0
native-country: 0.0

It took 45.012 seconds for the entire model.
