In [6]:
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import warnings
warnings.filterwarnings("ignore")


In [4]:

# Download the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
data = pd.read_csv(url, header=None)

# Assign column names to the dataset
data.columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

# Preprocess the dataset
le = LabelEncoder()
data_encoded = data.apply(le.fit_transform)

# Split the dataset into features and target
X = data_encoded.drop("income", axis=1)
y = data_encoded["income"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the individual models
tree_model = DecisionTreeClassifier(random_state=42)
forest_model = RandomForestClassifier(random_state=42)
logreg_model = LogisticRegression(random_state=42)

# Create the ensemble model
ensemble_model = VotingClassifier(
    estimators=[("tree", tree_model), ("forest", forest_model), ("logreg", logreg_model)],
    voting="hard"
)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    "tree__max_depth": [None, 10, 20],
    "forest__n_estimators": [50, 100, 200],
    "logreg__C": [1.0, 10.0, 100.0]
}

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(ensemble_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best ensemble model
best_ensemble_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)

# Make predictions
y_pred = best_ensemble_model.predict(X_test)

# Evaluate the ensemble model
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)


Best Hyperparameters: VotingClassifier(estimators=[('tree',
                              DecisionTreeClassifier(max_depth=10,
                                                     random_state=42)),
                             ('forest',
                              RandomForestClassifier(n_estimators=200,
                                                     random_state=42)),
                             ('logreg', LogisticRegression(random_state=42))])
Accuracy: 0.8598188238906802
Confusion Matrix:
 [[4716  226]
 [ 687  884]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.95      0.91      4942
           1       0.80      0.56      0.66      1571

    accuracy                           0.86      6513
   macro avg       0.83      0.76      0.79      6513
weighted avg       0.85      0.86      0.85      6513



In [5]:
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'forest__n_estimators': 200, 'logreg__C': 1.0, 'tree__max_depth': 10}


In [7]:
print(f1_score(y_test, y_pred))

0.6594554270794479
