In [7]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [8]:
# Read the processed dataset from disk
data = pd.read_csv("../../data/processed/data_processed_tree.csv")

# "Status" is the prediction target; every other column is used as a feature
y = data["Status"]
X = data.drop(columns="Status")

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base random forest; the number of trees is what the grid search below tunes
clf = RandomForestClassifier(random_state=42)

# Candidate forest sizes to compare
param_grid = dict(n_estimators=[10, 50, 100, 200])

# 5-fold cross-validated search scored on accuracy, parallelised with n_jobs=-1
grid_search = GridSearchCV(
    clf,
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only, so the test split stays unseen
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
# Use the model selected by the grid search instead of refitting the untuned
# base classifier. GridSearchCV defaults to refit=True, so best_estimator_ is
# already trained on the full training split with the best parameters found.
clf = grid_search.best_estimator_

# Accuracy on the held-out test set
accuracy = clf.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision, recall and F1 on the held-out test set
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))