In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris  # Just for demo purposes; use your own dataset

# Load your dataset (use your own data here)
data = load_iris()
X = data.data  # Features
y = data.target  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Define smaller hyperparameter space for RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 100],  # Reduced range
    'max_depth': [None, 10, 20],  # Reduced depth values
    'min_samples_split': [2, 5],  # Fewer split options
    'min_samples_leaf': [1, 2],  # Fewer leaf options
    'bootstrap': [True]  # Only use 'True' to avoid testing 'False' case
}

# Perform RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=10, cv=3, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Get the best model from the random search
best_rf = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_rf.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Random Forest Accuracy: {accuracy * 100:.2f}%")


Optimized Random Forest Accuracy: 100.00%


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris  # Just for demo purposes; use your own dataset

# Load dataset
data = load_iris()
X = data.data
y = data.target

# Split data into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Define expanded hyperparameter space
param_dist = {
    'n_estimators': [50, 100, 200],  # More options for trees
    'max_depth': [None, 10, 20, 30],  # Include deeper trees for flexibility
    'min_samples_split': [5, 10],  # Avoid overfitting
    'min_samples_leaf': [2, 5],  # Avoid very small leaves
    'bootstrap': [True]  # Keep bootstrap enabled
}

# Perform RandomizedSearchCV with cv=5 for better generalization
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, 
                                   n_iter=10, cv=5, scoring='accuracy', 
                                   random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV on training data
random_search.fit(X_train, y_train)

# Get the best model from RandomizedSearchCV
best_rf = random_search.best_estimator_

# Make predictions
y_train_pred = best_rf.predict(X_train)
y_test_pred = best_rf.predict(X_test)

# Evaluate performance
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Check if there's overfitting
if train_accuracy > test_accuracy + 5:  # 5% gap indicates possible overfitting
    print("⚠️ Warning: Model may be overfitting!")


Training Accuracy: 97.14%
Test Accuracy: 91.11%


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris  # Use your own dataset

# Load dataset
data = load_iris()
X = data.data
y = data.target

# Split data into training (60%) and testing (40%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Define refined hyperparameter space
param_dist = {
    'n_estimators': [100, 200, 300],  # More trees for better learning
    'max_depth': [10, 15, 20],  # Limit depth to prevent overfitting
    'min_samples_split': [10, 20],  # Force trees to generalize
    'min_samples_leaf': [3, 5, 10],  # Prevent very small leaves
    'bootstrap': [True]  # Use bootstrap sampling
}

# Perform RandomizedSearchCV with cv=5
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, 
                                   n_iter=10, cv=5, scoring='accuracy', 
                                   random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV on training data
random_search.fit(X_train, y_train)

# Get the best model
best_rf = random_search.best_estimator_

# Make predictions
y_train_pred = best_rf.predict(X_train)
y_test_pred = best_rf.predict(X_test)

# Evaluate performance
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Check if overfitting persists
if train_accuracy > test_accuracy + 5:  # 5% gap is a warning sign
    print("⚠️ Warning: Model may still be overfitting!")


Training Accuracy: 96.67%
Test Accuracy: 95.00%


In [6]:
import joblib

# Save the trained model
joblib.dump(best_rf, "random_forest_model.pkl")
print("Model saved as random_forest_model.pkl")



Model saved as random_forest_model.pkl
