# Random Forest - Practical Implementation
## Complete hands-on tutorial with real examples

### 1. Import Required Libraries

In [None]:
# Core data handling, modelling, evaluation, and plotting libraries used
# throughout the tutorial.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import seaborn as sns

# `load_boston` was removed from scikit-learn in version 1.2, so importing it
# unconditionally aborts this cell on current installs. Import it only when
# it is still available; the name stays defined (as None) either way so any
# code that checks for it keeps working.
try:
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

print("Libraries imported successfully!")

### 2. Random Forest Classification Example

In [None]:
# Fetch the Iris data and pull out the feature matrix and the class labels
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree forest and train it in one step (fit returns the estimator)
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out samples
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report overall accuracy followed by the per-class breakdown
print(f"Accuracy: {accuracy:.3f}")
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=iris.target_names),
    sep="\n",
)

### 3. Feature Importance Visualization

In [None]:
# Pull the per-feature importance scores out of the fitted classifier
importance = rf_classifier.feature_importances_
feature_names = iris.feature_names

# Pair each feature name with its score, then order from most to least important
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importance})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
ax = sns.barplot(x='importance', y='feature', data=importance_df)
ax.set_title('Random Forest - Feature Importance')
ax.set_xlabel('Importance Score')
plt.tight_layout()
plt.show()

# Also show the ranked table as text
print(importance_df)

### 4. Random Forest Regression Example

In [None]:
# Synthetic regression problem: 1000 samples, 4 features, a little noise
from sklearn.datasets import make_regression

X_reg, y_reg = make_regression(
    n_samples=1000,
    n_features=4,
    noise=0.1,
    random_state=42,
)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree regression forest and train it (fit returns the estimator)
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train_reg, y_train_reg)

# Predict the held-out targets
y_pred_reg = rf_regressor.predict(X_test_reg)

# Error and goodness-of-fit on the held-out set
mse = mean_squared_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)

print(f"Mean Squared Error: {mse:.3f}")
print(f"R² Score: {r2:.3f}")

# Scatter of predicted against actual values; the dashed red line marks
# where predicted == actual across the range of the test targets
diagonal = [y_test_reg.min(), y_test_reg.max()]

plt.figure(figsize=(8, 6))
plt.scatter(y_test_reg, y_pred_reg, alpha=0.7)
plt.plot(diagonal, diagonal, 'r--', linewidth=2)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Random Forest Regression: Predicted vs Actual')
plt.tight_layout()
plt.show()

### 5. Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter (3 x 4 x 3 = 36 combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
)

# Base estimator whose hyperparameters are being searched
rf = RandomForestClassifier(random_state=42)

# Exhaustive 5-fold cross-validated search on the training split, scored by
# accuracy; n_jobs=-1 is passed to run the search in parallel
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_, sep="\n")
print(f"\nBest cross-validation score: {grid_search.best_score_:.3f}")

# Check the winning configuration against the held-out test split
best_rf = grid_search.best_estimator_
best_pred = best_rf.predict(X_test)
best_accuracy = accuracy_score(y_test, best_pred)
print(f"Test accuracy with best parameters: {best_accuracy:.3f}")

### 6. Comparing Single Tree vs Random Forest

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: one decision tree trained on the classification split
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)

# Ensemble: a 100-tree random forest trained on the same split
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

# Side-by-side table of test accuracies, one row per model
comparison = pd.DataFrame(
    [
        ('Decision Tree', dt_accuracy),
        ('Random Forest', rf_accuracy),
    ],
    columns=['Model', 'Accuracy'],
)

print(comparison)

# Bar chart of the two accuracies on a fixed 0-1 axis
plt.figure(figsize=(8, 5))
ax = sns.barplot(x='Model', y='Accuracy', data=comparison)
ax.set_title('Decision Tree vs Random Forest Accuracy')
ax.set_ylim(0, 1)
plt.tight_layout()
plt.show()

### 7. Practice Exercise
**Try this yourself:**
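1. Load a different dataset (e.g., `load_wine` or `load_digits` from `sklearn.datasets`)
2. Train a Random Forest on it and evaluate it on a held-out test set
3. Analyze feature importance
4. Compare its accuracy with other algorithms (e.g., a single decision tree, as in Section 6)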

In [None]:
# Your code here
# Starter for the practice exercise: only the dataset is loaded below; the
# remaining steps (split, fit, evaluate, compare) are left to the reader.
from sklearn.datasets import load_wine

# Load wine dataset
# NOTE(review): presumably `wine` exposes .data, .target and .feature_names
# like the `iris` object used earlier in this tutorial — confirm before use.
wine = load_wine()
# Complete the exercise...