<a href="https://colab.research.google.com/github/N1khil-J4dhav/College/blob/main/Experiment11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor # The key algorithm for this experiment
from sklearn.metrics import mean_squared_error, r2_score

print("Libraries imported successfully, including RandomForestRegressor.")

# Step 2: Load the dataset (California Housing)
# With as_frame=True, `housing.frame` already holds the feature columns together
# with the 'MedHouseVal' target column, so the target is not assigned again here.
# A copy is taken so the synthetic edits below do not modify the frame stored on
# the returned `housing` object.
housing = fetch_california_housing(as_frame=True)
df = housing.frame.copy()

# --- Prepare data for Preprocessing Pipeline ---
# Manually add a categorical column and introduce NaNs for demonstration purposes,
# replicating the setup from the previous experiment. The category labels are drawn
# at random, so this column exists only to exercise the one-hot encoding step.
np.random.seed(42)  # fixed seed: same synthetic labels and NaN positions on every run
df['ocean_proximity'] = np.random.choice(
    ['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND'],
    size=len(df),
)
# Choose 200 distinct row labels and blank out their AveBedrms value so the
# numeric imputer has missing data to fill.
missing_indices = np.random.choice(df.index, size=200, replace=False)
df.loc[missing_indices, 'AveBedrms'] = np.nan

print("\nDataset loaded and prepared with dummy categorical features and NaNs.")

# Step 3: Define Features and Target
# 'MedHouseVal' is the value to predict; every remaining column is a model input.
target_column = 'MedHouseVal'
y = df[target_column]
X = df.drop(columns=[target_column])

# Step 4: Define Column Types
# Columns are grouped by name so numeric and categorical inputs can be
# preprocessed differently.
categorical_features = ['ocean_proximity']
numeric_features = [
    'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
    'Population', 'AveOccup', 'Latitude', 'Longitude',
]

# Step 5: Create Preprocessing Pipelines
# Numeric columns: replace missing values with the column mean, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encode. handle_unknown='ignore' means a category
# that was not seen during fitting does not raise an error at transform time.
categorical_steps = [('onehot', OneHotEncoder(handle_unknown='ignore'))]
categorical_transformer = Pipeline(steps=categorical_steps)

# Step 6: Combine all Preprocessing Steps using ColumnTransformer
# Each entry is (name, transformer, columns it applies to). Any column not named
# in an entry is passed through untouched (remainder='passthrough').
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')

# Step 7: Create the Full ML Pipeline (Preprocessing + Model)
# This is the "Bagging Ensemble" step: the Random Forest Regressor.
forest = RandomForestRegressor(
    n_estimators=100,     # number of decision trees (base estimators)
    max_features='sqrt',  # number of features to consider when looking for the best split
    random_state=42,      # fixed seed for reproducible results
    n_jobs=-1,            # use all available cores for faster training
)

# Chaining the preprocessor and the regressor lets one fit/predict call run both stages.
rf_model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', forest)])

print("\nFull Random Forest Pipeline created.")

# Step 8: Split the data into training and testing sets
# 20% of the rows are held out for testing; the fixed random_state keeps the
# split identical across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
train_count, test_count = len(X_train), len(X_test)
print(f"Data split into Training ({train_count} samples) and Test ({test_count} samples).")

# Step 9: Train the Random Forest Model
# Fitting the pipeline fits the preprocessing steps and then the regressor,
# using the training rows only.
print("\nStarting model training...")
rf_model.fit(X_train, y_train)
print("Model training complete.")

# Step 10: Make predictions on the test set
# The held-out rows go through the already-fitted preprocessing before prediction.
y_pred = rf_model.predict(X_test)

# Step 11: Evaluate the Model
# Compare the held-out targets with the predictions:
#   - MSE:  mean of the squared prediction errors
#   - RMSE: square root of the MSE, i.e. error on the same scale as the target
#   - R^2:  coefficient of determination (reported below as variance explained)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Only the lines that interpolate a value are f-strings; constant lines are
# plain string literals.
print('\n--- Model Evaluation (Random Forest) ---')
print('Bagging Ensemble Technique: Random Forest Regressor')
print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')
print(f'R-squared (Variance Explained): {r2:.4f}')
print('------------------------------------------')

Libraries imported successfully, including RandomForestRegressor.

Dataset loaded and prepared with dummy categorical features and NaNs.

Full Random Forest Pipeline created.
Data split into Training (16512 samples) and Test (4128 samples).

Starting model training...
Model training complete.

--- Model Evaluation (Random Forest) ---
Bagging Ensemble Technique: Random Forest Regressor
Mean Squared Error (MSE): 0.2570
Root Mean Squared Error (RMSE): 0.5069
R-squared (Variance Explained): 0.8039
------------------------------------------
