<a href="https://colab.research.google.com/github/N1khil-J4dhav/College/blob/main/Experiment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing # Built-in dataset
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer # For handling missing values
from sklearn.preprocessing import StandardScaler, OneHotEncoder # For scaling and encoding
from sklearn.compose import ColumnTransformer # For applying different transforms to different columns
from sklearn.pipeline import Pipeline # To chain the steps

print("Libraries imported successfully.")

# Step 2: Load the dataset (California Housing)
# as_frame=True makes the loader expose the data as a pandas DataFrame
# through the `.frame` attribute.
housing = fetch_california_housing(as_frame=True)
df = housing.frame

# Target variable: 'MedHouseVal' (Median House Value).
# NOTE(review): `housing.frame` presumably already contains this column, in
# which case the assignment below is a no-op -- confirm before removing it.
df['MedHouseVal'] = housing.target

# --- Synthetic categorical column --------------------------------------------
# The frame loaded above has no 'ocean_proximity' feature, so one is fabricated
# from randomly drawn labels. The values are dummy data: they exist only so the
# OneHotEncoder step of the pipeline has a categorical column to work on.
# The seed makes the draw reproducible.
np.random.seed(42)
df['ocean_proximity'] = np.random.choice(
    ['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND'],
    size=df.shape[0],
)

# --- Synthetic missing values -------------------------------------------------
# 200 distinct rows (replace=False) get NaN in 'AveBedrms' so that the imputer
# has something to fill in. Re-seeding keeps this selection reproducible on
# its own, independent of the draw made for the categorical column.
np.random.seed(42)
missing_indices = np.random.choice(df.index, size=200, replace=False)
df.loc[missing_indices, 'AveBedrms'] = np.nan

# Quick look at the frame, plus a count of rows holding at least one NaN.
print("\n--- Dataset Head (with artificial NaNs in AveBedrms) ---")
print(df.head())
print("---------------------------------------------------------")
print(f"Total rows with missing values before processing: {df.isnull().any(axis=1).sum()}")


# Step 3: Define Features and Target
# The target is the median house value; every other column is a feature.
y = df['MedHouseVal']
X = df.drop(columns=['MedHouseVal'])

# Step 4: Define Column Types
# Columns are named explicitly (rather than selected by position) so the
# grouping stays correct even if the column order of the frame changes.
numeric_features = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
    'Longitude',
]
categorical_features = ['ocean_proximity']

print(f"\nNumeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")


# Step 5: Create Preprocessing Pipelines for Numeric and Categorical Data

# 5.1 Pipeline for Numeric features (Imputation and Scaling)
#   'imputer': replaces missing values with the column mean learned at fit time.
#   'scaler' : standardizes each column (mean=0, variance=1).
# Order matters: the imputer runs first so the scaler receives NaN-free input.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# 5.2 Pipeline for Categorical features (One-Hot Encoding)
# Each category becomes its own binary (0 or 1) column.
# handle_unknown='ignore': a category that was not seen during fit is encoded
# as all zeros instead of raising an error at transform time.
# The step name 'onehot' is looked up again in Step 8 to recover the generated
# column names, so keep it in sync if it is ever renamed.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Step 6: Combine all Preprocessing Steps using ColumnTransformer
# Routes each group of columns to its own transformer and concatenates the
# results in the order listed here (numeric block first, then one-hot block).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',  # Keep any columns not specified (if any)
    # Always return a dense NumPy array. With the default threshold (0.3) the
    # output silently becomes a SciPy sparse matrix once the one-hot block
    # makes the result sparse enough, and Step 8 builds a DataFrame from the
    # output, which requires a dense array.
    sparse_threshold=0,
)

print("\nColumnTransformer (Preprocessing Pipeline) created.")


# Step 7: Fit the Preprocessor on the Training Rows, Then Transform Everything
# The raw rows are split BEFORE fitting. Fitting the imputer/scaler/encoder on
# the full dataset would let test-set statistics (column means, variances)
# leak into the training features. The split depends only on the number of
# rows and random_state, so the same rows land in train/test as when the split
# is performed on already-processed data.
X_train_raw, X_test_raw, y_train_proc, y_test_proc = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the preprocessing parameters from the training rows only...
X_train_proc = preprocessor.fit_transform(X_train_raw)
# ...and re-use those learned parameters for the held-out rows.
X_test_proc = preprocessor.transform(X_test_raw)

# Full dataset transformed with the train-fitted preprocessor; used below for
# display and kept under its original name.
X_processed = preprocessor.transform(X)


# Step 8: Convert the result back to a DataFrame and Display
# Recover the one-hot column names from the fitted encoder; each generated name
# is prefixed with the original feature name.
cat_feature_names = (
    preprocessor.named_transformers_['cat']['onehot']
    .get_feature_names_out(categorical_features)
)
# Column order mirrors the ColumnTransformer: numeric block, then one-hot block.
# NOTE: this assumes `remainder` passes through no extra columns, i.e. every
# column of X is listed in numeric_features or categorical_features.
feature_names_out = numeric_features + list(cat_feature_names)

# Re-attach the original row labels so the frame stays aligned with `y`.
X_processed_df = pd.DataFrame(
    X_processed, columns=feature_names_out, index=X.index
)


print("\n--- Preprocessed Data (X_processed_df.head()) ---")
print(X_processed_df.head())
print(f"\nOriginal number of features: {X.shape[1]}")
print(f"New number of features (after One-Hot Encoding): {X_processed_df.shape[1]}")
print(f"Shape of Preprocessed Data: {X_processed_df.shape}")
print("-------------------------------------------------")

print("\nSuccessfully completed data preprocessing (Expt 02).")
print("The data is now scaled, missing values are imputed, and categorical features are encoded.")

# The train/test arrays produced in Step 7 are ready for training a model
# (the next logical step).
print(f"\nProcessed Training Set Shape (for ML): {X_train_proc.shape}")
print(f"Processed Testing Set Shape (for ML): {X_test_proc.shape}")

Libraries imported successfully.

--- Dataset Head (with artificial NaNs in AveBedrms) ---
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal ocean_proximity  
0    -122.23        4.526        NEAR BAY  
1    -122.22        3.585          ISLAND  
2    -122.24        3.521      NEAR OCEAN  
3    -122.25        3.413          ISLAND  
4    -122.25        3.422          ISLAND  
---------------------------------------------------------
Total rows with missing values before processing: 200

Numeric features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Populati