<a href="https://colab.research.google.com/github/LbNaveen/MLDA/blob/main/Program_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Housing-price preprocessing script.
#
# Loads the housing CSV, builds imputation/scaling/one-hot pipelines, splits the
# rows into train and test sets, and produces model-ready feature matrices.
# The preprocessor is fitted on the training rows only, and no column derived
# from the target ('price') is placed in the feature matrices, so the held-out
# test set stays free of information leakage.

# Step 1: Load dataset
data = pd.read_csv("Housing[1].csv")  # from Kaggle housing prices dataset
print("First 5 rows:\n", data.head(), "\n")

# Step 2: Identify feature types
# 'price' is the prediction target. It is listed among the numeric columns for
# reporting, but it is excluded from the model inputs below.
target = 'price'
num_features = ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']
cat_features = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
num_inputs = [f for f in num_features if f != target]

print("Numeric features:", num_features)
print("Categorical features:", cat_features)

# Step 3: Define preprocessing pipelines
# Numeric columns: fill gaps with the column mean, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are encoded as all zeros.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_inputs),
    ("cat", cat_pipeline, cat_features),
])
# Return DataFrames (with the input's index) instead of bare NumPy arrays.
preprocessor.set_output(transform="pandas")

# Step 4: Train-test split on the raw rows
# Splitting BEFORE fitting keeps test rows out of the imputer/scaler/encoder
# statistics. The split depends only on the number of rows and random_state,
# so the train/test membership is the same as splitting the transformed matrix.
y = data[[target]]
raw_train, raw_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42
)

# Step 5: Apply transformations (fit on train only, reuse on test)
X_train = preprocessor.fit_transform(raw_train)
X_test = preprocessor.transform(raw_test)

# Full feature matrix in the original row order, kept for inspection and for
# any later code that expects `X`.
X = pd.concat([X_train, X_test]).reindex(data.index)

print("Processed features (first 5 rows):\n", X.head(), "\n")
print("Target variable (first 5 rows):\n", y.head(), "\n")

# Step 6: Feature engineering (price per area)
# price_per_sqft is computed FROM the target, so it must never be a model
# input. It is kept as a standalone Series for exploration and shown next to
# the features, but it is not stored in X / X_train / X_test.
# The +1 in the denominator guards against division by zero.
price_per_sqft = (y[target] / (data['area'] + 1)).rename('price_per_sqft')
print("After feature engineering:\n", X.join(price_per_sqft).head(), "\n")

print("X_train sample:\n", X_train.head(), "\n")
print("y_train sample:\n", y_train.head(), "\n")

First 5 rows:
       price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished   

Numeric features: ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']