In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [18]:
df = pd.read_csv("melb_data.csv")  # ensure file is in same folder
print("Dataset Shape:", df.shape)
print(df.head())

Dataset Shape: (18396, 22)
   Unnamed: 0      Suburb           Address  Rooms Type      Price Method  \
0           1  Abbotsford      85 Turner St      2    h  1480000.0      S   
1           2  Abbotsford   25 Bloomburg St      2    h  1035000.0      S   
2           4  Abbotsford      5 Charles St      3    h  1465000.0     SP   
3           5  Abbotsford  40 Federation La      3    h   850000.0     PI   
4           6  Abbotsford       55a Park St      4    h  1600000.0     VB   

  SellerG       Date  Distance  ...  Bathroom  Car  Landsize  BuildingArea  \
0  Biggin  3/12/2016       2.5  ...       1.0  1.0     202.0           NaN   
1  Biggin  4/02/2016       2.5  ...       1.0  0.0     156.0          79.0   
2  Biggin  4/03/2017       2.5  ...       2.0  0.0     134.0         150.0   
3  Biggin  4/03/2017       2.5  ...       2.0  1.0      94.0           NaN   
4  Nelson  4/06/2016       2.5  ...       1.0  2.0     120.0         142.0   

   YearBuilt  CouncilArea  Lattitude Long

In [19]:
df = df.drop(columns=["Address", "SellerG", "Date", "CouncilArea", "Regionname", "Propertycount"], errors="ignore")
X = df.drop("Price", axis=1)
y = df["Price"]

num_cols = X.select_dtypes(include=np.number).columns.tolist()
cat_cols = X.select_dtypes(include="object").columns.tolist()

print("\nNumerical Columns:", num_cols)
print("Categorical Columns:", cat_cols)



Numerical Columns: ['Unnamed: 0', 'Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
Categorical Columns: ['Suburb', 'Type', 'Method']


In [20]:

num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])



In [21]:
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

In [22]:
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("\nShape Before Preprocessing:", X_train.shape)
print("Shape After Preprocessing:", X_train_processed.shape)


Shape Before Preprocessing: (14716, 15)
Shape After Preprocessing: (14716, 341)


In [24]:
cat_features = preprocessor.named_transformers_["cat"]["encoder"].get_feature_names_out(cat_cols)
processed_features = np.concatenate([num_cols, cat_features])

X_train_df = pd.DataFrame(X_train_processed.toarray() if hasattr(X_train_processed, "toarray") else X_train_processed,
columns=processed_features)

print("\nPreview of Processed Training Data:")
print(X_train_df.head())


Preview of Processed Training Data:
   Unnamed: 0     Rooms  Distance  Postcode  Bedroom2  Bathroom       Car  \
0   -1.596158  1.105319 -0.120422 -0.052066  0.077237 -0.663154  0.343081   
1   -0.961268  1.105319 -0.961817  0.143419  1.221871  0.860971  1.483275   
2   -1.512130  0.065966  0.572492  0.585832  0.077237 -0.663154  0.343081   
3   -0.727924  1.105319 -0.582364 -0.710540  1.221871 -0.663154  1.483275   
4    0.900362  0.065966  2.073806  0.287460  0.077237  0.860971  0.343081   

   Landsize  BuildingArea  YearBuilt  ...  Suburb_Yallambie  \
0 -0.023816     -0.031128   0.079701  ...               0.0   
1 -0.086369      0.115414   1.344041  ...               0.0   
2  0.012080     -0.031128   0.079701  ...               0.0   
3  0.001258      0.142058  -2.686045  ...               0.0   
4 -0.007188     -0.023134  -0.315406  ...               0.0   

   Suburb_Yarraville  Type_h  Type_t  Type_u  Method_PI  Method_S  Method_SA  \
0                0.0     1.0     0.0     