In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler


In [None]:
# Read the raw housing data from disk into a DataFrame
raw_csv_path = "housing_prices.csv"
df = pd.read_csv(raw_csv_path)

In [None]:
# Handle missing values: fill gaps in the numeric columns with each column's median.
# "area" is imputed together with the others because it is scaled later and used
# as the divisor for price_per_sqft; left as NaN it would be silently turned into
# 0 by the fillna(0) calls further down.
impute_columns = ["area", "rooms", "bathrooms", "price"]
num_imputer = SimpleImputer(strategy="median")
df[impute_columns] = num_imputer.fit_transform(df[impute_columns])

In [None]:
# Encode categorical features: one-hot encode "location" into dense indicator
# columns (sparse_output=False returns a plain ndarray rather than a sparse matrix).
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_categories = encoder.fit_transform(df[["location"]])
encoded_df = pd.DataFrame(
    encoded_categories,
    columns=encoder.get_feature_names_out(["location"]),
    # Reuse df's row labels: concat(axis=1) aligns on the index, so a fresh
    # RangeIndex here would misalign rows (and introduce NaN-filled rows)
    # whenever df does not carry a default 0..n-1 index.
    index=df.index,
)
# Swap the original text column for its one-hot expansion
df = pd.concat([df.drop(columns=["location"]), encoded_df], axis=1)

In [None]:
# Normalize numerical features using MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 100))
numerical_features = ["area", "rooms", "bathrooms", "price"]
df[numerical_features] = scaler.fit_transform(df[numerical_features])


In [None]:
# Feature Engineering: Creating new features
df["price_per_sqft"] = (df["price"] / df["area"]).fillna(0).replace([np.inf, -np.inf], 0)  # Handle division by zero
df["total_rooms"] = df["rooms"] + df["bathrooms"]

In [None]:
# Sanitize the frame ahead of the integer cast:
# first turn every NaN into 0, then map +inf / -inf to 0 as well
df = df.fillna(0)
df = df.replace([np.inf, -np.inf], 0)

In [None]:
# Round every value to the nearest whole number, then cast all columns to int
# so the saved table is easier to read
rounded = df.round(decimals=0)
df = rounded.astype(int)

In [None]:
# Write the processed frame to CSV, leaving the row index out of the file
output_csv_path = "processed_housing_prices.csv"
df.to_csv(output_csv_path, index=False)

In [None]:
# Show the leading rows of the processed data as plain text
preview_rows = df.head()
print(preview_rows)

   area  rooms  bathrooms  price  location_Downtown  location_Rural  \
0    37     50         50     46                  0               1   
1    88     50         50     30                  1               0   
2    63      0          0     23                  0               1   
3    68     75        100     13                  0               1   
4    89      0        100     46                  1               0   

   location_Suburb  price_per_sqft  total_rooms  
0                0               1          100  
1                0               0          100  
2                0               0            0  
3                0               0          175  
4                0               1          100  
