In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import pickle

Load dataset

In [2]:
# Read the raw listings CSV (path is relative to the working directory) into a DataFrame.
housing = pd.read_csv(filepath_or_buffer="pakistan_house_price_data.csv")

Filter for Islamabad and House properties

In [3]:
# Keep only rows whose city is 'Islamabad' and whose property type is 'House'.
# Both conditions are combined into a single boolean mask.
housing = housing.loc[
    housing['city'].eq('Islamabad') & housing['property_type'].eq('House')
]

Keep only properties that are for sale


In [4]:
# Discard every listing whose purpose is anything other than 'For Sale'.
housing = housing.loc[housing['purpose'].eq('For Sale')]

Rescale price to units of 100,000 (divide the raw price by 100,000)

In [5]:
# Express price in units of 100,000 so the target has a smaller numeric range.
# NOTE(review): presumably this converts PKR to lakh - confirm against the data source.
# Vectorised division replaces the row-by-row `apply`, and `assign` returns a new
# frame instead of writing into the result of the earlier boolean filters, which
# avoids pandas' SettingWithCopyWarning.
housing = housing.assign(price=housing['price'] / 100000)

Remove extremely high-priced houses

In [6]:
# Treat listings priced at 4000 or more (in the rescaled units of 100,000) as
# outliers and keep only the rows strictly below that cap.
housing = housing.loc[housing['price'].lt(4000)]

Drop unnecessary columns

In [7]:
# Remove columns that are not used further on:
#   1. the first column, selected by position (presumably a leftover index
#      column from the CSV export - confirm), then
#   2. identifier, URL, geography, and agency/agent metadata columns by name.
housing = (
    housing
    .drop(columns=housing.columns[[0]])
    .drop(columns=[
        'property_id', 'property_type', 'location_id', 'page_url', 'city', 'latitude',
        'province_name', 'longitude', 'purpose', 'date_added', 'agency', 'agent',
    ])
)

Clean and preprocess 'location' column

In [8]:
# Normalise the `location` labels and collapse rare ones into a single bucket.
#
# `.str.strip()` trims surrounding whitespace and leaves missing values as NaN,
# whereas calling `x.strip()` per element raises on NaN / non-string entries.
location_clean = housing['location'].str.strip()

# Number of listings per (stripped) location.
location_count = location_clean.value_counts()

# Locations with 10 or fewer listings (note: the threshold is inclusive, despite
# the "less_10" name, which is kept for compatibility with other cells).
location_count_less_10 = location_count[location_count <= 10]

# Replace every rare location with the literal label 'other'. Membership is
# tested explicitly against the index of the counts (the location names) rather
# than relying on `x in Series`, which silently checks the index, not the values.
# NOTE(review): counts are computed on the full dataset, i.e. before the
# train/test split, so the grouping is informed by test rows as well.
housing = housing.assign(
    location=location_clean.mask(
        location_clean.isin(location_count_less_10.index), 'other'
    )
)


Split data into features and target variable

In [9]:
# Target vector: the (rescaled) price column.
y = housing["price"]
# Feature frame: every remaining column except the target.
X = housing.drop("price", axis=1)

Train-test split

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


Define preprocessing pipeline

In [11]:
# Column-wise preprocessing applied ahead of every model. Columns not listed in
# either transformer are dropped (the column transformer's default remainder).
preprocessor = make_column_transformer(
    # Categorical: one-hot encode `location` as a dense array; categories not
    # seen during fit are ignored (encoded as all zeros) instead of raising.
    (
        OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        ['location'],
    ),
    # Numerical: standardise to zero mean and unit variance.
    (
        StandardScaler(),
        ['baths', 'bedrooms', 'Total_Area'],
    ),
)

Train different models

In [12]:
# Candidate regressors, keyed by the display name used when reporting scores.
# All estimators use their default hyper-parameters.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(),
    "Ridge Regression": Ridge(),
    # Random forests are stochastic; seed the estimator (same seed as the
    # train/test split) so the reported R² and the saved model are reproducible.
    "Random Forest": RandomForestRegressor(random_state=0),
}

In [13]:
# Fit every candidate model, report its R² on the held-out set, and remember
# the best-scoring fitted pipeline.
best_model = None
best_r2 = float('-inf')

for name, model in models.items():
    # Preprocessing + estimator in one pipeline, so the saved artefact can be fed
    # raw feature frames. The same `preprocessor` instance is reused (and refit on
    # the same training data) by every pipeline.
    pipe = make_pipeline(preprocessor, model)
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    score = r2_score(y_test, y_pred)
    print(f"{name} R² Score: {score:.4f}")

    # Track the winner inside the loop. Doing this comparison after the loop
    # only ever examines the last model trained, regardless of its score.
    # NOTE(review): selection uses the test split, so the winning score is an
    # optimistic estimate of generalisation performance.
    if score > best_r2:
        best_r2 = score
        best_model = pipe

Linear Regression R² Score: 0.7665
Lasso Regression R² Score: 0.7573
Ridge Regression R² Score: 0.7671
Random Forest R² Score: 0.8618


Select the best model based on R² score

In [14]:
# Record `pipe` as the best model when its `score` beats the best R² seen so far.
# NOTE(review): this cell runs once, after the training loop has finished, so
# `score` and `pipe` hold whatever the final loop iteration left behind; only
# that last model is compared here.
if best_r2 < score:
    best_model, best_r2 = pipe, score

Save the best-performing model

In [15]:

# Persist the selected pipeline to disk.
# Fail loudly rather than silently pickling `None` if no model was selected.
if best_model is None:
    raise ValueError("No fitted model to save: best_model is None.")

# The context manager guarantees the file is flushed and closed even if
# pickling raises; a bare `open(...)` passed to `pickle.dump` is never closed.
# Reminder: only unpickle this file from a trusted location.
with open("isb_house_price_pred.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

In [17]:
# Final status message confirming where the model artefact was written.
print(
    "Model training complete. "
    "Best model saved as 'isb_house_price_pred.pkl'."
)

Model training complete. Best model saved as 'isb_house_price_pred.pkl'.
