# Modelling

In [133]:
import os
import pickle
import pandas as pd

# Define the path to the datasets folder
datasets_folder = 'datasets'

# Define the path to the pickle file
pickle_file_path = os.path.join(datasets_folder, 'houses_final.pkl')

# Load the DataFrame from the pickle file
with open(pickle_file_path, 'rb') as file:
    houses = pickle.load(file)

print('DataFrame loaded from pickle file:')
houses.head()

DataFrame loaded from pickle file:


Unnamed: 0,property_type,price,city,baths,purpose,bedrooms,agency,Area(Marlas),rooms_per_marla
0,Flat,10000000,Islamabad,2,For Sale,2,,4.0,1.0
1,Flat,6900000,Islamabad,3,For Sale,3,,5.6,1.071429
2,House,16500000,Islamabad,6,For Sale,5,,8.0,1.375
3,House,43500000,Islamabad,4,For Sale,4,,40.0,0.2
4,House,7000000,Islamabad,3,For Sale,3,Has agent,8.0,0.75


In [1]:
from sklearn.model_selection import train_test_split





In [135]:
X = houses.drop(['price'],axis=1)
y = houses['price']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.15, random_state=42)  

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [136]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Define numerical and categorical columns
numcols = ['bedrooms', 'baths', 'Area(Marlas)','rooms_per_marla']
catcols = ['property_type', 'city', 'purpose', 'agency']

## Define root mean squared error function
def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Function to fit a model and return RMSE for train and validation data
def evaluate_model(model, X_train, y_train, X_val, y_val, numcols, catcols):
    # Define transformers for numerical and categorical columns
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numcols),
            ('cat', categorical_transformer, catcols)])

    # Create the final pipeline with the provided regressor
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model)])

    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Predict on the training data
    train_pred = pipeline.predict(X_train)

    # Predict on the validation data
    val_pred = pipeline.predict(X_val)

    # Calculate RMSE for training and validation sets
    train_rmse = root_mean_squared_error(y_train, train_pred)
    val_rmse = root_mean_squared_error(y_val, val_pred)

    # Print RMSE values
    print(f'Train RMSE: {train_rmse}')
    print(f'Val RMSE: {val_rmse}')
    
    return pipeline

### Testing out some base models before hypertuning the best performing one

In [137]:
# Linear Regression 
lin_reg = LinearRegression()
evaluate_model(lin_reg, X_train, y_train, X_val, y_val, numcols, catcols)

Train RMSE: 12490042.34069833
Val RMSE: 11944722.886404138


In [138]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(random_state=42)
evaluate_model(tree, X_train, y_train, X_val, y_val, numcols, catcols)

Train RMSE: 5035253.1808276465
Val RMSE: 6367126.58678491


In [139]:
#Random forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
evaluate_model(rf, X_train, y_train, X_val, y_val, numcols, catcols)

Train RMSE: 5122668.368292086
Val RMSE: 6101188.112174735


In [140]:
# XGBoost
from xgboost import XGBRegressor

xgb = XGBRegressor(random_state=42)

evaluate_model(xgb, X_train, y_train, X_val, y_val, numcols, catcols)


Train RMSE: 5781324.93969044
Val RMSE: 6147868.668257887


In [141]:
houses['price'].describe()  

count    1.684150e+05
mean     1.454978e+07
std      1.775212e+07
min      2.500000e+04
25%      1.750000e+05
50%      8.500000e+06
75%      1.950000e+07
max      6.500000e+07
Name: price, dtype: float64

In [142]:
# xgboost performed the best so we will iterate and tune it further  

final_model = XGBRegressor(n_estimators = 100, max_depth = 7, random_state = 42) 

final_pipeline = evaluate_model(final_model, X_train, y_train, X_val, y_val, numcols, catcols)

Train RMSE: 5603124.082452992
Val RMSE: 6123189.748731816


In [143]:
# final test on test set
evaluate_model(final_model,X_train,y_train,X_test,y_test,numcols,catcols)

Train RMSE: 5603124.082452992
Val RMSE: 5927671.705251386


### Testing on new data

In [144]:
import numpy as np
import pandas as pd

# Define a function to predict house price for a single instance
def predict_single_instance(input_data, model_pipeline):
    
    # Convert the input data (dictionary) into a pandas DataFrame (single row)
    input_df = pd.DataFrame([input_data])

    # Predict using the trained pipeline
    predicted_price = model_pipeline.predict(input_df)
    return predicted_price[0]

# Example user input 
user_input = {
    'bedrooms': 3,
    'baths': 2,
    'Area(Marlas)': 10,
    'rooms_per_marla': 0.3,  
    'total_rooms': 5,
    'property_type': 'House',
    'city': 'Lahore',
    'purpose': 'For Sale',
    'agency': 'Has agent'
}



# Predict price for the single instance
predicted_price = predict_single_instance(user_input, final_pipeline)

# Display the result
print(f"Predicted House Price: {predicted_price:.2f} PKR")


Predicted House Price: 16575392.00 PKR
