In [1]:
import pandas as pd

In [2]:
# Read the house listings CSV (path is relative to the notebook's working directory).
data = pd.read_csv("data/houses-for-sale.csv")

In [3]:
# Preview the first rows to see which columns were parsed and what the raw values look like.
data.head()

Unnamed: 0,title,location,size,selling price
0,6 Bed Villa with En Suite at Kiambu Road,Kiambu Road,"6bdrm Mansion in Mushroom Estate, Kiambu / Kia...","KSh 130,000,000"
1,4 Bed Townhouse with En Suite at Off Kiambu Road,"Off Kiambu Road, Kiambu Road",4 Bedroom with DSQ for Sale,"KSh 26,950,000"
2,4 Bed Villa with En Suite at Masai Lodge Road,"masai lodge road, Ongata Rongai",New modern villa for sale.,"KSh 16,000,000"
3,5 Bed House with En Suite in Ngong Road,Ngong Road,5 Bedroom Town House For Sale In Kilimani,"KSh 45,000,000"
4,5 Bed Townhouse with En Suite in Lavington,Lavington,Modern 5 Bedroom Townhouse For Sale,"KSh 72,000,000"


In [4]:

# Turn price strings such as "KSh 26,950,000" into floats.
# - astype(str) keeps this cell re-runnable: once the column is numeric the
#   .str accessor would otherwise raise AttributeError on a second run.
# - regex=False forces literal replacement, independent of the pandas
#   version's default for `regex`.
# - errors='coerce' turns anything that still cannot be parsed into NaN, so
#   check data.isna().sum() afterwards to see how many prices were lost.
data['selling price'] = pd.to_numeric(
    data['selling price']
    .astype(str)
    .str.replace('KSh', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip(),
    errors='coerce',
)


In [5]:
# Preview again: 'selling price' should now hold plain numbers instead of "KSh ..." strings.
data.head()

Unnamed: 0,title,location,size,selling price
0,6 Bed Villa with En Suite at Kiambu Road,Kiambu Road,"6bdrm Mansion in Mushroom Estate, Kiambu / Kia...",130000000.0
1,4 Bed Townhouse with En Suite at Off Kiambu Road,"Off Kiambu Road, Kiambu Road",4 Bedroom with DSQ for Sale,26950000.0
2,4 Bed Villa with En Suite at Masai Lodge Road,"masai lodge road, Ongata Rongai",New modern villa for sale.,16000000.0
3,5 Bed House with En Suite in Ngong Road,Ngong Road,5 Bedroom Town House For Sale In Kilimani,45000000.0
4,5 Bed Townhouse with En Suite in Lavington,Lavington,Modern 5 Bedroom Townhouse For Sale,72000000.0


In [6]:
# Show the dtype of every column to confirm which ones are numeric.
data.dtypes


title             object
location          object
size              object
selling price    float64
dtype: object

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,selling price
count,1848.0
mean,61015780.0
std,70844520.0
min,2700000.0
25%,16499750.0
50%,39500000.0
75%,80000000.0
max,684000000.0


In [8]:
# Count missing values per column.
data.isna().sum()

title             0
location          0
size              0
selling price    28
dtype: int64

In [9]:
# Replace missing prices with the mean of the known prices.
# NOTE(review): 'selling price' is used as the regression target further down;
# consider dropping rows with a missing price instead of imputing them.
mean_selling_price = data['selling price'].mean()

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on data['selling price']: that form is chained assignment, which emits a
# FutureWarning in pandas 2.x and does not modify `data` under Copy-on-Write.
data['selling price'] = data['selling price'].fillna(mean_selling_price)

In [10]:
# Re-count missing values per column to confirm the imputation left none behind.
data.isna().sum()

title            0
location         0
size             0
selling price    0
dtype: int64

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from scipy.sparse import hstack
import pandas as pd  # Import pandas

# Example for regression model to predict price
# Features are the three text columns; the target is the numeric price.
X = data[['title', 'location', 'size']]
y = data['selling price']

# Convert the free-text title into sparse TF-IDF features
title_encoder = TfidfVectorizer()
X_title = title_encoder.fit_transform(X['title'])

# Work on a copy so the encoded integers do not overwrite the text columns of X
X_copy = X.copy()

# Use one LabelEncoder per column. Re-fitting a single shared encoder on
# 'size' would discard the mapping learned for 'location', making it
# impossible to encode a new location the same way later.
# NOTE(review): label encoding imposes an arbitrary ordering on the
# categories; acceptable for tree models, but worth revisiting.
location_encoder = LabelEncoder()
size_encoder = LabelEncoder()
X_copy['location'] = location_encoder.fit_transform(X_copy['location'])
X_copy['size'] = size_encoder.fit_transform(X_copy['size'])

# Combine all features using hstack: two encoded columns followed by the TF-IDF block
X_combined = hstack([X_copy['location'].values.reshape(-1, 1), X_copy['size'].values.reshape(-1, 1), X_title])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train and evaluate a regression model (e.g., Random Forest).
# The forest is seeded so the reported error is the same on every run;
# an unseeded forest gives a slightly different score each time.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)

print(f'Mean Squared Error: {mse}')

Mean Squared Error: 4819682167634898.0


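A **Mean Squared Error (MSE)** of roughly 4.8e+15 is extremely high and indicates that the model's predictions are far from the actual values. Because MSE is expressed in squared shillings, its square root is easier to interpret: about KSh 69 million, which is close to the standard deviation of the selling prices reported by `describe()` above (about KSh 70.8 million), so the model does little better than always predicting the mean price. Possible causes include the choice of model, the feature engineering, and the hyperparameters, so make sure you understand your data well before building the model.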


Let's see whether the result changes when we switch to a different ensemble model: gradient-boosted trees (XGBoost) instead of a random forest.

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Example for regression model to predict price
# Features are the three text columns; the target is the numeric price.
X = data[['title', 'location', 'size']]
y = data['selling price']

# Convert the free-text title into sparse TF-IDF features.
# `title_encoder` is kept so new titles can be transformed the same way at prediction time.
title_encoder = TfidfVectorizer()
X_title = title_encoder.fit_transform(X['title'])

# Work on a copy so the encoded integers do not overwrite the text columns of X
X_copy = X.copy()

# Use one LabelEncoder per column and keep both: the prediction cell needs
# `location_encoder` and `size_encoder` to encode user input with the same
# integer codes the model was trained on. A single shared encoder would only
# remember the mapping of the column it was fitted on last.
location_encoder = LabelEncoder()
size_encoder = LabelEncoder()
X_copy['location'] = location_encoder.fit_transform(X_copy['location'])
X_copy['size'] = size_encoder.fit_transform(X_copy['size'])
le = size_encoder  # backward-compatible alias: `le` used to end up fitted on 'size'

# Combine all features using hstack: two encoded columns followed by the TF-IDF block
X_combined = hstack([X_copy['location'].values.reshape(-1, 1), X_copy['size'].values.reshape(-1, 1), X_title])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Create an XGBoost regression model.
# The seed is pinned so results stay reproducible if stochastic options
# (e.g. row or column subsampling) are enabled later.
model = xgb.XGBRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test data
predictions = model.predict(X_test)

# Calculate the Mean Squared Error
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 4171730408363772.0


In [25]:
import json 

# Serialize the model as a JSON string
model_json = model.get_booster().get_dump(dump_format='json')

# Save the JSON string to a file
with open('models/sample_model.json', 'w') as f:
    json.dump(model_json, f)

In [33]:
import xgboost as xgb

# Load your saved XGBoost model from the JSON file (replace 'your_model_filename.json' with the actual filename)

model = xgb.Booster(model_file='models/sample_model.model')

# Define a function to make predictions (use your existing code)
def predict_selling_price(title, location, size):
    # Convert the user input into numerical features
    title_encoded = title_encoder.transform([title])
    
    try:
        location_encoded = location_encoder.transform([location])
    except ValueError:
        # Handle unseen location labels, e.g., assign a default label
        location_encoded = location_encoder.transform(['Unknown'])
    
    try:
        size_encoded = size_encoder.transform([size])
    except ValueError:
        # Handle unseen size labels, e.g., assign a default label
        size_encoded = size_encoder.transform(['Unknown'])
    
    # Combine the numerical features using hstack
    features = hstack([location_encoded.reshape(1, -1), size_encoded.reshape(1, -1), title_encoded.reshape(1, -1)])
    
    # Predict the selling price
    predicted_price = model.predict(xgb.DMatrix(features))
    
    return predicted_price[0]

# Get user input
title = input("Enter the house title: ")
location = input("Enter the location: ")
size = input("Enter the size: ")

# Predict selling price
predicted_price = predict_selling_price(title, location, size)
print(f"Predicted Selling Price: {predicted_price:.2f} KSh")


XGBoostError: [18:07:16] /workspace/dmlc-core/src/io/local_filesys.cc:209: Check failed: allow_null:  LocalFileSystem::Open "models/sample_model.model": No such file or directory
Stack trace:
  [bt] (0) /home/grayhat/anaconda3/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x1ba0be) [0x7f9f01bba0be]
  [bt] (1) /home/grayhat/anaconda3/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0xcc8dc7) [0x7f9f026c8dc7]
  [bt] (2) /home/grayhat/anaconda3/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0xcb4c5e) [0x7f9f026b4c5e]
  [bt] (3) /home/grayhat/anaconda3/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(XGBoosterLoadModel+0x18e) [0x7f9f01b6e93e]
  [bt] (4) /home/grayhat/anaconda3/lib/python3.9/lib-dynload/../../libffi.so.7(+0x69dd) [0x7f9f5d0c79dd]
  [bt] (5) /home/grayhat/anaconda3/lib/python3.9/lib-dynload/../../libffi.so.7(+0x6067) [0x7f9f5d0c7067]
  [bt] (6) /home/grayhat/anaconda3/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x140f6) [0x7f9f5d0e10f6]
  [bt] (7) /home/grayhat/anaconda3/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x1073e) [0x7f9f5d0dd73e]
  [bt] (8) /home/grayhat/anaconda3/bin/python(_PyObject_MakeTpCall+0x37f) [0x5569115959ef]

