In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("Nairobi propertyprices - Sheet1.csv")

# Preview the data
df.head()

Unnamed: 0,Price,propertyType,Location,Bedroom,bathroom,House size,Land size
0,KSh 350 000 000,Townhouse,Runda,4.0,4.0,,0.5 acres
1,KSh 30 000 000,Vacant Land,Karen,,,,0.5 acres
2,KSh 325 000 000,Vacant Land,Westlands,,,,0.5 acres
3,KSh 80 000 000,Townhouse,Kitisuru,5.0,5.0,,0.5 acres
4,KSh 25 500 000,Apartment,Kileleshwa,4.0,4.0,230 m²,


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pickle

In [3]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


In [4]:
import seaborn as sns

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x="Location", y="Price_clean")
plt.xticks(rotation=45)
plt.title("Distribution of Property Prices by Location")
plt.ylabel("Price (KSh)")
plt.tight_layout()
plt.show()

ValueError: Could not interpret value `Price_clean` for `y`. An entry with this name does not appear in `data`.

<Figure size 1200x600 with 0 Axes>

In [7]:
# Clean the "Price" column
df["Price_clean"] = (
    df["Price"]
    .str.replace("KSh", "", regex=False)
    .str.replace(",", "")
    .str.replace(" ", "")
    .str.strip()
    .astype(float)
)

# Check if it worked
print(df[["Price", "Price_clean"]].head())

ValueError: could not convert string to float: 'Ksh200000000'

In [8]:
# Standardize and clean the "Price" column
df["Price_clean"] = (
    df["Price"]
    .str.replace(r"[^\d]", "", regex=True)  # Remove non-digit characters
    .astype(float)
)

# Preview the cleaned price
print(df[["Price", "Price_clean"]].head())

             Price  Price_clean
0  KSh 350 000 000  350000000.0
1   KSh 30 000 000   30000000.0
2  KSh 325 000 000  325000000.0
3   KSh 80 000 000   80000000.0
4   KSh 25 500 000   25500000.0


In [9]:
# Remove "acres" and convert to float
df["Land size"] = (
    df["Land size"]
    .str.replace("acres", "", case=False)
    .str.strip()
    .astype(float)
)

In [10]:
# Fill missing values with mean for numeric columns
numeric_cols = ["Bedroom", "bathroom", "House size", "Land size"]
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [11]:
# Clean "House size" column by removing "m²" and converting to float
df["House size"] = (
    df["House size"]
    .str.replace("m²", "", case=False)
    .str.strip()
    .astype(float)
)

ValueError: could not convert string to float: '399 "'

In [12]:
# Clean "House size" by removing all non-numeric characters except dot and converting to float
df["House size"] = (
    df["House size"]
    .astype(str)  # Ensure string type
    .str.replace(r"[^\d.]", "", regex=True)  # Remove everything except numbers and dot
    .replace("", np.nan)  # Replace empty strings with NaN
    .astype(float)
)

In [13]:
# Fill missing values
numeric_cols = ["Bedroom", "bathroom", "House size", "Land size"]
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

In [14]:
# Clean "Land size" by removing "acres" and converting to float
df["Land size"] = (
    df["Land size"]
    .astype(str)
    .str.replace(r"[^\d.]", "", regex=True)  # Keep only numbers and dots
    .replace("", np.nan)
    .astype(float)
)

In [15]:
# Fill missing values in numeric columns
numeric_cols = ["Bedroom", "bathroom", "House size", "Land size"]
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

In [16]:
# Clean "House size" column
df["House size"] = (
    df["House size"]
    .astype(str)
    .str.replace(r"[^\d.]", "", regex=True)  # Remove everything except numbers and dots
    .replace("", np.nan)
    .astype(float)
)

# Clean "Land size" column
df["Land size"] = (
    df["Land size"]
    .astype(str)
    .str.replace(r"[^\d.]", "", regex=True)  # Remove everything except numbers and dots
    .replace("", np.nan)
    .astype(float)
)

In [17]:
# Fill missing values in numeric columns
numeric_cols = ["Bedroom", "bathroom", "House size", "Land size"]
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

In [18]:
# Clean "House size" column
df["House size"] = (
    df["House size"]
    .astype(str)
    .str.replace(r"[^\d.]", "", regex=True)  # Remove everything except numbers and dots
    .replace("", np.nan)
    .astype(float)
)

# Clean "Land size" column
df["Land size"] = (
    df["Land size"]
    .astype(str)
    .str.replace(r"[^\d.]", "", regex=True)  # Remove everything except numbers and dots
    .replace("", np.nan)
    .astype(float)
)

In [19]:
# Apply log transformation to the target (Price_clean)
df["price_log"] = np.log(df["Price_clean"])

# Check the result
df[["Price_clean", "price_log"]].head()

Unnamed: 0,Price_clean,price_log
0,350000000.0,19.673444
1,30000000.0,17.216708
2,325000000.0,19.599336
3,80000000.0,18.197537
4,25500000.0,17.054189


In [20]:
# One-hot encode the 'Location' column
df = pd.get_dummies(df, columns=["Location"], prefix="Location")

# View the updated dataframe
df.head()

Unnamed: 0,Price,propertyType,Bedroom,bathroom,House size,Land size,Price_clean,price_log,Location_Kabete,Location_Karen,...,Location_Ongata Rongai,Location_Parklands,Location_Riverside,Location_Rosslyn,Location_Runda,Location_Syokimau,Location_Thigiri,Location_Thome,Location_Waithaka,Location_Westlands
0,KSh 350 000 000,Townhouse,4.0,4.0,199.171717,0.5,350000000.0,19.673444,False,False,...,False,False,False,False,True,False,False,False,False,False
1,KSh 30 000 000,Vacant Land,3.513661,2.906077,199.171717,0.5,30000000.0,17.216708,False,True,...,False,False,False,False,False,False,False,False,False,False
2,KSh 325 000 000,Vacant Land,3.513661,2.906077,199.171717,0.5,325000000.0,19.599336,False,False,...,False,False,False,False,False,False,False,False,False,True
3,KSh 80 000 000,Townhouse,5.0,5.0,199.171717,0.5,80000000.0,18.197537,False,False,...,False,False,False,False,False,False,False,False,False,False
4,KSh 25 500 000,Apartment,4.0,4.0,230.0,0.880469,25500000.0,17.054189,False,False,...,False,False,False,False,False,False,False,False,False,False


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Input features: numeric + one-hot encoded locations
X = df.drop(columns=["Price", "propertyType", "Price_clean", "price_log"])
X = X.astype(float)  # Ensure all inputs are numeric

# Target
y = df["price_log"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

print("✅ Model training complete.")

✅ Model training complete.


In [22]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"✅ RMSE: {rmse:.2f}")
print(f"✅ R² Score: {r2:.2f}")

✅ RMSE: 0.54
✅ R² Score: 0.83


In [23]:
import pickle

# Save model
with open("MortgageApp_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("✅ Model saved successfully!")

✅ Model saved successfully!
