In [67]:
# Import libraries
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [68]:
# Read the raw listings CSV from disk into the working DataFrame
raw_listings_path = "C:/Users/User/Desktop/Nairobi House Price Prediction/data/raw_listings.csv"
df = pd.read_csv(raw_listings_path)

In [69]:
# Inspect the DataFrame dimensions as a (rows, columns) tuple
df.shape


(501, 8)

# DATA CLEANING

## Handling Duplicates


In [70]:
# Count rows that are exact repeats of an earlier row across all columns
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")


Number of duplicate rows: 233


In [71]:
# Remove duplicate rows; drop_duplicates() returns a new DataFrame, which is rebound to df
df = df.drop_duplicates()


In [72]:
# Re-count duplicate rows after de-duplication to confirm none remain
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")


Number of duplicate rows: 0


## Handle missing values

In [73]:
# Count missing (null) values in each column
df.isnull().sum()


location          0
property_type     0
bedrooms          0
bathrooms         0
size_sqft        20
amenities         3
price_kes         0
listing_date      0
dtype: int64

In [74]:
# Impute missing size_sqft values with the median size of the same property_type
def _fill_with_group_median(sizes):
    """Replace missing entries in one group's sizes with that group's median."""
    return sizes.fillna(sizes.median())

df['size_sqft'] = df.groupby('property_type')['size_sqft'].transform(_fill_with_group_median)


In [75]:
# Replace missing amenities with the placeholder string 'None'
df['amenities'] = df['amenities'].fillna('None')



In [76]:
# Re-count missing values per column after the fills above
df.isnull().sum()


location         0
property_type    0
bedrooms         0
bathrooms        0
size_sqft        0
amenities        0
price_kes        0
listing_date     0
dtype: int64

## Standardize location names

In [77]:
# Known short forms of a location, keyed by their title-cased spelling
location_aliases = {
    'Nrb': 'Nairobi',
    'Nrb City': 'Nairobi',
}

# Trim surrounding whitespace, title-case, then map aliases to the canonical name
df['location'] = (
    df['location']
    .str.strip()
    .str.title()
    .replace(location_aliases)
)

In [78]:
#standardize text columns
def standardize_text_fields(df, text_cols=None):
    """
    Return a copy of ``df`` whose text columns are title-cased and trimmed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, so the caller's object is left untouched.
    text_cols : list of str, optional
        Columns to normalise. When omitted, ``['location', 'property_type']``
        is used.

    Returns
    -------
    pandas.DataFrame
        The normalised copy.
    """
    columns = ['location', 'property_type'] if text_cols is None else text_cols

    cleaned = df.copy()
    for name in columns:
        title_cased = cleaned[name].str.title()
        cleaned[name] = title_cased.str.strip()

    return cleaned


## Convert size units


In [79]:
# Convert 'size_m2' to 'size_sqft' when the dataset provides a metric size column.
# The factor 10.7639 is the number of square feet in one square metre.
if 'size_m2' in df.columns:
    converted_sqft = df['size_m2'] * 10.7639
    if 'size_sqft' in df.columns:
        # Where size_m2 is missing, keep the existing square-foot value
        # instead of overwriting it with NaN.
        converted_sqft = converted_sqft.fillna(df['size_sqft'])
    df['size_sqft'] = converted_sqft


## Remove extreme outliers

In [80]:
# Remove price outliers: keep rows whose price_kes lies between
# 1,000,000 and 500,000,000 (both bounds inclusive).
price_in_range = (df['price_kes'] >= 1_000_000) & (df['price_kes'] <= 500_000_000)
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments in later cells do not trigger SettingWithCopyWarning on pandas
# versions without copy-on-write.
df = df[price_in_range].copy()
print(f"After price filtering, 1M-500M: {len(df)} rows remain")

After price filtering, 1M-500M: 267 rows remain


In [81]:
def filter_price_outliers(df, min_price=1_000_000, max_price=500_000_000):
    """
    Keep only listings whose ``price_kes`` is within ``[min_price, max_price]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings with a ``price_kes`` column. The input is copied first and
        is not modified.
    min_price, max_price : numeric
        Inclusive lower and upper price bounds.

    Returns
    -------
    pandas.DataFrame
        The rows inside the range, with their original index labels.
    """
    listings = df.copy()
    not_too_cheap = listings['price_kes'] >= min_price
    not_too_expensive = listings['price_kes'] <= max_price
    return listings[not_too_cheap & not_too_expensive]


In [82]:
def filter_size_outliers(df, min_size_sqft=200, max_size_sqft=20_000):
    """
    Drop listings whose ``size_sqft`` is missing or outside the allowed range.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings with a ``size_sqft`` column. The input is copied first and
        is not modified.
    min_size_sqft, max_size_sqft : numeric
        Inclusive lower and upper size bounds.

    Returns
    -------
    pandas.DataFrame
        The rows with a present, in-range size, with their original index labels.
    """
    listings = df.copy()
    sizes = listings['size_sqft']
    # A missing size fails both comparisons, but the check is kept explicit.
    keep = sizes.notna() & (sizes >= min_size_sqft) & (sizes <= max_size_sqft)
    return listings[keep]


## Suspicious Size Decimal Errors

In [83]:
def fix_suspicious_sizes(df, min_size_sqft=200, suspicious_price_threshold=10_000_000):
    """
    Scale up sizes that look like decimal-place errors.

    A row is treated as suspicious when its ``size_sqft`` is below
    ``min_size_sqft`` while its ``price_kes`` is above
    ``suspicious_price_threshold``. The size of such rows is multiplied by 10.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings with ``size_sqft`` and ``price_kes`` columns. The input is
        copied first and is not modified.
    min_size_sqft : numeric
        Sizes strictly below this value are candidates for correction.
    suspicious_price_threshold : numeric
        Prices strictly above this value mark a small size as suspicious.

    Returns
    -------
    pandas.DataFrame
        The corrected copy.
    """
    corrected = df.copy()

    too_small = corrected['size_sqft'] < min_size_sqft
    high_priced = corrected['price_kes'] > suspicious_price_threshold
    needs_fix = too_small & high_priced

    corrected.loc[needs_fix, 'size_sqft'] = corrected.loc[needs_fix, 'size_sqft'] * 10
    return corrected


## Handle amenities

In [84]:
def handle_amenities(df):
    """
    Return a copy of ``df`` with missing amenities replaced by the string 'None'.

    The placeholder is the literal text ``'None'`` (not an empty string and not
    Python's ``None``). It is the same value that ``count_amenities`` in this
    notebook treats as "no amenities".

    Parameters
    ----------
    df : pandas.DataFrame
        Listings with an ``amenities`` column. The input is copied first and
        is not modified.

    Returns
    -------
    pandas.DataFrame
        The copy with no missing values in ``amenities``.
    """
    df = df.copy()
    df['amenities'] = df['amenities'].fillna('None')
    return df


# FEATURE ENGINEERING

In [85]:
# Price per square foot.
# A size of 0 is treated as missing, so the ratio becomes NaN instead of an
# infinite value that would distort later statistics.
df['price_per_sqft'] = df['price_kes'] / df['size_sqft'].replace(0, np.nan)
print("DONE")

DONE


In [86]:
# Create the amenity score.
# Counts the amenities each property has by splitting the comma-separated text,
# e.g. "Parking, Pool, Gym" -> 3 amenities.

def count_amenities(amenities_str):
    """
    Count the amenities listed in a comma-separated string.

    Parameters
    ----------
    amenities_str : str or missing value
        Comma-separated amenity names, e.g. ``"Parking, Pool, Gym"``.

    Returns
    -------
    int
        The number of non-empty entries. Missing values, blank strings and
        the placeholder ``'None'`` all count as 0.
    """
    if pd.isna(amenities_str):
        return 0

    text = str(amenities_str).strip()
    if text == 'None':
        return 0

    # Count real entries rather than commas, so an empty string or a
    # trailing/doubled comma does not add phantom amenities.
    return sum(1 for item in text.split(',') if item.strip())

# Apply count_amenities to each row's amenities text to build the score column
df['amenity_score'] = df['amenities'].apply(count_amenities)
# NOTE(review): this prints the literal label "amenity_score", not the column's values
print("amenity_score")

amenity_score


In [87]:
# Parse listing_date into datetimes, then derive the calendar month number from it
parsed_listing_dates = pd.to_datetime(df['listing_date'])
df['listing_date'] = parsed_listing_dates
df['month'] = parsed_listing_dates.dt.month


## SAVE TO CLEAN_LISTINGS CSV

In [88]:
# Write the cleaned DataFrame to CSV; index=False leaves the row index out of the file
# NOTE(review): absolute, machine-specific path — consider a configurable data directory
df.to_csv("C:/Users/User/Desktop/Nairobi House Price Prediction/data/clean_listings.csv", index=False)