In [2]:
import pandas as pd

In [3]:
# Read the raw listings CSV into the working DataFrame `df`.
# NOTE(review): the location below is an absolute, machine-specific path, so
# this cell only runs where that exact file exists.
raw_listings_path = "C:/Users/User/Desktop/Nairobi House Price Prediction/data/raw_listings.csv"
df = pd.read_csv(raw_listings_path)

In [7]:
# Dataset Overview: headline figures printed at the top of the data dictionary.
total_listings = df.shape[0]
# NOTE(review): this is the date the cell is executed, not a date read from
# the data, so the printed "Collection Date" changes on every run.
collection_date = pd.Timestamp.today().strftime('%B %d, %Y')
locations_covered = df['location'].nunique()
# Mapping of property type -> number of listings, most frequent first.
property_type_counts = df['property_type'].value_counts().to_dict()

# "Type (count)" fragments joined into one comma-separated string.
property_types_text = ', '.join(
    f'{ptype} ({count})' for ptype, count in property_type_counts.items()
)

overview_lines = [
    "\nNairobi House Price Data Dictionary \n",
    "Dataset Overview",
    f"Total Listings: {total_listings} properties",
    f"Collection Date: {collection_date}",
    f"Locations Covered: {locations_covered} Nairobi neighborhoods",
    f"Property Types: {property_types_text}\n",
]
# One line per entry, each terminated by a newline (same as separate prints).
print(*overview_lines, sep="\n")


Nairobi House Price Data Dictionary 

Dataset Overview
Total Listings: 501 properties
Collection Date: February 18, 2026
Locations Covered: 18 Nairobi neighborhoods
Property Types: Apartment (455), House (28), Townhouse (18)



In [10]:
# Column Descriptions
# Prints one fixed-width row per expected column: its dtype, up to five
# distinct non-null example values, and a note with missing-value counts plus
# either the numeric range or the number of unique values.
print("Column Descriptions")
columns = ['location', 'property_type', 'bedrooms', 'bathrooms', 'size_sqft',
           'amenities', 'price_kes', 'listing_date']

# Field widths shared by the header and every row so they cannot drift apart.
NAME_W, TYPE_W, DESC_W, EXAMPLES_W = 15, 10, 35, 35

# Header
print(f"{'Column':<{NAME_W}}{'Type':<{TYPE_W}}{'Description':<{DESC_W}}{'Examples':<{EXAMPLES_W}}{'Notes'}")
print("-" * 120)

for col in columns:
    # Columns absent from the loaded frame are skipped silently.
    if col not in df.columns:
        continue

    col_type = df[col].dtype
    examples = df[col].dropna().unique()[:5]
    missing = df[col].isnull().sum()
    stats_note = ""

    # Use pandas' dtype check rather than matching the names 'int64'/'float64',
    # so other numeric dtypes (e.g. int32, nullable Int64) also get a range.
    if pd.api.types.is_numeric_dtype(col_type):
        stats_note = f"Missing: {missing}, Range: {df[col].min()} - {df[col].max()}"
        if col == 'price_kes':
            stats_note += f", Mean: {df[col].mean():.2f}, Median: {df[col].median():.2f}"
        if col == 'size_sqft':
            stats_note += ", Outliers possible"
    else:
        stats_note = f"Missing: {missing}, Unique: {df[col].nunique()}"

    # Convert examples to string for printing
    examples_str = ', '.join(map(str, examples))
    # A string longer than the field is not clipped by the `<` format spec, so
    # long example lists used to run straight into the Notes text. Truncate
    # with an ellipsis and always leave one space before the next field.
    max_examples_len = EXAMPLES_W - 1
    if len(examples_str) > max_examples_len:
        examples_str = examples_str[:max_examples_len - 3] + '...'

    # No per-column description text is defined in this cell, so the
    # Description field is printed blank.
    print(f"{col:<{NAME_W}}{str(col_type):<{TYPE_W}}{'':<{DESC_W}}{examples_str:<{EXAMPLES_W}}{stats_note}")


Column Descriptions
Column         Type      Description                        Examples                           Notes
------------------------------------------------------------------------------------------------------------------------
location       object                                       Westlands, Kilimani, Kileleshwa, Riverside, Lower KabeteMissing: 0, Unique: 18
property_type  object                                       Apartment, House, Townhouse        Missing: 0, Unique: 3
bedrooms       int64                                        1, 3, 4, 2, 5                      Missing: 0, Range: 1 - 8
bathrooms      int64                                        1, 3, 5, 2, 6                      Missing: 0, Range: 1 - 9
size_sqft      float64                                      699.66, 1668.42, 2271.2, 1291.68, 1130.22Missing: 23, Range: 32.29 - 43099.06, Outliers possible
amenities      object                                       Parking (1), Parking (3), Parking (2), Parkin

In [14]:
# Data quality summary: duplicate rows, the lowest-priced listing, listings
# that are very small yet have several bedrooms, and missing-value counts.

# Count duplicates (rows identical to an earlier row in every column)
duplicates = df.duplicated().sum()

# Identify extreme price outliers
min_price_row = df.nsmallest(1, 'price_kes').iloc[0]
# NOTE(review): the highest-priced row is computed but not used in the summary
# below; it is kept so any other cell that reads it keeps working.
max_price_row = df.nlargest(1, 'price_kes').iloc[0]

# Identify extreme size outliers: smaller than this many sqft while having
# more than one bedroom.
SIZE_FLOOR_SQFT = 100
size_outliers = df[(df['size_sqft'] < SIZE_FLOOR_SQFT) & (df['bedrooms'] > 1)]
# Explicit fallback so the report never shows an empty "Size outliers:" line.
size_outlier_str = "none found"
if not size_outliers.empty:
    # Only the first matching row is quoted as the example.
    row = size_outliers.iloc[0]
    size_outlier_str = f"{row['size_sqft']} sqft properties with {row['bedrooms']} bedrooms - impossible"

# Count missing values
missing_sizes = df['size_sqft'].isnull().sum()
missing_amenities = df['amenities'].isnull().sum()

# Build summary
summary = (
    "Data Quality Issues Identified\n"
    "Critical Issues\n"
    f"Duplicates: There are {duplicates} duplicate rows\n"
    f"Price outliers: {min_price_row['price_kes']:,} KES property in {min_price_row['location']} - impossibly low\n"
    f"Size outliers: {size_outlier_str}\n"
    f"Missing values: {missing_sizes} missing sizes, {missing_amenities} missing amenities"
)

print(summary)


Data Quality Issues Identified
Critical Issues
Duplicates: There are 233 duplicate rows
Price outliers: 360,000 KES property in Runda - impossibly low
Size outliers: 96.88 sqft properties with 2 bedrooms - impossible
Missing values: 23 missing sizes, 6 missing amenities


In [15]:
# Location Summary
# ----------------------------
# Per-location listing count (non-null prices) with the lowest and highest
# price, keeping the ten locations that have the most listings.
print("\nLocation Summary (Top 10 by count):")
price_stats_by_location = df.groupby('location')['price_kes'].agg(
    count='count',
    min_price='min',
    max_price='max',
)
location_summary = (
    price_stats_by_location
    .sort_values('count', ascending=False)
    .head(10)
)
print(location_summary)


Location Summary (Top 10 by count):
             count  min_price  max_price
location                                
Westlands      265    5300000   87000000
Kilimani        86    5500000   62500000
Kileleshwa      53    5110000   90000000
Syokimau        17    4800000   12500000
Riverside       14    7200000   28000000
Lavington       12   10000000  220000000
Runda           10     360000  260000000
Parklands        8    6600000   65000000
Kitisuru         7   60000000  350000000
Kiambu Road      7   34500000   85000000


In [16]:
# Property Type Breakdown
# ----------------------------
# Listing count (non-null prices) and mean price for each property type,
# printed rounded to two decimals.
print("\nProperty Type Breakdown:")
ptype_aggregations = {
    'count': ('price_kes', 'count'),
    'avg_price': ('price_kes', 'mean'),
}
ptype_summary = df.groupby('property_type').agg(**ptype_aggregations)
print(ptype_summary.round(2))


Property Type Breakdown:
               count    avg_price
property_type                    
Apartment        455  15889899.56
House             28  96584285.71
Townhouse         18  80777777.78
