In [6]:
# Import necessary libraries
import os

import pandas as pd
import requests

In [7]:
# Load the raw listings CSV into a DataFrame.
# The default below is a machine-specific absolute path; set the
# NAIROBI_LISTINGS_CSV environment variable to point at the file on any other
# machine instead of editing this cell.
RAW_LISTINGS_CSV = os.environ.get(
    "NAIROBI_LISTINGS_CSV",
    "C:/Users/User/Desktop/Nairobi House Price Prediction/data/raw_listings.csv",
)
df = pd.read_csv(RAW_LISTINGS_CSV)

In [9]:
# Preview the opening five rows to get a feel for the columns and values
df.iloc[:5]

Unnamed: 0,location,property_type,bedrooms,bathrooms,size_sqft,amenities,price_kes,listing_date
0,Westlands,Apartment,1,1,699.66,Parking (1),6500000,2026-02-18
1,Westlands,Apartment,1,1,699.66,Parking (1),6500000,2026-02-18
2,Westlands,Apartment,1,1,699.66,Parking (1),6500000,2026-02-18
3,Westlands,Apartment,3,3,1668.42,Parking (3),21714000,2026-02-18
4,Westlands,Apartment,3,3,1668.42,Parking (3),21714000,2026-02-18


In [10]:
# Preview the closing five rows to check the end of the file loaded cleanly
df.iloc[-5:]

Unnamed: 0,location,property_type,bedrooms,bathrooms,size_sqft,amenities,price_kes,listing_date
496,Kitisuru,Townhouse,4,5,32.29,Parking (3),60000000,2026-02-18
497,Kilimani,Apartment,3,3,1453.14,Parking (3),14500000,2026-02-18
498,Kiambu Road,House,7,8,9999.76,Parking (6),85000000,2026-02-18
499,Westlands,Apartment,1,1,624.31,Parking (1),8700000,2026-02-18
500,Loresho,Apartment,4,5,43.06,"Parking (3), Security",75000000,2026-02-18


In [11]:
# Report the dtype pandas inferred for every column
column_dtypes = df.dtypes
print("Column data types:")
print(column_dtypes, "\n")


Column data types:
location          object
property_type     object
bedrooms           int64
bathrooms          int64
size_sqft        float64
amenities         object
price_kes          int64
listing_date      object
dtype: object 



In [23]:
# Create a summary dictionary with key dataset statistics.
# Every value is converted to a built-in Python type: NumPy scalars nested in a
# list print as e.g. `np.int64(360000)` and are rejected by json.dumps, so the
# price statistics and duplicate count are unwrapped explicitly.
summary = {
    'total_rows': len(df),                          # Total number of rows
    'total_columns': len(df.columns),               # Total number of columns
    'missing_values': df.isnull().sum().to_dict(),  # Missing values per column
    'duplicate_rows': int(df.duplicated().sum()),   # Number of fully duplicated rows
    'locations': df['location'].nunique(),          # Number of unique locations
    'property_types': df['property_type'].unique().tolist(),  # Unique property types
    # [min, max] price; .tolist() yields native Python numbers
    'price_range': df['price_kes'].agg(['min', 'max']).tolist(),
    'avg_price': float(df['price_kes'].mean()),     # Average price
}

In [24]:
# Emit one "name: value" line per entry in the summary dictionary
for stat_name, stat_value in summary.items():
    print(f"{stat_name}: {stat_value}")

total_rows: 501
total_columns: 8
missing_values: {'location': 0, 'property_type': 0, 'bedrooms': 0, 'bathrooms': 0, 'size_sqft': 23, 'amenities': 6, 'price_kes': 0, 'listing_date': 0}
duplicate_rows: 233
locations: 18
property_types: ['Apartment', 'House', 'Townhouse']
price_range: [np.int64(360000), np.int64(350000000)]
avg_price: 22731066.467065867


In [None]:
# Count missing values in each column
null_mask = df.isna()
null_mask.sum()


location          0
property_type     0
bedrooms          0
bathrooms         0
size_sqft        23
amenities         6
price_kes         0
listing_date      0
dtype: int64

In [12]:
# Express each column's missing-value count as a percentage of all rows
null_counts = df.isna().sum()
null_counts.div(len(df)).mul(100)

location         0.000000
property_type    0.000000
bedrooms         0.000000
bathrooms        0.000000
size_sqft        4.590818
amenities        1.197605
price_kes        0.000000
listing_date     0.000000
dtype: float64

In [13]:
# List the listings that have no recorded 'size_sqft'
size_missing = df['size_sqft'].isna()
df.loc[size_missing]


Unnamed: 0,location,property_type,bedrooms,bathrooms,size_sqft,amenities,price_kes,listing_date
282,Westlands,Apartment,2,2,,Parking (2),14000000,2026-02-18
283,Westlands,Apartment,1,1,,Parking (1),8700000,2026-02-18
284,Westlands,Apartment,1,1,,Parking (1),6440000,2026-02-18
288,Westlands,Apartment,1,1,,Parking (1),5800000,2026-02-18
307,Westlands,Apartment,2,3,,Parking (2),13000000,2026-02-18
328,Lower Kabete,Apartment,1,1,,Parking (1),4720000,2026-02-18
334,Westlands,Apartment,2,3,,Parking (3),14000000,2026-02-18
335,Westlands,Apartment,1,1,,Parking (1),9000000,2026-02-18
337,Spring Valley,Townhouse,5,6,,Parking (3),65000000,2026-02-18
339,Kitisuru,Townhouse,4,5,,Parking (3),60000000,2026-02-18


In [14]:
# Names of the columns that have at least one missing value
has_null = df.isna().any()
df.columns[has_null].tolist()

['size_sqft', 'amenities']

In [16]:
# How many rows repeat an earlier row exactly (first occurrence not counted)
duplicate_mask = df.duplicated()
duplicate_mask.sum()

np.int64(233)

In [17]:
# Ten locations with the most listings (value_counts sorts descending)
location_counts = df['location'].value_counts()
location_counts.iloc[:10]

location
Westlands      265
Kilimani        86
Kileleshwa      53
Syokimau        17
Riverside       14
Lavington       12
Runda           10
Parklands        8
Kitisuru         7
Kiambu Road      7
Name: count, dtype: int64

In [18]:
# Five locations with the fewest listings (end of the descending counts)
location_counts_desc = df['location'].value_counts()
location_counts_desc.iloc[-5:]

location
Loresho       3
Muthangari    2
Kyuna         2
Nyari         1
Peponi        1
Name: count, dtype: int64

In [19]:
# Twenty most frequent values of the raw 'amenities' string
# (each distinct combination is counted as its own value)
amenity_counts = df['amenities'].value_counts()
amenity_counts.iloc[:20]

amenities
Parking (1)              248
Parking (2)              136
Parking (3)               36
Parking (7)               17
Parking (6)               16
Parking (4)               13
Parking (5)               10
Parking (1), Garden       10
Parking (8)                3
Parking (4), Garden        2
Parking (3), Security      2
Parking (5), Security      1
Parking (10)               1
Name: count, dtype: int64

In [20]:
# Ten cheapest listings, to spot prices that look implausibly low
price_view = df[['location', 'bedrooms', 'price_kes']]
price_view.nsmallest(10, 'price_kes')

Unnamed: 0,location,bedrooms,price_kes
457,Runda,4,360000
81,Lower Kabete,1,4310000
94,Lower Kabete,1,4720000
291,Lower Kabete,1,4720000
328,Lower Kabete,1,4720000
473,Lower Kabete,1,4720000
448,Syokimau,1,4800000
18,Kileleshwa,1,5110000
19,Kileleshwa,1,5110000
20,Kileleshwa,1,5110000


In [21]:
# Ten most expensive listings, to spot outliers at the top end
price_columns = df[['location', 'bedrooms', 'price_kes']]
price_columns.nlargest(10, 'price_kes')

Unnamed: 0,location,bedrooms,price_kes
470,Kitisuru,5,350000000
430,Runda,5,260000000
172,Lavington,5,220000000
173,Lavington,5,220000000
383,Nyari,8,180000000
341,Karen,5,170000000
462,Karen,4,140000000
404,Kyuna,5,131000000
406,Kyuna,5,131000000
188,Lavington,5,125000000


In [22]:
# Flag listings whose recorded size is under 100 sqft, which is implausibly
# small for a home; rows with a missing size are not matched by the comparison
is_tiny = df['size_sqft'] < 100
df.loc[is_tiny, ['location', 'bedrooms', 'size_sqft']]

Unnamed: 0,location,bedrooms,size_sqft
122,Westlands,2,96.88
150,Spring Valley,3,43.06
199,Kitisuru,4,32.29
208,Kitisuru,5,53.82
262,Kiambu Road,7,75.35
272,Westlands,2,96.88
332,Runda,4,53.82
341,Karen,5,75.35
354,Karen,4,53.82
383,Nyari,8,86.11


In [25]:
# Number of listings per property type, most common first.
# The actual counts are shown in this cell's output.
property_types = df['property_type']
property_types.value_counts()

property_type
Apartment    455
House         28
Townhouse     18
Name: count, dtype: int64