## 1. Loading Raw Data

In [1]:
import os

import pandas as pd
import requests

In [2]:
# Location of the raw listings CSV. The default is the original local path;
# set the LISTINGS_DATA_PATH environment variable to run this notebook on
# another machine without editing the cell.
RAW_LISTINGS_PATH = os.environ.get(
    "LISTINGS_DATA_PATH",
    "D:/Kevin Osioma/data/raw_listings_data.csv",
)
df = pd.read_csv(RAW_LISTINGS_PATH)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,location,property_type,bedrooms,bathrooms,size_sqft,amenities,price_kes,listing_date,price_per_sqft,amenity_score,month
0,Westlands,Apartment,1,1,721.19,Parking (1),9000000,2026-02-20,12479.374367,1,2
1,Westlands,Apartment,1,1,1173.28,Parking (1),13800000,2026-02-20,11761.898268,1,2
2,Westlands,Apartment,1,1,807.3,Parking (1),7500000,2026-02-20,9290.226682,1,2
3,Westlands,Apartment,2,2,1173.28,Parking (1),13800000,2026-02-20,11761.898268,1,2
4,Westlands,Apartment,1,1,775.01,Parking (1),7500000,2026-02-20,9677.294487,1,2


In [4]:
# Number of missing values in each column.
df.isna().sum()

location          0
property_type     0
bedrooms          0
bathrooms         0
size_sqft         0
amenities         0
price_kes         0
listing_date      0
price_per_sqft    0
amenity_score     0
month             0
dtype: int64

In [5]:
# Missing values per column as a percentage of the total row count.
df.isna().sum().div(len(df)).mul(100)

location          0.0
property_type     0.0
bedrooms          0.0
bathrooms         0.0
size_sqft         0.0
amenities         0.0
price_kes         0.0
listing_date      0.0
price_per_sqft    0.0
amenity_score     0.0
month             0.0
dtype: float64

In [6]:
# Rows where the floor area is missing.
df.loc[df['size_sqft'].isna()]

Unnamed: 0,location,property_type,bedrooms,bathrooms,size_sqft,amenities,price_kes,listing_date,price_per_sqft,amenity_score,month


In [7]:
# Names of the columns that contain at least one missing value.
[name for name, has_null in df.isna().any().items() if has_null]

[]

In [8]:
# Count of rows that exactly repeat an earlier row (first occurrence is not counted).
df.duplicated(keep='first').sum()

0

In [9]:
# The ten locations with the most listings.
df['location'].value_counts().iloc[:10]

location
Westlands      291
Kilimani       115
Kileleshwa      92
Syokimau        43
Lavington       34
Runda           32
Riverside       20
Kiambu Road     17
Upper Hill      14
Karen           13
Name: count, dtype: int64

In [10]:
# The five locations with the fewest listings.
df['location'].value_counts().iloc[-5:]

location
Muthaiga North    1
Ridgeways         1
Valley Arcade     1
Donholm           1
Garden Estate     1
Name: count, dtype: int64

In [11]:
# The twenty most frequent raw amenity strings.
df['amenities'].value_counts().iloc[:20]

amenities
Parking (1)              258
Parking (2)              256
Parking (3)               99
Parking (4)               44
Parking (6)               24
Parking (7)               14
Parking (5)               13
Parking (1), Garden       11
Parking (3), Security      6
Parking (2), Security      5
Parking (4), Security      5
Parking (2), Garden        4
Parking (10)               3
Security                   3
Parking (8)                2
Parking (5), Security      2
Parking (4), Garden        2
Parking (7), Pool          1
Parking (6), Pool          1
Parking (4), Pool          1
Name: count, dtype: int64

In [12]:
# Ten lowest-priced listings, shown so implausibly cheap entries can be spotted by eye.
df[['location', 'bedrooms', 'price_kes']].nsmallest(10, 'price_kes')


Unnamed: 0,location,bedrooms,price_kes
298,Lower Kabete,1,2810000
584,Lower Kabete,1,2960000
353,Kileleshwa,1,3450000
739,Westlands,2,3600000
506,Kileleshwa,1,3700000
637,Lavington,1,4000000
646,Kilimani,1,4000000
664,Kileleshwa,1,4000000
667,Kileleshwa,1,4200000
505,Kileleshwa,1,4250000


In [13]:
# Ten highest-priced listings, shown so extreme values can be spotted by eye.
df[['location', 'bedrooms', 'price_kes']].nlargest(10, 'price_kes')

Unnamed: 0,location,bedrooms,price_kes
404,Muthaiga,5,400000000
470,Muthaiga,4,400000000
260,Kitisuru,5,350000000
307,Runda,5,350000000
436,Runda,4,350000000
553,Runda,4,350000000
249,Runda,5,260000000
391,Runda,5,260000000
482,Runda,5,260000000
296,Karen,9,250000000


In [14]:
# Listings whose recorded floor area is below 100 sq ft (implausibly small).
df.loc[df['size_sqft'].lt(100), ['location', 'bedrooms', 'size_sqft']]

Unnamed: 0,location,bedrooms,size_sqft


In [15]:
# Headline statistics for the loaded listings, gathered into one dictionary.
# pandas reductions return NumPy scalars; they are converted to built-in
# int/float here so that values nested in containers print as plain numbers
# (NumPy >= 2 renders them as `np.int64(...)`) and the dictionary can be
# passed to `json.dumps` without a custom encoder.
summary = {
    'total_rows': len(df),
    'total_columns': len(df.columns),
    'missing_values': df.isnull().sum().to_dict(),
    'duplicate_rows': int(df.duplicated().sum()),
    'locations': df['location'].nunique(),
    'property_types': df['property_type'].unique().tolist(),
    # `.tolist()` yields native Python numbers for [min, max].
    'price_range': df['price_kes'].agg(['min', 'max']).tolist(),
    'avg_price': float(df['price_kes'].mean()),
}

# Print one "key: value" line per statistic.
for key, value in summary.items():
    print(f"{key}: {value}")

total_rows: 759
total_columns: 11
missing_values: {'location': 0, 'property_type': 0, 'bedrooms': 0, 'bathrooms': 0, 'size_sqft': 0, 'amenities': 0, 'price_kes': 0, 'listing_date': 0, 'price_per_sqft': 0, 'amenity_score': 0, 'month': 0}
duplicate_rows: 0
locations: 30
property_types: ['Apartment', 'House', 'Townhouse']
price_range: [2810000, 400000000]
avg_price: 35040952.566534914


In [16]:
# Number of listings per property type, most common first.
df.loc[:, 'property_type'].value_counts()

property_type
Apartment    604
House         83
Townhouse     72
Name: count, dtype: int64