In [1]:
import pandas as pd
import numpy as np
import re


In [2]:
house_pricing_data = pd.read_csv('../house_data.csv')

# REMOVING THE EMPTY TITLE ROWS
house_pricing_data.dropna(subset=['title'], inplace=True)
house_pricing_data

Unnamed: 0,bedroom,parking_lot,bathroom,toilets,town,state,serviced,extras,Stable Electricity,title,price
0,6.0,No,6.0,7,Ikoyi,Lagos,No,,No,Detached Duplex,"₦250,000,000/year"
1,3.0,No,3.0,4,Ikoyi,Lagos,No,,No,Flat Apartment,"₦40,000,000/year"
2,3.0,No,3.0,4,Ikoyi,Lagos,No,,No,Flat Apartment,"₦20,000,000/year"
3,3.0,No,3.0,4,Ikoyi,Lagos,No,,No,Flat Apartment,"₦20,000,000/year"
4,3.0,No,3.0,4,Ikoyi,Lagos,No,,No,Flat Apartment,"₦20,000,000/year"
...,...,...,...,...,...,...,...,...,...,...,...
12028,1.0,Yes,1.0,1,Oshodi,Lagos,No,8.0,No,mini flat Mini Flat Flat Apartment,"₦800,000/year"
12029,5.0,No,6.0,6,Ikoyi,Lagos,Yes,,No,Terraced Duplex,"₦20,000,000/year"
12030,4.0,No,5.0,5,Ikoyi,Lagos,No,,No,Terraced Duplex,"₦40,000,000/year"
12031,2.0,No,2.0,3,Lekki,Lagos,No,,No,Blocks Of Flats,"₦8,500,000/year"


In [3]:
house_pricing_data.loc[
    house_pricing_data["title"].str.lower().str.count(r"mini flat") == 2, "title"
] = "Mini flat Apartment"


In [4]:
# Fill NaN Values in Extra Column with 0
house_pricing_data["extras"].fillna(value=0, inplace=True)
house_pricing_data.head(40)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  house_pricing_data["extras"].fillna(value=0, inplace=True)


Unnamed: 0,bedroom,parking_lot,bathroom,toilets,town,state,serviced,extras,Stable Electricity,title,price
0,6.0,No,6.0,7,Ikoyi,Lagos,No,0.0,No,Detached Duplex,"₦250,000,000/year"
1,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,"₦40,000,000/year"
2,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,"₦20,000,000/year"
3,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,"₦20,000,000/year"
4,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,"₦20,000,000/year"
5,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,"₦39,999,999/year"
6,2.0,No,2.0,3,Ikoyi,Lagos,No,0.0,No,Flat Apartment,"₦34,999,999/year"
7,3.0,No,3.0,4,Victoria Island,Lagos,No,0.0,No,Flat Apartment,"$50,000/year"
8,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,"₦39,999,999/year"
9,5.0,No,5.0,6,Ikoyi,Lagos,No,0.0,No,Detached Duplex,"₦249,999,999/year"


In [5]:
# Fill the NaN values of the bedrooms, bathrooms and toilet based on the title

house_pricing_data.loc[
    house_pricing_data["title"].str.lower().str.contains("flat apartment") & 
    house_pricing_data["bedroom"].isna(),
    "bedroom"
] = 2

house_pricing_data.loc[
    house_pricing_data["title"].str.lower().str.contains("mini flat") & 
    house_pricing_data["bedroom"].isna(),
    "bedroom"
] = 1

house_pricing_data.loc[
    house_pricing_data["title"].str.lower().str.contains("flat apartment") & 
    house_pricing_data["bathroom"].isna(),
    "bathroom"
] = 1

house_pricing_data.loc[
    house_pricing_data["title"].str.lower().str.contains("flat apartment") & 
    house_pricing_data["toilets"].isna(),
    "toilets"
] = 2
house_pricing_data.isna().sum()
    

bedroom               183
parking_lot             0
bathroom              193
toilets               184
town                    0
state                   0
serviced                0
extras                  0
Stable Electricity      0
title                   0
price                   0
dtype: int64

In [6]:
house_pricing_data.dropna(inplace=True)
house_pricing_data.shape

(7662, 11)

## Price Manipulation

In [7]:
# WORKING ON PRICE COLUMN
house_pricing_data = house_pricing_data.drop(
     house_pricing_data[
        house_pricing_data["price"].str.contains("/month|/day", na=False)
    ].index
)


In [8]:
mask = house_pricing_data["price"].str.contains(r"\$", na=False)

house_pricing_data.loc[mask, "price"] = (
    house_pricing_data.loc[mask, "price"]
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)  
    .str.replace('/year', '', regex=False)  
    .astype(float) 
    * 1610  # Convert from USD to NGN
).astype(int)


In [9]:
house_pricing_data

Unnamed: 0,bedroom,parking_lot,bathroom,toilets,town,state,serviced,extras,Stable Electricity,title,price
0,6.0,No,6.0,7,Ikoyi,Lagos,No,0.0,No,Detached Duplex,"₦250,000,000/year"
1,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,"₦40,000,000/year"
2,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,"₦20,000,000/year"
3,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,"₦20,000,000/year"
4,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,"₦20,000,000/year"
...,...,...,...,...,...,...,...,...,...,...,...
12028,1.0,Yes,1.0,1,Oshodi,Lagos,No,8.0,No,Mini flat Apartment,"₦800,000/year"
12029,5.0,No,6.0,6,Ikoyi,Lagos,Yes,0.0,No,Terraced Duplex,"₦20,000,000/year"
12030,4.0,No,5.0,5,Ikoyi,Lagos,No,0.0,No,Terraced Duplex,"₦40,000,000/year"
12031,2.0,No,2.0,3,Lekki,Lagos,No,0.0,No,Blocks Of Flats,"₦8,500,000/year"


In [10]:
mask = house_pricing_data["price"].str.contains(r"\d", na=False)
house_pricing_data = house_pricing_data[mask] 

house_pricing_data["price"] = (
    house_pricing_data["price"]
    .str.replace("₦", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.replace("/year", "", regex=False)
    .astype(float)
)

house_pricing_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_pricing_data["price"] = (


Unnamed: 0,bedroom,parking_lot,bathroom,toilets,town,state,serviced,extras,Stable Electricity,title,price
0,6.0,No,6.0,7,Ikoyi,Lagos,No,0.0,No,Detached Duplex,250000000.0
1,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,40000000.0
2,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,20000000.0
3,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,20000000.0
4,3.0,No,3.0,4,Ikoyi,Lagos,No,0.0,No,Flat Apartment,20000000.0
...,...,...,...,...,...,...,...,...,...,...,...
12028,1.0,Yes,1.0,1,Oshodi,Lagos,No,8.0,No,Mini flat Apartment,800000.0
12029,5.0,No,6.0,6,Ikoyi,Lagos,Yes,0.0,No,Terraced Duplex,20000000.0
12030,4.0,No,5.0,5,Ikoyi,Lagos,No,0.0,No,Terraced Duplex,40000000.0
12031,2.0,No,2.0,3,Lekki,Lagos,No,0.0,No,Blocks Of Flats,8500000.0


## Serviced, Parking_lot, Stable Electricity Column manipulation

In [11]:
binary_map = {'yes': 1, 'no': 0}  # Define a case-insensitive map

house_pricing_data["parking_lot"] = (
    house_pricing_data["parking_lot"]
    .str.strip() 
    .str.lower()
    .map(binary_map)  
)


    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_pricing_data["parking_lot"] = (


In [12]:
house_pricing_data["serviced"] = (
    house_pricing_data["serviced"]
    .str.strip() 
    .str.lower()
    .map(binary_map)  
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_pricing_data["serviced"] = (


In [13]:
house_pricing_data["Stable Electricity"] = (
    house_pricing_data["Stable Electricity"]
    .str.strip() 
    .str.lower()
    .map(binary_map)  
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_pricing_data["Stable Electricity"] = (


In [14]:
house_pricing_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7552 entries, 0 to 12032
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bedroom             7552 non-null   float64
 1   parking_lot         7552 non-null   int64  
 2   bathroom            7552 non-null   float64
 3   toilets             7552 non-null   object 
 4   town                7552 non-null   object 
 5   state               7552 non-null   object 
 6   serviced            7552 non-null   int64  
 7   extras              7552 non-null   float64
 8   Stable Electricity  7552 non-null   int64  
 9   title               7552 non-null   object 
 10  price               7552 non-null   float64
dtypes: float64(4), int64(3), object(4)
memory usage: 708.0+ KB


In [15]:
house_pricing_data["toilets"] = (
    house_pricing_data["toilets"]
    .astype(str)  # Ensure it's a string
    .str.extract("(\d+)")  # Extract only numeric parts
    .astype(float)  # Convert to float first to handle NaNs
    .astype("Int64")  # Convert to integer while keeping NaNs
)

  .str.extract("(\d+)")  # Extract only numeric parts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_pricing_data["toilets"] = (


In [16]:
house_pricing_data.dropna(subset=["toilets"], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_pricing_data.dropna(subset=["toilets"], inplace=True)


In [17]:
house_pricing_data

Unnamed: 0,bedroom,parking_lot,bathroom,toilets,town,state,serviced,extras,Stable Electricity,title,price
0,6.0,0,6.0,7,Ikoyi,Lagos,0,0.0,0,Detached Duplex,250000000.0
1,3.0,0,3.0,4,Ikoyi,Lagos,0,0.0,0,Flat Apartment,40000000.0
2,3.0,0,3.0,4,Ikoyi,Lagos,0,0.0,0,Flat Apartment,20000000.0
3,3.0,0,3.0,4,Ikoyi,Lagos,0,0.0,0,Flat Apartment,20000000.0
4,3.0,0,3.0,4,Ikoyi,Lagos,0,0.0,0,Flat Apartment,20000000.0
...,...,...,...,...,...,...,...,...,...,...,...
12028,1.0,1,1.0,1,Oshodi,Lagos,0,8.0,0,Mini flat Apartment,800000.0
12029,5.0,0,6.0,6,Ikoyi,Lagos,1,0.0,0,Terraced Duplex,20000000.0
12030,4.0,0,5.0,5,Ikoyi,Lagos,0,0.0,0,Terraced Duplex,40000000.0
12031,2.0,0,2.0,3,Lekki,Lagos,0,0.0,0,Blocks Of Flats,8500000.0


In [30]:
house_pricing_data = house_pricing_data[house_pricing_data["price"] < 1*1e9]

## Title Manipulation

In [18]:
house_pricing_data["title"] = house_pricing_data["title"].str.lower()
house_pricing_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_pricing_data["title"] = house_pricing_data["title"].str.lower()


Unnamed: 0,bedroom,parking_lot,bathroom,toilets,town,state,serviced,extras,Stable Electricity,title,price
0,6.0,0,6.0,7,Ikoyi,Lagos,0,0.0,0,detached duplex,250000000.0
1,3.0,0,3.0,4,Ikoyi,Lagos,0,0.0,0,flat apartment,40000000.0
2,3.0,0,3.0,4,Ikoyi,Lagos,0,0.0,0,flat apartment,20000000.0
3,3.0,0,3.0,4,Ikoyi,Lagos,0,0.0,0,flat apartment,20000000.0
4,3.0,0,3.0,4,Ikoyi,Lagos,0,0.0,0,flat apartment,20000000.0
...,...,...,...,...,...,...,...,...,...,...,...
12028,1.0,1,1.0,1,Oshodi,Lagos,0,8.0,0,mini flat apartment,800000.0
12029,5.0,0,6.0,6,Ikoyi,Lagos,1,0.0,0,terraced duplex,20000000.0
12030,4.0,0,5.0,5,Ikoyi,Lagos,0,0.0,0,terraced duplex,40000000.0
12031,2.0,0,2.0,3,Lekki,Lagos,0,0.0,0,blocks of flats,8500000.0


In [40]:
house_pricing_data.title.value_counts()

title
flat apartment                               2132
terraced duplex                              1050
blocks of flats                               898
semi detached duplex                          747
detached duplex                               731
mini flat apartment                           658
mini flat self contain flat apartment         221
massionette                                   154
mini flat shared apartment flat apartment     115
mini flat flat apartment                      104
mini flat studio apartment flat apartment     102
shared apartment flat apartment                84
detached bungalow                              77
penthouse flat apartment                       48
mini flat blocks of flats                      32
studio apartment flat apartment                27
self contain flat apartment                    21
semi detached bungalow                         11
Name: count, dtype: int64

In [37]:
# Delete some title rows (less than 10 value counts)

# Step 1: Get titles that appear more than 10 times
popular_titles = house_pricing_data["title"].value_counts()
popular_titles = popular_titles[popular_titles > 10].index

# Step 2: Filter rows where title is in the popular_titles list
house_pricing_data = house_pricing_data[house_pricing_data["title"].isin(popular_titles)]


In [39]:
house_pricing_data = house_pricing_data[house_pricing_data["title"] != "commercial property"]

In [None]:
h

In [32]:
house_pricing_data.to_csv('../house_data_done.csv')