# Import Libraries

In [29]:

# Standard Libraries
import pandas as pd
import numpy as np

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

import os
import json
import joblib

# Data Loading and Initial Exploration

In [30]:
# Make the project root the working directory so relative paths (such as the
# default dataset location) resolve. Set the PROJECT_ROOT environment variable
# to point at your own checkout; the original author's location is kept only as
# the backward-compatible default.
PROJECT_ROOT = os.getenv(
    "PROJECT_ROOT",
    default="C:/Users/USER/Desktop/GitHub/Nigeria-Housing-Price-Prediction",
)
if os.path.isdir(PROJECT_ROOT):
    os.chdir(PROJECT_ROOT)
else:
    # Do not crash on machines where that folder does not exist: assume the
    # notebook was launched from inside the repository and say so explicitly.
    print(f"PROJECT_ROOT {PROJECT_ROOT!r} not found; staying in {os.getcwd()!r}")

In [31]:
# Read the listings CSV; the DATA_PATH environment variable overrides the
# in-repo default location.
DATA_PATH_NIGERIA = os.environ.get("DATA_PATH", "dataset/nigeria_houses_data.csv")
nigeria_df = pd.read_csv(DATA_PATH_NIGERIA)

In [32]:
# Schema summary (printed by info()) followed by a rich preview of the first rows.
nigeria_df.info()
display(nigeria_df.head(n=5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24326 entries, 0 to 24325
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   bedrooms       24326 non-null  float64
 1   bathrooms      24326 non-null  float64
 2   toilets        24326 non-null  float64
 3   parking_space  24326 non-null  float64
 4   title          24326 non-null  object 
 5   town           24326 non-null  object 
 6   state          24326 non-null  object 
 7   price          24326 non-null  float64
dtypes: float64(5), object(3)
memory usage: 1.5+ MB


Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,title,town,state,price
0,6.0,5.0,5.0,4.0,Detached Duplex,Mabushi,Abuja,450000000.0
1,4.0,5.0,5.0,4.0,Terraced Duplexes,Katampe,Abuja,800000000.0
2,4.0,5.0,5.0,4.0,Detached Duplex,Lekki,Lagos,120000000.0
3,4.0,4.0,5.0,6.0,Detached Duplex,Ajah,Lagos,40000000.0
4,4.0,4.0,5.0,2.0,Semi Detached Duplex,Lekki,Lagos,75000000.0


In [33]:
# (rows, columns) of the frame as loaded, before any cleaning.
nigeria_df.shape

(24326, 8)

The dataset consists of 24,326 entries (rows) and 8 columns. This indicates that we have data for 24,326 property listings, each described by 7 attributes (4 numeric, 3 categorical) plus the `price` column.

# Initial Observations and Data Understanding

In [34]:
# Summary statistics; with no arguments describe() reports only the numeric columns.
nigeria_df.describe()

Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,price
count,24326.0,24326.0,24326.0,24326.0,24326.0
mean,4.338814,4.600798,5.176355,4.041725,301380200.0
std,1.138497,1.163161,1.226253,1.399936,12204030000.0
min,1.0,1.0,1.0,1.0,90000.0
25%,4.0,4.0,5.0,4.0,52000000.0
50%,4.0,5.0,5.0,4.0,85000000.0
75%,5.0,5.0,6.0,4.0,160000000.0
max,9.0,9.0,9.0,9.0,1800000000000.0


**Bedrooms, Bathrooms, Toilets, and Parking Space**

- The mean values indicate that most properties are moderately sized, with an average of 4-5 bedrooms, 4-5 bathrooms, 5 toilets, and 4 parking spaces.
- The maximum values (e.g., 9 bedrooms, 9 bathrooms, etc.) indicate that the dataset contains some large, luxury properties.

**Price**

- The mean price is around ₦301 million.
- The standard deviation is very high (₦12.2 billion), indicating that there is significant variation in property prices.
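- The min value is ₦90,000, and the max value is a staggering ₦1.8 trillion, suggesting that there are both very affordable and extremely expensive properties. Values this extreme are likely outliers or data-entry errors and should be examined during outlier detection.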

In [35]:
# Count / unique / top / freq summary for the object-typed (text) columns only.
nigeria_df.describe(include="object")

Unnamed: 0,title,town,state
count,24326,24326,24326
unique,7,189,25
top,Detached Duplex,Lekki,Lagos
freq,13992,10895,18445


**Title**

- There are 7 unique property types in the dataset, with 'Detached Duplex' being the most frequent (13,992 occurrences, about 58% of listings). This indicates that detached duplexes are the most common property type.

**Town**

- The dataset contains properties from 189 different towns. The most frequent town is Lekki with 10,895 occurrences, suggesting that Lekki is a popular area for real estate listings in Nigeria.

**State**

- There are 25 unique states, with Lagos being the most frequent (18,445 properties), indicating a strong real estate presence in Lagos.

In [36]:
# Show the column labels as a plain Python list.
print("Column Names:", list(nigeria_df.columns))

Column Names: ['bedrooms', 'bathrooms', 'toilets', 'parking_space', 'title', 'town', 'state', 'price']


#  Data Cleaning
- Addressing Datatypes
- Missing Values Analysis
- Duplicate Records
- Outliers Detection

## Addressing Datatypes

In [37]:
# List the dtype pandas assigned to each column.
nigeria_df.dtypes


bedrooms         float64
bathrooms        float64
toilets          float64
parking_space    float64
title             object
town              object
state             object
price            float64
dtype: object

In [38]:
# Preview the first five rows to see how the values are actually stored.
nigeria_df.head(5)

Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,title,town,state,price
0,6.0,5.0,5.0,4.0,Detached Duplex,Mabushi,Abuja,450000000.0
1,4.0,5.0,5.0,4.0,Terraced Duplexes,Katampe,Abuja,800000000.0
2,4.0,5.0,5.0,4.0,Detached Duplex,Lekki,Lagos,120000000.0
3,4.0,4.0,5.0,6.0,Detached Duplex,Ajah,Lagos,40000000.0
4,4.0,4.0,5.0,2.0,Semi Detached Duplex,Lekki,Lagos,75000000.0


We can see that `bedrooms`, `bathrooms`, `toilets` and `parking_space` are all stored as floats instead of integers. Since these are counts, we will convert them to integers.

In [39]:
# Convert the count columns from float to int.
columns_to_convert = ['bedrooms', 'bathrooms', 'toilets', 'parking_space']

# astype('int64') silently truncates fractional values (2.5 -> 2), so confirm
# every value is a whole number before casting. NaN and inf also fail this
# check, because x % 1 is NaN for them and NaN != 0.
not_whole = nigeria_df[columns_to_convert].mod(1).ne(0).any()
if not_whole.any():
    raise ValueError(
        "Cannot cast to int64 without data loss; non-integer or missing values in: "
        f"{not_whole[not_whole].index.tolist()}"
    )

nigeria_df[columns_to_convert] = nigeria_df[columns_to_convert].astype('int64')

# Verify the conversion
print(nigeria_df[columns_to_convert].dtypes)



bedrooms         int64
bathrooms        int64
toilets          int64
parking_space    int64
dtype: object


## Missing Values Analysis

In [40]:
# Number of missing entries in each column.
nigeria_df.isna().sum()


bedrooms         0
bathrooms        0
toilets          0
parking_space    0
title            0
town             0
state            0
price            0
dtype: int64

We can see that there are no missing values.

## Duplicate Records

In [41]:
# Count rows that exactly repeat an earlier row across all columns
# (the first occurrence of each row is not counted).
nigeria_df.duplicated(keep="first").sum()


10438

In [42]:
# Remove exact repeats, retaining the first occurrence of each row.
# The original index labels are kept, so the index has gaps afterwards.
nigeria_df = nigeria_df.drop_duplicates(keep="first")

In [43]:
# (rows, columns) after removing duplicate rows.
nigeria_df.shape

(13888, 8)

# Exploratory Data Analysis (EDA)

# Data Preprocessing and Feature Engineering

# Data Splitting

# Model Development and Training

# Model Evaluation

# Model Interpretation and Feature Importance

# Saving and Deploying the Model

# 