Importing all dependencies

In [2]:
#Import all dependencies which are libraries in this case that are necessary for the execution of this project
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
warnings.filterwarnings("ignore")




Data Loading

In [3]:
#Load the data
data = pd.read_csv('nigeria_housing.csv')

In [4]:
# Convert the csv data into a dataframe",
df = pd.DataFrame(data)

Exploratory Data Analysis (EDA)

In [5]:
#View the data
df.head(10)

Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,title,town,state,price
0,6.0,5.0,5.0,4.0,Detached Duplex,Mabushi,Abuja,450000000.0
1,4.0,5.0,5.0,4.0,Terraced Duplexes,Katampe,Abuja,800000000.0
2,4.0,5.0,5.0,4.0,Detached Duplex,Lekki,Lagos,120000000.0
3,4.0,4.0,5.0,6.0,Detached Duplex,Ajah,Lagos,40000000.0
4,4.0,4.0,5.0,2.0,Semi Detached Duplex,Lekki,Lagos,75000000.0
5,5.0,5.0,6.0,1.0,Detached Duplex,Lekki,Lagos,450000000.0
6,4.0,5.0,5.0,4.0,Detached Duplex,Lekki,Lagos,65000000.0
7,2.0,2.0,3.0,6.0,Detached Bungalow,Epe,Lagos,12000000.0
8,1.0,1.0,1.0,1.0,Detached Duplex,Lekki,Lagos,200000000.0
9,4.0,4.0,5.0,5.0,Detached Duplex,Ajah,Lagos,60000000.0


In [6]:
#Checking for potential outliers in the dataset
print("Using a sample DataFrame with potential outliers.")


# Check if 'price' and 'area' columns exist in the DataFrame
if 'price' in df.columns and 'area' in df.columns:
    plt.figure(figsize=(10, 7)) # Adjust figure size for better visibility

    # Create the scatter plot
    sns.scatterplot(x='area', y='price', data=df)

    plt.title('House Price vs. Area Scatter Plot (Outlier Detection)', fontsize=16)
    plt.xlabel('Area (e.g., sqft)', fontsize=12)
    plt.ylabel('Price (e.g., USD)', fontsize=12)
    plt.grid(True) # Add a grid for easier reading
    plt.tight_layout() # Adjust layout to prevent labels from overlapping
    plt.show()

    print("\n--- Interpretation Tip ---")
    print("Look for points that fall far away from the main cluster of data points.")
    print("These could be:")
    print("1. Points with extremely high or low 'area' values.")
    print("2. Points with extremely high or low 'price' values.")
    print("3. Points that deviate significantly from the general trend (e.g., a small area with a very high price, or a large area with a very low price).")
    print("These unusual points are your potential outliers.")

else:
    print("\nError: 'price' and/or 'area' columns not found in your dataset.")
    print("Please ensure your DataFrame contains these columns with the exact names.")

Using a sample DataFrame with potential outliers.

Error: 'price' and/or 'area' columns not found in your dataset.
Please ensure your DataFrame contains these columns with the exact names.


In [7]:
df.describe()

Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,price
count,24326.0,24326.0,24326.0,24326.0,24326.0
mean,4.338814,4.600798,5.176355,4.041725,301380200.0
std,1.138497,1.163161,1.226253,1.399936,12204030000.0
min,1.0,1.0,1.0,1.0,90000.0
25%,4.0,4.0,5.0,4.0,52000000.0
50%,4.0,5.0,5.0,4.0,85000000.0
75%,5.0,5.0,6.0,4.0,160000000.0
max,9.0,9.0,9.0,9.0,1800000000000.0


Data Preprocesssing

In [37]:


#Applying label encoding to the categorical columns
yes_no_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

for col in yes_no_cols:
    df[col] = df[col].map({'yes': 1, 'no': 0})

#Encode furnishing status
furnishing_map = {
    'unfurnished': 0,
    'semi-furnished': 1,
    'furnished': 2
}
df['furnishingstatus'] = df['furnishingstatus'].map(furnishing_map)


In [38]:
#Checking for missing values
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [39]:
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

Model Training

In [40]:
#Defining the features and target variable
features = [
    'area', 'bedrooms', 'bathrooms', 'stories',
    'mainroad', 'guestroom', 'basement',
    'hotwaterheating', 'airconditioning',
    'parking', 'prefarea', 'furnishingstatus'
]

target = 'price'

In [41]:
#Split the data into features (X) and target variable (y), the independent and dependent variables"
X = df[features]
y = df[target]

In [42]:
#Scale numeric features
numeric_cols = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [43]:
#Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
#Define models to compare
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

In [48]:


# 9. Train, Predict, and Evaluate
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    
    results.append({
        "Model": name,
        "RMSE": round(rmse, 2),
        "R² Score": round(r2, 4)
    })


In [49]:
# 10. Display comparison
results_df = pd.DataFrame(results).sort_values(by="R² Score", ascending=False)
print(results_df)

               Model        RMSE  R² Score
5  Gradient Boosting  1300896.13    0.6652
0  Linear Regression  1331071.42    0.6495
2   Lasso Regression  1331071.56    0.6495
1   Ridge Regression  1332745.72    0.6486
4      Random Forest  1401369.34    0.6115
3      Decision Tree  1715038.20    0.4181


In [55]:
# Re-train on full data
final_model = GradientBoostingRegressor(random_state=42)
final_model.fit(X, y)

# Save model
joblib.dump(final_model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')



['scaler.pkl']