In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

import sys
import os
#Make the src package importable by adding the parent of the current working directory to sys.path
#NOTE: the path is resolved relative to the working directory, so this presumably expects the
#notebook to be run from a folder one level below the project root - confirm
project_root = os.path.abspath("..")
#Guard the append so re-running this cell does not pile up duplicate sys.path entries
if project_root not in sys.path:
    sys.path.append(project_root)

import src.data_processing as dp

In [2]:
#Load the raw housing data and inspect which columns contain missing values

#Load the dataset
house_df = dp.load_csv('../data/housing_data_raw.csv')

#Keep only the entries of the missing-value report that are greater than zero
missing = dp.check_missing(house_df)
missing = missing[missing > 0]
#Show the report so the fill choices made in the next step can be checked against it
print(f"missing values:\n{missing}")

In [3]:
#Fill in every column that has missing values.
#For most of these columns a missing entry is taken to mean the house simply does not have that feature.

#Groups of columns that share the same fill value
basement_cat = ['Bsmt Qual','Bsmt Cond','Bsmt Exposure','BsmtFin Type 1','BsmtFin Type 2']
basement_num = ['BsmtFin SF 1','BsmtFin SF 2','Bsmt Unf SF','Total Bsmt SF','Bsmt Full Bath','Bsmt Half Bath']
garage_cat = ['Garage Type','Garage Finish','Garage Qual','Garage Cond']
garage_num = ['Garage Yr Blt','Garage Cars','Garage Area']

#Column -> value used to replace its missing entries
fill_values = {
    #Street frontage can vary widely, so use the median because it is robust to outliers
    'Lot Frontage': house_df['Lot Frontage'].median(),
    #No alley material listed is treated as the house having no alley
    'Alley': 'No Alley',
    #Some houses have no masonry veneer: label the type and set the area to 0
    'Mas Vnr Type': 'No Veneer',
    'Mas Vnr Area': 0,
}
#Houses without a basement: label for the categorical columns, 0 for the numeric ones
fill_values.update(dict.fromkeys(basement_cat, 'No Basement'))
fill_values.update(dict.fromkeys(basement_num, 0))
#Electrical is filled with its most frequent value
fill_values['Electrical'] = house_df['Electrical'].mode()[0]
#A missing fireplace quality is treated as the house having no fireplace
fill_values['Fireplace Qu'] = 'No Fireplace'
#Houses without a garage: label for the categorical columns, 0 for the numeric ones
#NOTE(review): a fill of 0 for 'Garage Yr Blt' is far away from any real year and is likely
#the reason that column shows up in the IQR outlier counts - confirm this is intended
fill_values.update(dict.fromkeys(garage_cat, 'No Garage'))
fill_values.update(dict.fromkeys(garage_num, 0))
#Pool, fence and misc feature are assumed absent when missing, so each gets its own "not present" label
fill_values['Pool QC'] = 'No Pool'
fill_values['Fence'] = 'No Fence'
fill_values['Misc Feature'] = 'None'

#Apply the fills one column at a time
for column, fill_value in fill_values.items():
    house_df[column] = house_df[column].fillna(fill_value)

#Make sure every value is filled (the printed total should be 0)
print(dp.check_missing(house_df).sum())

0


In [4]:
#Outlier detection with the interquartile range (IQR) rule

#Numeric part of the data (presumably a DataFrame holding only the numeric columns - confirm in dp)
numeric_cols = dp.get_numeric_features(house_df)

#Maps each numeric column name to the number of rows flagged as outliers
outlier_summary = {}

for col in numeric_cols:
    #Quartiles and their spread for this column
    column_values = numeric_cols[col]
    q_low = column_values.quantile(0.25)
    q_high = column_values.quantile(0.75)
    spread = q_high - q_low

    #Anything further than 1.5 * IQR outside the quartiles counts as an outlier
    fence_low = q_low - 1.5 * spread
    fence_high = q_high + 1.5 * spread

    #Count the rows of house_df that fall outside the fences
    is_outlier = (house_df[col] < fence_low) | (house_df[col] > fence_high)
    outlier_summary[col] = int(is_outlier.sum())

#Report the counts, columns with the most outliers first
outlier_counts = pd.Series(outlier_summary)
print("outliers detected per numeric column:")
print(outlier_counts.sort_values(ascending=False))

outliers detected per numeric column:
Enclosed Porch     459
BsmtFin SF 2       351
Lot Frontage       261
Screen Porch       256
Overall Cond       252
MS SubClass        208
Mas Vnr Area       203
Bsmt Half Bath     175
Garage Yr Blt      160
Open Porch SF      159
SalePrice          137
Kitchen AbvGr      134
Lot Area           127
Total Bsmt SF      124
Misc Val           103
Bedroom AbvGr       78
Gr Liv Area         75
Wood Deck SF        67
Bsmt Unf SF         56
TotRms AbvGrd       51
1st Flr SF          43
Garage Area         42
Low Qual Fin SF     40
3Ssn Porch          37
Garage Cars         17
BsmtFin SF 1        15
Fireplaces          13
Pool Area           13
Year Built           9
2nd Flr SF           8
Overall Qual         4
Full Bath            4
Bsmt Full Bath       2
Order                0
PID                  0
Half Bath            0
Year Remod/Add       0
Mo Sold              0
Yr Sold              0
dtype: int64


In [5]:
#Feature engineering

#Total house area = basement area + above ground living area
basement_area = house_df["Total Bsmt SF"]
living_area = house_df["Gr Liv Area"]
house_df["Total House Area"] = basement_area + living_area

#Binary indicators (1 = present, 0 = absent) for garage, basement and fireplace
#The fireplace flag relies on the "No Fireplace" label used when the missing values were filled
presence_flags = {
    "Has Garage": house_df["Garage Area"] > 0,
    "Has Basement": house_df["Total Bsmt SF"] > 0,
    "Has Fireplace": house_df["Fireplace Qu"] != "No Fireplace",
}
for flag_name, is_present in presence_flags.items():
    house_df[flag_name] = np.where(is_present, 1, 0)

In [6]:
#Target variable transformation

#Sale price is highly skewed, so a log-transformed copy is stored to help linear models
#np.log1p computes log(1 + x); use np.expm1 to convert values back to prices
sale_price = house_df["SalePrice"]
house_df["SalePrice_log"] = np.log1p(sale_price)

#All cleaning steps are done, so store the cleaned dataframe
#It can then be reused later without repeating the cleaning process
cleaned_path = '../data/housing_data_cleaned.csv'
dp.save_csv(house_df, cleaned_path)

#Confirmation message printed after the save call returns
print("Cleaned dataset saved as housing_data_cleaned.csv")

Cleaned dataset saved as housing_data_cleaned.csv
