In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so that the raw
# CSV stored under MyDrive can be read by the data-loading cell below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
import pandas as pd

# ==========================================
# 0. CONFIGURATION (tunable inputs and business-rule constants)
# ==========================================
# Everything a reader might want to change lives here instead of being
# buried as a literal inside the transformation logic.
RAW_DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/Melbourne_housing_FULL.csv'
CLEANED_DATA_PATH = 'Cleaned_Melbourne_Data.csv'

# Business Rule BR-05: estimate annual rent as 4% of the sale price.
RENTAL_YIELD_RATE = 0.04
# Affordability rule: rent must not exceed 30% of income.
RENT_TO_INCOME_CAP = 0.30

# User Story Calculation: can a user earning $80k afford this?
# With the cap above, the annual rent limit works out to $24,000.
user_salary_simulation = 80000
affordability_limit = user_salary_simulation * RENT_TO_INCOME_CAP

# ==========================================
# 1. LOAD DATA (Simulating Data Ingestion)
# ==========================================
# BA Note: Loading raw export from legacy system.
# The raw frame keeps its own name and is never modified, so the later
# stages can be re-derived from it without re-reading the file.
housing_raw = pd.read_csv(RAW_DATA_PATH)

print(f"Initial Row Count: {len(housing_raw)}")

# ==========================================
# 2. DATA CLEANING (Requirement NFR-01: Data Quality)
# ==========================================
# Rule 1: We cannot recommend houses with no Price -> drop rows where Price is NaN.
# Rule 2: If 'Car' spots are missing, assume 0 (Street Parking) -> fill NaNs with 0.
# `.assign` returns a new frame, so no column is written onto a filtered
# slice of the raw data.
housing_clean = (
    housing_raw
    .dropna(subset=['Price'])
    .assign(Car=lambda frame: frame['Car'].fillna(0))
)

# Invariants guaranteed by the two cleaning rules above.
assert housing_clean['Price'].notna().all(), "Price still contains missing values"
assert housing_clean['Car'].notna().all(), "Car still contains missing values"

# ==========================================
# 3. FEATURE ENGINEERING (Requirement FR-02: Logic)
# ==========================================
# BA Assumption: We don't have 'Rental Price' in the raw data, so it is
# estimated from the sale price (BR-05). The 'Is_Affordable' flag is a
# boolean that is True when the estimated rent fits within the limit.
# `df` remains the name of the final, export-ready frame.
df = housing_clean.assign(
    Est_Annual_Rent=lambda frame: frame['Price'] * RENTAL_YIELD_RATE,
    Is_Affordable=lambda frame: frame['Est_Annual_Rent'] <= affordability_limit,
)

# Summing a boolean column counts its True values; no filtered copy of the
# frame and no comparison against True is needed.
affordable_count = int(df['Is_Affordable'].sum())

# ==========================================
# 4. EXPORT (Handover to Developers)
# ==========================================
print(f"Cleaned Row Count: {len(df)}")
print(f"For a client earning approximately ${user_salary_simulation}, there are {affordable_count} affordable properties to rent in Melbourne")

df.to_csv(CLEANED_DATA_PATH, index=False)

Initial Row Count: 34857
Cleaned Row Count: 27247
For a client earning approximately $80000, there are 5792 affordable properties to rent in Melbourne


In [11]:
# Send the cleaned CSV written by the previous cell from the Colab runtime
# to the user's machine via a browser download.
from google.colab import files

files.download('Cleaned_Melbourne_Data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>