In [None]:
#Import Libraries
import datetime

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import scipy.stats as ss
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [None]:
# Visualization Settings
%matplotlib inline
sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)  #This displays all columns in the dataframe
pd.set_option('display.float_format', '{:,.2f}'.format)  # Display describe() with normal float formatting
plt.rcParams['figure.figsize'] = (10, 6)

In [None]:
# Load the raw dataset (path is relative to the notebook's working directory)
raw_csv_path = "../data/raw/Melbourne_housing_FULL.csv"
df = pd.read_csv(raw_csv_path)

In [None]:
# Preview: the first five rows of the dataset
df.head(5)

In [None]:
# Dataset dimensions: DataFrame.shape is (row count, column count)
rows, columns = df.shape
print("Number of rows:", rows)
print("Number of columns:", columns)

In [None]:
# Data types & non-null counts per column.
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
df.info()

In [None]:
# Parse the 'Date' column into datetime values.
# The dataset stores dates as DD/MM/YYYY, so day-first parsing is essential;
# without it, ambiguous dates such as 3/09/2016 would be read month-first.
parsed_dates = pd.to_datetime(df['Date'], dayfirst=True)
df['Date'] = parsed_dates

# Confirm the new dtype of 'Date'
df.info()

In [None]:
# Duplicate check: count rows that are exact copies of an earlier row
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print("Number of duplicate rows:", duplicates)

# Show the duplicated rows. A bare expression is only rendered when it is the
# last statement of a cell, so display() is required to actually see them here.
display(df[duplicate_mask])

# Remove duplicates (keep the first occurrence) and renumber the index
df = df.drop_duplicates(keep='first').reset_index(drop=True)

# Verify that no duplicates remain
print(f"Duplicates after: {df.duplicated().sum()}")

In [None]:
# Sanity check: summary statistics, transposed so each column is one row
summary_stats = df.describe().transpose()
display(summary_stats)

In [None]:
# BuildingArea values below 5.0 are treated as impossible entries.
implausible_area = df['BuildingArea'] < 5
print(f"Count of Impossible BuildingArea values: {implausible_area.sum()}")

# Replace the impossible values with NaN so they are handled as missing data
df.loc[implausible_area, 'BuildingArea'] = np.nan

# Re-count after the fix (NaN never compares as < 5, so this should be zero)
print(f"Count of Impossible BuildingArea values after fix:{(df['BuildingArea'] < 5).sum()}")

In [None]:
# Upper bound for a plausible construction year: the year the notebook is run
current_year = datetime.date.today().year

# A YearBuilt is invalid when it lies outside [1800, current_year].
# Missing values compare False on both sides, so they are not counted here.
year_built = df['YearBuilt']
out_of_range = (year_built < 1800) | (year_built > current_year)
print(f"Invalid YearBuilt entries before fix: {out_of_range.sum()}")

# Blank out the invalid years (they become NaN; valid and missing values are unchanged)
df['YearBuilt'] = year_built.mask(out_of_range)

# Verify: no out-of-range years should remain
remaining_invalid = (df['YearBuilt'] < 1800) | (df['YearBuilt'] > current_year)
print(f"Invalid YearBuilt entries after fix: {remaining_invalid.sum()}")

In [None]:
# A Bathroom count of 0 is treated as a missing value rather than a real count
zero_bathrooms = df['Bathroom'] == 0
count_bathroom = zero_bathrooms.sum()
print(f"Count of 0 in Bathroom before :{count_bathroom}")

# Turn the zeros into NaN, then confirm none are left
df['Bathroom'] = df['Bathroom'].mask(zero_bathrooms)
print(f"Count of 0 in Bathroom after :{(df['Bathroom'] == 0).sum()}")

In [None]:
# Inconsistency scan: list the distinct values of every text (object) column
# so casing/spelling variants can be spotted by eye.
text_cols_to_check = df.select_dtypes(include=['object']).columns
for col in text_cols_to_check:
    # nunique() ignores NaN, while unique() keeps it (shown as the string 'nan')
    unique_count = df[col].nunique()
    distinct_as_text = df[col].unique().astype(str)
    print(f"\n[{col}] Unique Values ({unique_count}):")
    print(sorted(distinct_as_text))
        

In [None]:
# Inconsistency fix. These are deterministic, row-wise text clean-ups, so it is
# safe to apply them to the whole frame before the train/test split.

# Suburb: normalise to Title Case
df['Suburb'] = df['Suburb'].str.title()

# SellerG: keep only the part before the first '/', trim whitespace, Title Case
seller_primary = df['SellerG'].str.split('/').str[0]
df['SellerG'] = seller_primary.str.strip().str.title()

# --- Verification ---
print("--- Suburb ---")
for suburb_fragment in ('Croydon', 'Viewbank'):
    is_match = df['Suburb'].str.contains(suburb_fragment, case=False)
    print(sorted(df[is_match]['Suburb'].unique()))

print("\n--- SellerG ---")
brands_to_check = ['Buxton', 'Hockingstuart', 'Vicprop']
for brand in brands_to_check:
    # Case-insensitive substring match against every distinct seller name
    brand_variants = sorted(s for s in df['SellerG'].unique() if brand.lower() in s.lower())
    print(f"{brand} variations: {brand_variants}")

In [None]:
# Missingness analysis: one row per record, white gaps mark missing values.
# The sparkline on the right is switched off.
msno.matrix(df, sparkline=False, figsize=(12, 6))
plt.show()

In [None]:
# Missing values per column, largest first
missing_count = df.isnull().sum().sort_values(ascending=False)

print("Count of Missing Values\n")
print(missing_count)

In [None]:
# Drop every row that lacks either the target ('Price') or its 'Postcode'.
# A row is removed when any of the listed columns is missing.
required_cols = ['Price', 'Postcode']
df = df.dropna(subset=required_cols)

# Verify: remaining missing counts per column, largest first
print(df.isnull().sum().sort_values(ascending=False))

In [None]:
# Negative/zero check of numeric columns

# 1. Select numeric columns, excluding the coordinates (which can legitimately
#    be negative). The misspelt names match the dataset's own column names.
numeric_cols = df.select_dtypes(include=np.number).columns
cols_to_check = [col for col in numeric_cols if col not in ['Lattitude', 'Longtitude']]

print(f"--- Checking {len(cols_to_check)} numeric columns for Non-Positive values ---")

# 2. Report every column that contains at least one negative or zero value
for col in cols_to_check:
    # Negatives are usually errors for these columns
    neg_count = (df[col] < 0).sum()

    # Zeros are context dependent: bad for Price, fine for Car
    zero_count = (df[col] == 0).sum()

    if neg_count > 0 or zero_count > 0:
        # The warning glyph was previously garbled by a wrong text encoding
        print(f"⚠️ {col:<15} | Negatives: {neg_count:<5} | Zeros: {zero_count}")

In [None]:
# --- 4. Temporal Analysis (Time Series) ---

# 'Date' was already parsed to datetime (with dayfirst=True) earlier in the
# notebook, so only the Year-Month period has to be derived here for grouping.
# NOTE(review): this stores 'YearMonth' on df itself, so the column is carried
# into any feature matrix later built from df — confirm that is intended.
df['YearMonth'] = df['Date'].dt.to_period('M')

print("📅 GENERATING TIME SERIES PLOT...")
print("   Goal: Check if the market crashed (which affects how we split the data).")

# 1. Aggregate per month: the mean price, plus the number of sales so the
#    reliability of each monthly average can be judged
monthly_stats = df.groupby('YearMonth').agg(
    Average_Price=('Price', 'mean'),
    Count=('Price', 'count')
).reset_index()

# 2. Periods are converted to strings so they can be used as x-axis labels
monthly_stats['YearMonth'] = monthly_stats['YearMonth'].astype(str)

# 3. Plot the monthly average price
plt.figure(figsize=(14, 6))
sns.lineplot(data=monthly_stats, x='YearMonth', y='Average_Price', marker='o', linewidth=2, color='royalblue')

plt.title('Melbourne Housing Market: Price Trend Over Time')
plt.xticks(rotation=45)
plt.xlabel('Month')
plt.ylabel('Average Price ($)')
plt.grid(True, linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

In [None]:
# --- 5. Data Splitting Strategy ---
# Target (y) is 'Price'; the features (X) are every other column of df
y = df['Price']
X = df.drop(columns='Price')

# Random 80/20 split (user decision: random split for generalization).
# Shuffling mixes the 2016, 2017 and 2018 sales; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Report the resulting partition sizes
print("--- Random Split Successful ---")
print(f"Training Set: {len(X_train)} rows")
print(f"Test Set:     {len(X_test)} rows")

# Safety check: the two partitions together must account for every row of df
assert len(X_train) + len(X_test) == len(df), "Error: Row mismatch!"
print("Safety Check Passed.")

In [None]:
# Cardinality check: number of distinct (non-missing) values in each text
# column of the training features, largest first
categorical_cols = X_train.select_dtypes(include=['object']).columns
cardinality = X_train[categorical_cols].nunique().sort_values(ascending=False)

print("\nCardinality Check (Count of Unique Values)")
print(cardinality)


In [None]:
# Target variable analysis: distribution of the training-set prices
fig, ax = plt.subplots(figsize=(14, 5))

# Histogram with a KDE overlay; missing prices are left out
sns.histplot(y_train.dropna(), kde=True, color='blue', ax=ax)
ax.set_title('Price Distribution')

fig.tight_layout()
plt.show()



In [None]:
# Target variable analysis: Q-Q plot of the training-set prices against a
# normal distribution (points far from the straight line indicate non-normality)
plt.figure(figsize=(14, 5))
non_missing_prices = y_train.dropna()
ss.probplot(non_missing_prices, dist="norm", plot=plt)
plt.title('Q-Q Plot (Normality Check)')

plt.tight_layout()
plt.show()



In [None]:
# Target variable analysis: boxplot of the training-set prices (outlier view)
fig, ax = plt.subplots(figsize=(14, 5))

sns.boxplot(x=y_train, color='cyan', ax=ax)
ax.set_title('Price Boxplot')

fig.tight_layout()
plt.show()


In [None]:
# Shape statistics of the training-set target
price_skew, price_kurt = y_train.skew(), y_train.kurt()

print("Target Statistics")
print(f"Skewness: {price_skew:.4f} (Rule: > 1.0 needs Log Transform)")
print(f"Kurtosis: {price_kurt:.4f}")

In [None]:
# --- Data Integrity Check: Are there decimals in Discrete variables? ---

# Columns that SHOULD only hold whole numbers
discrete_cols = ['Rooms', 'Bedroom2', 'Bathroom', 'Car', 'Postcode', 'YearBuilt', 'Propertycount']

print("Checking for non-integer values (decimals)...\n")

for col in discrete_cols:
    if col not in X_train.columns:
        continue

    # Missing values are removed ONCE, up front. NaN % 1 is NaN and NaN != 0 is
    # True, so without this the NaNs would be reported as "decimal" examples.
    non_null = X_train[col].dropna()

    # Values with a non-zero fractional part (e.g. 2.5)
    fractional = non_null[non_null % 1 != 0]

    if fractional.empty:
        print(f"'{col}' is clean (Integers only).")
    else:
        print(f"ALERT: '{col}' contains decimals! (e.g., 2.5)")
        # Show up to three of the offending values
        examples = fractional.head(3).tolist()
        print(f"   Examples: {examples}")

In [None]:
# --- Smart Numeric Analysis: Distribution + Outliers + Skewness ---

# 1. All int64/float64 feature columns
all_numeric = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()

# 2. Columns that are not worth plotting
drop_for_plot = ['Price', 'Postcode',]
cols_to_plot = [c for c in all_numeric if c not in drop_for_plot]

print(f"Scanning {len(cols_to_plot)} Numeric Features...\n")

# 3. One figure per column: histogram on the left, boxplot on the right
for col in cols_to_plot:
    series = X_train[col]
    observed = series.dropna()

    unique_count = series.nunique()
    skew_val = series.skew()

    # Outliers by the 1.5 * IQR rule (missing values never count as outliers)
    Q1, Q3 = series.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    below_fence = series < Q1 - 1.5*IQR
    above_fence = series > Q3 + 1.5*IQR
    outlier_count = (below_fence | above_fence).sum()

    # Fewer than 25 distinct values -> treat as a discrete/count variable
    # (e.g., Rooms, Car); otherwise as continuous (e.g., Landsize, Distance)
    is_discrete = unique_count < 25

    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 4))

    if is_discrete:
        sns.histplot(observed, discrete=True, kde=True, color='purple', ax=ax_hist)
        ax_hist.set_title(f'{col} (Discrete) | Skew: {skew_val:.2f}')
        ax_hist.set_xlabel(col)
        ax_hist.set_ylabel('Count')

        sns.boxplot(x=observed, color='cyan', ax=ax_box)
        ax_box.set_title(f'{col} Outliers ({outlier_count})')

        # Summary line for discrete columns reports the most frequent value
        stats_msg = f"[{col}] Mode: {series.mode()[0]} | Skew: {skew_val:.4f} | Outliers: {outlier_count}"
    else:
        sns.histplot(observed, kde=True, bins=30, color='blue', ax=ax_hist)
        ax_hist.set_title(f'{col} (Continuous) | Skew: {skew_val:.2f}')

        sns.boxplot(x=observed, color='orange', ax=ax_box)
        ax_box.set_title(f'{col} Outliers ({outlier_count})')

        # Summary line for continuous columns reports the value range
        stats_msg = f"[{col}] Min: {series.min()} | Max: {series.max()} | Skew: {skew_val:.4f} | Outliers: {outlier_count}"

    fig.tight_layout()
    plt.show()
    print(stats_msg)
    print("-" * 60)

In [None]:
# --- 4. Categorical Feature Distributions (Automated & Robust) ---

# 1. Auto-select the categorical columns: every object-dtype column except
#    'Address', which is close to a unique identifier rather than a category.
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
cols_to_ignore = ['Address']
cat_cols = [col for col in cat_cols if col not in cols_to_ignore]

# 2. Columns with more than this many distinct values are summarised in text
#    only, because a bar chart of them would be unreadable.
high_cardinality_cutoff = 50

print(f"Scanning {len(cat_cols)} Categorical Features\n")

for col in cat_cols:
    unique_count = X_train[col].nunique()
    # Share of each category in percent, most frequent first
    freq = X_train[col].value_counts(normalize=True) * 100

    print(f"ANALYZING: {col}")

    if unique_count > high_cardinality_cutoff:
        # --- STRATEGY A: HIGH CARDINALITY (e.g., Suburb, SellerG) ---
        print(f"{col} has {unique_count} unique values (Too many to plot).")

        # Five most frequent categories, each formatted as a percentage
        top_5_series = freq.head(5)
        print("Top 5 Categories:", top_5_series.map(lambda pct: f"{pct:.2f}%").to_string())

        # Number of categories covering less than 1% of the rows
        rare_count = int((freq < 1.0).sum())
        print(f"Contains {rare_count} rare categories (<1%) that will need encoding.")
        print("-" * 60)
    else:
        # --- STRATEGY B: PLOT-ABLE FEATURES (e.g., Type, Method, CouncilArea) ---
        print(f"Unique Categories: {unique_count}")

        # Rare-label check: categories covering less than 1% of the rows
        rare_cats = freq[freq < 1.0]
        if rare_cats.empty:
            print("No Rare Labels found.")
        else:
            print(f"Rare Labels Found (<1%): {len(rare_cats)}")
            # Lists every rare label, not just a sample
            print(f"Examples: {rare_cats.index.tolist()}")

        # Bars ordered by frequency; horizontal when there are many categories
        # (like CouncilArea) so the labels stay legible
        bar_order = X_train[col].value_counts().index
        many_categories = unique_count > 10
        plt.figure(figsize=(10, 6) if many_categories else (8, 5))

        if many_categories:
            sns.countplot(y=X_train[col], order=bar_order, palette="viridis")
            plt.title(f'{col} Distribution (Horizontal View)')
        else:
            sns.countplot(x=X_train[col], order=bar_order, palette="viridis")
            plt.title(f'{col} Distribution')
            plt.xticks(rotation=45)

        plt.tight_layout()
        plt.show()
        print("-" * 60)

In [None]:
# --- 1. Correlation & Multicollinearity Check ---

# Only numeric feature columns take part. 'Price' is dropped defensively in
# case it is present; errors='ignore' makes that a no-op when it is not.
numeric_df = X_train.select_dtypes(include=['number']).copy()
numeric_df = numeric_df.drop(columns=['Price'], errors='ignore')

# A. Correlation heatmap
plt.figure(figsize=(12, 8))
corr_matrix = numeric_df.corr(numeric_only=True)

# Hide the upper triangle (including the diagonal): it mirrors the lower one
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

sns.heatmap(corr_matrix, mask=mask, annot=True, fmt=".2f", cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap (Numeric Features)')
plt.show()

# B. VIF (the banner glyph was previously garbled by a wrong text encoding)
print("\n🔹 CALCULATING VIF (Multicollinearity Check)...")
print("   Rule: VIF > 5 is suspicious. VIF > 10 is definitely redundant.\n")

# Only complete rows are used for the VIF calculation; an intercept column
# ('const') is added first and removed from the report afterwards.
vif_data = add_constant(numeric_df.dropna())

vif_df = pd.DataFrame({
    "Feature": vif_data.columns,
    "VIF_Score": [
        variance_inflation_factor(vif_data.values, i)
        for i in range(vif_data.shape[1])
    ],
})

# The intercept's own VIF is not of interest
vif_df = vif_df[vif_df["Feature"] != "const"]

# Show the features with the highest VIF first
print(vif_df.sort_values(by="VIF_Score", ascending=False).to_string(index=False))

In [None]:
# ===============================
# Categorical vs Categorical (Redundancy Check)
# ===============================

# Categorical columns to compare pairwise.
# NOTE(review): 'Suburb' is included here although it is high-cardinality;
# 'SellerG' and 'Address' are left out.
target_cols = ['Suburb', 'Type', 'Method', 'Regionname', 'CouncilArea']

# Keep only the columns that still exist in X_train, so a column dropped
# earlier does not cause a KeyError further down
available_cols = X_train.columns.tolist()
cat_features = [col for col in target_cols if col in available_cols]
# ---------- Cramer's V Function (Safe & Correct) ----------
def cramers_v(x, y):
    """Return the bias-corrected Cramer's V association between two categorical Series.

    Parameters
    ----------
    x, y : pandas.Series
        Categorical observations; they are cross-tabulated with ``pd.crosstab``.

    Returns
    -------
    float
        A value between 0 and 1. ``1.0`` is returned directly when both Series
        carry the same (non-None) name; ``0.0`` is returned when either Series
        has fewer than two distinct values, when the two Series have no
        jointly observed rows, or when the corrected denominator is not positive.

    Notes
    -----
    ``scipy.stats.chi2_contingency`` is called with its default settings.
    """
    # Case 1: same named column -> perfect association.
    # Unnamed Series both have name None; that must NOT count as "same column",
    # otherwise any two unnamed Series would be reported as perfectly associated.
    if x.name is not None and x.name == y.name:
        return 1.0

    # Case 2: a column with only 1 unique category -> no association possible
    if x.nunique() < 2 or y.nunique() < 2:
        return 0.0

    # Contingency table of the jointly observed values
    confusion_matrix = pd.crosstab(x, y)

    # No jointly observed rows: chi-square would fail on an empty table
    if confusion_matrix.size == 0:
        return 0.0

    # Chi-square statistic and the uncorrected phi^2
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape

    # Bias correction; numpy warnings are silenced because n - 1 can be 0
    with np.errstate(divide='ignore', invalid='ignore'):
        phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
        rcorr = r - ((r - 1)**2) / (n - 1)
        kcorr = k - ((k - 1)**2) / (n - 1)

        denom = min((kcorr - 1), (rcorr - 1))
        if denom <= 0:
            return 0.0
        return np.sqrt(phi2corr / denom)

# ---------- Calculate Cramer's V Matrix ----------
print("🔄 CALCULATING CATEGORICAL ASSOCIATIONS (Cramer's V)...")
print("   Goal: Find redundant categorical features (Score > 0.80 means redundancy).\n")

# One row of scores per feature, each compared against every feature.
# A dedicated name is used so the notebook-level `rows` (the dataset's row
# count) is not overwritten by this cell.
association_rows = [
    [cramers_v(X_train[var1], X_train[var2]) for var2 in cat_features]
    for var1 in cat_features
]

# Undefined scores (NaN) are shown as 0, i.e. "no measurable association"
cramers_df = pd.DataFrame(association_rows, columns=cat_features, index=cat_features).fillna(0)

# ---------- Plot the Heatmap ----------
plt.figure(figsize=(10, 8))
sns.heatmap(cramers_df, annot=True, fmt=".2f", cmap='Greens', vmin=0, vmax=1)
plt.title("Categorical Association Heatmap (Cramer's V)")
plt.show()

# Print Table
print("\nCramer's V Association Table:")
print(cramers_df)


In [None]:
# --- 3. Numeric vs Target (Individual Scatter Plots) ---

# Temporary frame combining features and target for plotting; a copy is used
# so the real X_train is never modified.
plot_df = X_train.copy()
plot_df['Price'] = y_train

# Key numeric features to compare against the target
features_to_plot = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt', 'Car', 'Propertycount']

print(f"📊 Generating Individual Scatter Plots for {len(features_to_plot)} features vs Price...\n")

# At most 2000 points are drawn, for speed and clarity. The sample does not
# depend on the feature being plotted (fixed seed), so it is drawn once here
# instead of once per loop iteration.
scatter_sample = plot_df.sample(n=min(len(plot_df), 2000), random_state=42)

for col in features_to_plot:
    # Skip features that are no longer present (e.g. dropped earlier)
    if col not in plot_df.columns:
        continue

    plt.figure(figsize=(10, 6))

    # Coloured by 'Type' to see whether property types behave differently
    sns.scatterplot(
        data=scatter_sample,
        x=col,
        y='Price',
        hue='Type',
        alpha=0.6,
        palette='Set1'
    )

    plt.title(f"Price vs {col}")
    plt.xlabel(col)
    plt.ylabel("Price")
    plt.grid(True, alpha=0.3)  # light gridlines for readability
    plt.show()

    print("-" * 80)

In [None]:
# --- 3. Categorical vs Target (Boxplots) ---

# Plotting frame: a copy of the training features with the target attached
plot_df = X_train.copy()
plot_df['Price'] = y_train

# Only the low-cardinality categoricals are plotted
# (Suburb/Council have too many levels for a readable boxplot)
cat_cols = ['Type', 'Method', 'Regionname']
target = 'Price'

print("GENERATING BOXPLOTS...")
print("   Goal: Look for categories that separate the price (boxes at different heights).\n")

# Columns that are no longer present in the frame are silently skipped
for col in (c for c in cat_cols if c in plot_df.columns):
    fig, ax = plt.subplots(figsize=(12, 6))

    # Categories ordered by ascending median price (low -> high)
    median_by_category = plot_df.groupby(col)[target].median()
    order = median_by_category.sort_values().index

    sns.boxplot(data=plot_df, x=col, y=target, order=order, palette='coolwarm', ax=ax)

    ax.set_title(f'{col} vs Price Distribution')
    plt.xticks(rotation=45, ha='right')  # rotated labels stay readable
    ax.grid(True, axis='y', alpha=0.3)
    ax.set_ylabel('Price')
    ax.set_xlabel(col)
    plt.show()

print("-" * 60)

In [None]:
# --- Mutual Information: numeric features vs categorical features ---
# All libraries used here are imported in the notebook's import cell at the top.

# Identify numeric and categorical columns
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
# NOTE(review): categorical_cols includes every object column, so near-unique
# ones such as 'Address' are used as classification targets too — confirm
# whether they should be excluded here.

# --- Step 1: Encode categorical features to integers ---
X_train_encoded = X_train.copy()
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) turns missing values into the literal category 'nan'
    X_train_encoded[col] = le.fit_transform(X_train[col].astype(str))
    label_encoders[col] = le  # store for later use if needed

# --- Step 2: Impute missing numeric values with each column's median ---
numeric_medians = X_train_encoded[numeric_cols].median()
X_train_encoded[numeric_cols] = X_train_encoded[numeric_cols].fillna(numeric_medians)

# --- Step 3: Compute Mutual Information ---
# Rows are numeric features, columns are the categorical "targets"
mi_results = pd.DataFrame(
    {
        cat_col: mutual_info_classif(
            X_train_encoded[numeric_cols],   # predictors (numeric)
            X_train_encoded[cat_col],        # categorical "target"
            discrete_features=False,
            random_state=42
        )
        for cat_col in categorical_cols
    },
    index=numeric_cols,
)

print(mi_results)

# --- Step 4: Interpret results ---
# A numeric feature is flagged when its MI with ANY categorical feature
# exceeds this threshold
threshold = 0.5
redundant_numeric_features = mi_results.index[(mi_results > threshold).any(axis=1)].tolist()

print("✅ Mutual Information computed for numeric vs categorical features")
print("Potentially redundant numeric features based on MI > threshold:")
print(redundant_numeric_features)

plt.figure(figsize=(10,6))
sns.heatmap(mi_results.astype(float), annot=True, cmap='Blues')
plt.title("Mutual Information (Numeric vs Categorical)")
plt.show()