# Data Verification: Boston Properties

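This notebook loads the exported Boston properties CSV and runs basic integrity checks on it: shape and column names, data types, missing values, head/tail samples, numerical summary statistics, candidate primary keys, and duplicate rows.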


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path


## Load Data


In [None]:
# Read the exported property records; the path is relative to the
# notebook's working directory.
data_path = Path("..") / "data" / "raw" / "boston_properties.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Only reached when read_csv did not raise.
print(f"Data loaded successfully from: {data_path}")


## Basic Information


In [None]:
# Section banner followed by the frame's overall dimensions.
banner = "=" * 50
n_rows, n_cols = df.shape

print(banner)
print("BASIC INFORMATION")
print(banner)
print(f"Total rows: {n_rows:,}")
print(f"Total columns: {n_cols}")
print(f"Shape: {df.shape}")

# All column labels on a single comma-separated line.
column_listing = ', '.join(df.columns.tolist())
print(f"\nColumn names: {column_listing}")


## Data Types


In [None]:
# Banner, then the dtype pandas inferred for each column.
print("=" * 50, "DATA TYPES", "=" * 50, sep="\n")
print(df.dtypes)


## Missing Values


In [None]:
banner = "=" * 50
print(banner)
print("MISSING VALUES")
print(banner)

# Null tally per column and the same tally as a share of all rows.
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

# One row per column, then keep only the columns that have at least one
# null, ordered from most to fewest nulls.
missing_df = pd.DataFrame(
    {
        'Column': missing.index,
        'Missing Count': missing.values,
        'Missing Percentage': missing_pct.values,
    }
)
has_nulls = missing_df['Missing Count'] > 0
missing_df = missing_df[has_nulls].sort_values('Missing Count', ascending=False)

if len(missing_df) == 0:
    print("No missing values found!")
else:
    print(missing_df.to_string(index=False))


## First 5 Rows


In [None]:
# Banner, then the first five records as the cell's rich output.
banner = "=" * 50
print(f"{banner}\nFIRST 5 ROWS\n{banner}")
df.head(5)


## Last 5 Rows


In [None]:
# Banner, then the final five records as the cell's rich output.
banner = "=" * 50
print(f"{banner}\nLAST 5 ROWS\n{banner}")
df.tail(5)


## Basic Statistics for Numerical Columns


In [None]:
print("=" * 50, "NUMERICAL STATISTICS", "=" * 50, sep="\n")

# Labels of every column stored with a numeric dtype.
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)

if not numerical_cols:
    print("No numerical columns found.")
else:
    print(f"Numerical columns: {', '.join(numerical_cols)}")
    print("\n")
    # describe() summarises count, mean, std, min, quartiles and max.
    print(df.loc[:, numerical_cols].describe())


## Primary Key Identification


In [None]:
print("=" * 50)
print("PRIMARY KEY CHECK")
print("=" * 50)

# Conventional identifier column names, tried in this order.
pk_candidates = ['id', 'property_id', 'parcel_id', 'parcel_number', 'pid']


def _is_key_like(series):
    """Return True when ``series`` could identify every row.

    ``Series.is_unique`` alone is not sufficient: it is True for an empty
    column and for a column containing a single NaN. A usable key must be
    non-empty, contain no nulls, and have no repeated values.
    """
    if len(series) == 0:
        return False
    if series.isnull().any():
        return False
    return bool(series.is_unique)


found_pk = None
for col in pk_candidates:
    if col not in df.columns:
        continue
    if _is_key_like(df[col]):
        found_pk = col
        print(f"✓ Found unique column: {col}")
        print(f"  Total unique values: {df[col].nunique():,}")
        print(f"  Total rows: {len(df):,}")
        break
    # A conventionally named identifier that fails the check is a data-quality
    # finding in itself, so report why instead of skipping it silently.
    null_count = df[col].isnull().sum()
    repeated_count = df[col].duplicated().sum()
    print(f"✗ Column '{col}' is not a valid key: "
          f"{null_count:,} null values, {repeated_count:,} repeated values")

# Compare against None so the fallback depends only on "nothing found",
# not on the truthiness of the column label.
if found_pk is None:
    print("Checking all columns for uniqueness...")
    for col in df.columns:
        if _is_key_like(df[col]):
            print(f"✓ Found unique column: {col}")
            print(f"  Total unique values: {df[col].nunique():,}")
            found_pk = col
            break

    if found_pk is None:
        print("✗ No unique column found (potential primary key)")


## Duplicate Rows Check


In [None]:
banner = "=" * 50
print(banner)
print("DUPLICATE ROWS CHECK")
print(banner)

# Rows that exactly repeat an earlier row; the first occurrence of each
# group is not counted.
duplicate_count = df.duplicated().sum()
print(f"Total duplicate rows: {duplicate_count:,}")

if duplicate_count == 0:
    print("✓ No duplicate rows found!")
else:
    duplicate_pct = (duplicate_count / len(df)) * 100
    print(f"Percentage of duplicates: {duplicate_pct:.2f}%")
    print("\nSample duplicate rows:")
    # keep=False flags every member of a repeated group, including the
    # first occurrence, so the sample shows originals alongside copies.
    in_repeated_group = df.duplicated(keep=False)
    print(df.loc[in_repeated_group].head(10))
