# UK Housing Price Prediction - Data Loading (2005-2017)

**Author:** Abdul Salam Aldabik  
**Date:** November 2025  
**Course:** CloudAI - Machine Learning Project  
**Dataset:** UK Housing Prices (2005-2017)

---

## Objective
Load the UK housing price-paid records, keep only the transactions in the target period, and prepare them for analysis:
- Filter to the 2005-2017 time range (13 years)
- Create temporal features
- Generate summary statistics
- Save processed dataset

## CloudAI Reference
- **Chapter 5:** Data Augmentation - Strategic sampling
- **Chapter 6:** Time Series - Temporal feature extraction

---

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

# Plot defaults applied to every figure created later in the notebook:
# whitegrid theme, 'husl' seaborn palette, and a 14x6 inch default canvas.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
plt.rcParams.update({'figure.figsize': (14, 6)})

print("✓ Libraries loaded")

## 2. Setup Paths

In [None]:
# Every input and output lives under ../Data (relative to the working directory).
DATA_DIR = Path('..') / 'Data'

# Raw CSV that is read, and the Parquet file the processed frame is saved to.
DATA_FILE = DATA_DIR.joinpath('price_paid_records.csv')
OUTPUT_FILE = DATA_DIR.joinpath('housing_2005_2017.parquet')

# Folder for the figures and the text summary; created on first run.
# Note: only the leaf folder is created, so ../Data itself must already exist.
OUTPUT_DIR = DATA_DIR.joinpath('loading_output')
OUTPUT_DIR.mkdir(exist_ok=True)

print(f"✓ Output directory: {OUTPUT_DIR}")

## 3. Load Data with Time Filtering

**Strategy:** Load in chunks and filter by date range to manage memory efficiently.

In [None]:
# Inclusive date range for filtering (ISO 'YYYY-MM-DD' strings)
START_DATE = '2005-01-01'
END_DATE = '2017-12-31'

print(f"Loading data from {START_DATE} to {END_DATE}...")
print("This may take several minutes...\n")

# Compare against real timestamps rather than strings. The upper bound is
# half-open (midnight of the day after END_DATE) so that rows carrying a time
# of day on END_DATE itself are kept; `<= '2017-12-31'` would only match
# midnight of that day.
range_start = pd.Timestamp(START_DATE)
range_end_exclusive = pd.Timestamp(END_DATE) + pd.Timedelta(days=1)

# Read the CSV in fixed-size chunks and keep only the matching rows of each,
# so the unfiltered file is never held in memory all at once
chunk_size = 100000
chunks = []

# parse_dates=[2] parses the third CSV column as datetime.
# NOTE(review): this assumes the third column is the one that becomes
# 'date_of_transfer' after the header clean-up below - confirm against the CSV.
for i, chunk in enumerate(pd.read_csv(DATA_FILE, chunksize=chunk_size, parse_dates=[2]), 1):
    # Normalise headers: trim, lower-case, spaces and slashes -> underscores.
    # regex=False makes the replacements literal on every pandas version.
    chunk.columns = (
        chunk.columns.str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('/', '_', regex=False)
    )

    # Keep rows with START_DATE <= date_of_transfer < day after END_DATE
    transfer_date = chunk['date_of_transfer']
    in_range = (transfer_date >= range_start) & (transfer_date < range_end_exclusive)
    chunk_filtered = chunk[in_range]

    if not chunk_filtered.empty:
        chunks.append(chunk_filtered)

    # Progress update every 10 chunks (row count is an upper bound: i full chunks)
    if i % 10 == 0:
        print(f"  Processed {i * chunk_size:,} rows...")

# pd.concat([]) fails with an unhelpful "No objects to concatenate";
# raise the same exception type with a message that names the actual problem.
if not chunks:
    raise ValueError(
        f"No transactions between {START_DATE} and {END_DATE} found in {DATA_FILE}"
    )

# Combine all retained chunks into one frame with a fresh 0..n-1 index
df = pd.concat(chunks, ignore_index=True)

print(f"\n✓ Data loaded: {len(df):,} transactions")
print(f"✓ Date range: {df['date_of_transfer'].min()} to {df['date_of_transfer'].max()}")
print(f"✓ Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

## 4. Create Temporal Features

Extract time-based features for analysis and modeling.

In [None]:
# Derive calendar features from the transfer date. Each of these three
# columns has the same name as the `.dt` attribute it is read from, so a
# single loop covers them.
for part in ('year', 'month', 'quarter'):
    df[part] = getattr(df['date_of_transfer'].dt, part)

# Monthly period of the transfer date (pandas Period with 'M' frequency)
df['year_month'] = df['date_of_transfer'].dt.to_period('M')

print("✓ Temporal features created")
year_col = df['year']
print(f"  Years covered: {year_col.min()} - {year_col.max()}")
print(f"  Unique months: {df['year_month'].nunique()}")

## 5. Data Quality Summary

In [None]:
# Build the whole report first, then emit it with one print call.
# Entries that start with "\n" produce the blank separator lines.
price = df['price']
n_rows, n_cols = df.shape
quality_report = [
    "=== DATA QUALITY SUMMARY ===",
    f"Total transactions: {n_rows:,}",
    f"Columns: {n_cols}",
    # Total count of missing cells over all columns
    f"\nMissing values: {df.isna().sum().sum()}",
    "\nPrice statistics:",
    f"  Mean: £{price.mean():,.2f}",
    f"  Median: £{price.median():,.0f}",
    f"  Range: £{price.min():,.0f} - £{price.max():,.0f}",
]
print("\n".join(quality_report))

## 6. Visualizations

### 6.1 Transaction Volume by Year

In [None]:
# Number of transactions per year (index: year, value: row count).
# Also written to the summary report later in the notebook.
yearly_counts = df.groupby('year').size()

axis_label_font = dict(fontsize=12, fontweight='bold')

fig, ax = plt.subplots(figsize=(14, 6))
bars = ax.bar(
    yearly_counts.index,
    yearly_counts.values,
    color='steelblue',
    alpha=0.8,
    edgecolor='black',
    linewidth=1.2,
)

# Annotate each bar with its count in whole thousands, centred on its top edge
for rect in bars:
    top = rect.get_height()
    centre_x = rect.get_x() + rect.get_width() / 2.
    ax.text(centre_x, top, f'{int(top/1000)}K',
            ha='center', va='bottom', fontsize=10, fontweight='bold')

ax.set_xlabel('Year', **axis_label_font)
ax.set_ylabel('Number of Transactions', **axis_label_font)
ax.set_title('UK Housing Transactions by Year (2005-2017)',
             fontsize=14, fontweight='bold', pad=20)
ax.grid(axis='y', alpha=0.3)

# Shade the bars for 2008 and 2009 (x from 2007.5 to 2009.5) as the crisis window
ax.axvspan(2007.5, 2009.5, alpha=0.2, color='red', label='Financial Crisis')
ax.legend(loc='upper right', fontsize=10)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / '01_yearly_volume.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Saved: 01_yearly_volume.png")

### 6.2 Price Trends Over Time

In [None]:
# Mean and median sale price per year (columns: 'mean', 'median')
yearly_price = df.groupby('year')['price'].agg(['mean', 'median'])

fig, ax = plt.subplots(figsize=(14, 6))

# One line per statistic: (column, marker, legend label, colour)
line_specs = (
    ('mean', 'o', 'Mean Price', '#2E86AB'),
    ('median', 's', 'Median Price', '#A23B72'),
)
for column, marker, label, colour in line_specs:
    ax.plot(yearly_price.index, yearly_price[column],
            marker=marker, linewidth=2.5, markersize=8, label=label, color=colour)

ax.set_xlabel('Year', fontsize=12, fontweight='bold')
ax.set_ylabel('Price (£)', fontsize=12, fontweight='bold')
ax.set_title('UK House Price Trends (2005-2017)',
             fontsize=14, fontweight='bold', pad=20)


def _pounds_in_thousands(value, _pos):
    """Format a y-axis tick value as whole thousands of pounds, e.g. 250000 -> '£250K'."""
    return f'£{value/1000:.0f}K'


ax.yaxis.set_major_formatter(plt.FuncFormatter(_pounds_in_thousands))
ax.legend(loc='upper left', fontsize=11)
ax.grid(alpha=0.3)

# Shade 2008-2009 and caption the band just below the highest yearly mean
ax.axvspan(2007.5, 2009.5, alpha=0.15, color='red')
caption_y = yearly_price['mean'].max() * 0.95
ax.text(2008.5, caption_y, 'Financial\nCrisis',
        ha='center', fontsize=10, fontweight='bold', color='darkred')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / '02_price_trends.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Saved: 02_price_trends.png")

### 6.3 Seasonal Patterns

In [None]:
# Transactions per calendar month, pooled across all years. Reindexing to
# 1..12 guarantees twelve values in calendar order: a month with no rows
# becomes a zero-height bar instead of making ax.bar() fail on a length
# mismatch with the twelve x positions below.
monthly_counts = df.groupby('month').size().reindex(range(1, 13), fill_value=0)
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

fig, ax = plt.subplots(figsize=(14, 6))

# One colour per month taken from the 'coolwarm' palette
colors = sns.color_palette('coolwarm', 12)
bars = ax.bar(monthly_counts.index, monthly_counts.values, color=colors,
              alpha=0.8, edgecolor='black', linewidth=1.2)

ax.set_xlabel('Month', fontsize=12, fontweight='bold')
ax.set_ylabel('Total Transactions', fontsize=12, fontweight='bold')
ax.set_title('Seasonal Pattern: Housing Transactions by Month (2005-2017)',
             fontsize=14, fontweight='bold', pad=20)
ax.set_xticks(range(1, 13))
ax.set_xticklabels(month_names)
ax.grid(axis='y', alpha=0.3)

# Highlight spring surge (Mar-May): x from 2.5 to 5.5 covers bars 3, 4 and 5
ax.axvspan(2.5, 5.5, alpha=0.1, color='green', label='Spring Surge')
ax.legend(loc='upper right', fontsize=10)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / '03_seasonal_pattern.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Saved: 03_seasonal_pattern.png")

### 6.4 Property Type Distribution

In [None]:
# Rows per property type, most frequent first.
# Also written to the summary report later in the notebook.
prop_counts = df['property_type'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
bar_ax, pie_ax = axes

# Left panel: bar chart annotated with the count in millions and the share of all rows
bars = bar_ax.bar(prop_counts.index, prop_counts.values,
                  color='coral', alpha=0.8, edgecolor='black', linewidth=1.2)
total_rows = len(df)
for rect, count in zip(bars, prop_counts.values):
    pct = count / total_rows * 100
    bar_ax.text(rect.get_x() + rect.get_width() / 2., count,
                f'{count/1000000:.1f}M\n({pct:.1f}%)',
                ha='center', va='bottom', fontsize=10, fontweight='bold')

bar_ax.set_xlabel('Property Type', fontsize=12, fontweight='bold')
bar_ax.set_ylabel('Count', fontsize=12, fontweight='bold')
bar_ax.set_title('Property Type Distribution', fontsize=13, fontweight='bold')
bar_ax.grid(axis='y', alpha=0.3)

# Right panel: the same counts as a pie chart, one 'husl' colour per type
colors_pie = sns.color_palette('husl', len(prop_counts))
pie_text_style = {'fontsize': 11, 'fontweight': 'bold'}
pie_ax.pie(prop_counts.values, labels=prop_counts.index, autopct='%1.1f%%',
           startangle=90, colors=colors_pie, textprops=pie_text_style)
pie_ax.set_title('Property Type Percentage', fontsize=13, fontweight='bold')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / '04_property_types.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Saved: 04_property_types.png")

### 6.5 Price Distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
full_ax, zoom_ax = axes

# The median is drawn on both panels, so compute it once
median_price = df['price'].median()

# Left panel: histogram over the full price range
full_ax.hist(df['price'], bins=100, color='steelblue', alpha=0.7, edgecolor='black')
full_ax.axvline(median_price, color='red', linestyle='--', linewidth=2,
                label=f'Median: £{median_price:,.0f}')
full_ax.set_title('Price Distribution - Full Range', fontsize=13, fontweight='bold')
full_ax.legend(fontsize=10)

# Right panel: same histogram restricted to prices at or below the 99th
# percentile, so the bulk of the distribution is visible
price_99 = df['price'].quantile(0.99)
price_filtered = df.loc[df['price'] <= price_99, 'price']
zoom_ax.hist(price_filtered, bins=100, color='green', alpha=0.7, edgecolor='black')
zoom_ax.axvline(median_price, color='red', linestyle='--', linewidth=2)
zoom_ax.set_title(f'Price Distribution - Bottom 99% (≤£{price_99:,.0f})',
                  fontsize=13, fontweight='bold')

# Axis labels and grid are identical on both panels
for panel in (full_ax, zoom_ax):
    panel.set_xlabel('Price (£)', fontsize=12, fontweight='bold')
    panel.set_ylabel('Frequency', fontsize=12, fontweight='bold')
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / '05_price_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Saved: 05_price_distribution.png")

## 7. Save Processed Data

In [None]:
# Persist the processed frame as gzip-compressed Parquet, without the row index
df.to_parquet(OUTPUT_FILE, index=False, compression='gzip')

# Report the on-disk size in MiB together with the frame's dimensions
file_size = OUTPUT_FILE.stat().st_size / (1024 * 1024)
n_rows, n_cols = df.shape
print(f"✓ Data saved: {OUTPUT_FILE.name}")
print(f"  File size: {file_size:.2f} MB")
print(f"  Rows: {n_rows:,}")
print(f"  Columns: {n_cols}")

## 8. Create Summary Report

In [None]:
# Plain-text report of the load step. Relies on `yearly_counts` and
# `prop_counts` computed in the visualization cells above.
summary_file = OUTPUT_DIR / 'loading_summary.txt'

# The report contains the non-ASCII '£' sign, so the encoding is pinned to
# UTF-8: with the platform default, the write can raise UnicodeEncodeError
# (e.g. under an ASCII/C locale) or yield a file whose bytes depend on the
# machine it was generated on.
with open(summary_file, 'w', encoding='utf-8') as f:
    # Banner and generation timestamp (local time)
    f.write("=" * 80 + "\n")
    f.write("UK HOUSING DATA - LOADING SUMMARY (2005-2017)\n")
    f.write("=" * 80 + "\n\n")
    f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

    f.write("DATA SELECTION:\n")
    f.write("  Time Range: 2005-2017\n")
    f.write(f"  Total Records: {len(df):,}\n\n")

    f.write("PRICE STATISTICS:\n")
    f.write(f"  Mean: £{df['price'].mean():,.2f}\n")
    f.write(f"  Median: £{df['price'].median():,.0f}\n")
    f.write(f"  Min: £{df['price'].min():,.0f}\n")
    f.write(f"  Max: £{df['price'].max():,.0f}\n\n")

    # Per-year and per-property-type tables, rendered by pandas
    f.write("TRANSACTIONS BY YEAR:\n")
    f.write(yearly_counts.to_string())
    f.write("\n\n")

    f.write("PROPERTY TYPE DISTRIBUTION:\n")
    f.write(prop_counts.to_string())

print(f"\n✓ Summary report saved: {summary_file.name}")

## 9. Summary

### Data Loaded:
- **Records:** 11.1M+ transactions
- **Period:** 2005-2017 (13 years)
- **Features:** Temporal features added (`year`, `month`, `quarter`, `year_month`)

### Key Insights:
1. **2008 Financial Crisis:** Clear drop in transaction volume
2. **Recovery:** Gradual increase from 2013 onwards
3. **Seasonality:** Spring months show higher activity
4. **Property Types:** Terraced houses most common

### Next Steps:
1. Add economic indicators (Bank of England data)
2. Merge datasets
3. Data cleaning and outlier handling
4. Feature engineering

---

**Notebook Complete**