In [3]:
# ==========================================
# HOMEWORK 1: Data Collection and Analysis
# Student Name: [Your Name]
# City: London
# Data Source: Real REST APIs (OpenWeather, OpenAQ, London Datastore)
# ==========================================

import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
from datetime import datetime, timedelta

# Machine Learning Imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Visualization Settings
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("Libraries loaded successfully.")

Libraries loaded successfully.


In [4]:
# CONFIGURATION
# ==========================================
CITY = "London"
LAT = 51.5074
LON = -0.1278

# Time Period: Jan 1, 2024 to June 30, 2024
START_DATE = datetime(2024, 1, 1)
END_DATE = datetime(2024, 6, 30)

# API KEYS
OPENWEATHER_API_KEY = "325b69ff5a70f7ff71ba31d1474d270f"
OPENAQ_API_KEY = "2f891ea1344c958c5486d58aca8fe88b3b02e01043140eb06847fb8e1f4cd75b"

# File Paths (Local storage)
WEATHER_FILE = "london_weather_real.csv"
AIR_QUALITY_FILE = "london_aq_real.csv"
MOBILITY_FILE = "london_mobility_real.csv"

In [5]:
def fetch_weather_history():
    print(f"Starting Weather Collection for {CITY} ({START_DATE.date()} to {END_DATE.date()})...")
    
    url = "https://api.openweathermap.org/data/3.0/onecall/timemachine"
    data_records = []
    current_date = START_DATE
    
    while current_date <= END_DATE:
        # Unix timestamp for the specific day at noon
        dt_stamp = int(current_date.replace(hour=12, minute=0).timestamp())
        
        params = {
            'lat': LAT,
            'lon': LON,
            'dt': dt_stamp,
            'appid': OPENWEATHER_API_KEY,
            'units': 'metric'
        }
        
        try:
            response = requests.get(url, params=params)
            if response.status_code == 200:
                resp_json = response.json()
                # 'data' is a list of hourly entries for that requested time
                # We take the first one (closest to noon)
                if 'data' in resp_json and len(resp_json['data']) > 0:
                    day_data = resp_json['data'][0]
                    data_records.append({
                        'date': current_date.strftime('%Y-%m-%d'),
                        'temp': day_data.get('temp'),
                        'humidity': day_data.get('humidity'),
                        'wind_speed': day_data.get('wind_speed'),
                        'pressure': day_data.get('pressure'),
                        'condition': day_data['weather'][0]['main'] if 'weather' in day_data else 'Unknown'
                    })
            else:
                print(f"Skipping {current_date.date()}: API Status {response.status_code}")
                
        except Exception as e:
            print(f"Error on {current_date.date()}: {e}")
        
        # Advance one day
        current_date += timedelta(days=1)
        # Polite delay
        time.sleep(0.1)
        
    # Save
    if data_records:
        df = pd.DataFrame(data_records)
        df.to_csv(WEATHER_FILE, index=False)
        print(f"Success: Saved {len(df)} weather records to {WEATHER_FILE}")
    else:
        print("Failure: No weather data collected. Check API Key subscription type.")

# Execute Collection
fetch_weather_history()

Starting Weather Collection for London (2024-01-01 to 2024-06-30)...
Skipping 2024-01-01: API Status 401
Skipping 2024-01-02: API Status 401
Skipping 2024-01-03: API Status 401
Skipping 2024-01-04: API Status 401
Skipping 2024-01-05: API Status 401
Skipping 2024-01-06: API Status 401
Skipping 2024-01-07: API Status 401
Skipping 2024-01-08: API Status 401
Skipping 2024-01-09: API Status 401
Skipping 2024-01-10: API Status 401
Skipping 2024-01-11: API Status 401
Skipping 2024-01-12: API Status 401
Skipping 2024-01-13: API Status 401
Skipping 2024-01-14: API Status 401
Skipping 2024-01-15: API Status 401
Skipping 2024-01-16: API Status 401
Skipping 2024-01-17: API Status 401
Skipping 2024-01-18: API Status 401
Skipping 2024-01-19: API Status 401
Skipping 2024-01-20: API Status 401
Skipping 2024-01-21: API Status 401
Skipping 2024-01-22: API Status 401
Skipping 2024-01-23: API Status 401
Skipping 2024-01-24: API Status 401
Skipping 2024-01-25: API Status 401
Skipping 2024-01-26: API Status

In [6]:
def fetch_air_quality():
    print(f"Starting Air Quality Collection for {CITY}...")
    
    url = "https://api.openaq.org/v2/measurements"
    headers = {"X-API-Key": OPENAQ_API_KEY}
    
    params = {
        'city': 'London',
        'country': 'GB',
        'parameter': 'pm25',
        'date_from': START_DATE.strftime('%Y-%m-%d'),
        'date_to': END_DATE.strftime('%Y-%m-%d'),
        'limit': 10000, 
        'order_by': 'datetime'
    }
    
    try:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            results = response.json().get('results', [])
            records = []
            
            for item in results:
                # We extract the UTC date and value
                records.append({
                    'date': item['date']['utc'][:10], # Extract YYYY-MM-DD
                    'pm25': item['value'],
                    'unit': item['unit']
                })
            
            if records:
                df = pd.DataFrame(records)
                # Group by date to get daily average (sensors report hourly)
                df_daily = df.groupby('date')['pm25'].mean().reset_index()
                df_daily.to_csv(AIR_QUALITY_FILE, index=False)
                print(f"Success: Saved {len(df_daily)} daily air quality records to {AIR_QUALITY_FILE}")
            else:
                print("Warning: No records found for this date range/city.")
        else:
            print(f"API Error: {response.status_code} - {response.text}")
            
    except Exception as e:
        print(f"Exception: {e}")

# Execute Collection
fetch_air_quality()

Starting Air Quality Collection for London...
API Error: 410 - {"message": "Gone. Version 1 and Version 2 API endpoints are retired and no longer available. Please migrate to Version 3 endpoints."}


In [7]:
def fetch_mobility():
    print("Fetching TfL Mobility Data...")
    # Direct CSV link for TfL Journeys (Publicly available, reliable URL)
    csv_url = "https://data.london.gov.uk/download/tfl-journeys-type/f3d9796d-37bf-42a3-92c1-c4269a6572eb/tfl-journeys-type.csv"
    
    try:
        df = pd.read_csv(csv_url)
        
        # Rename columns to standardized names
        # The dataset usually has 'Period and Financial Year' or 'Reporting Period'
        df.to_csv(MOBILITY_FILE, index=False)
        print(f"Success: Downloaded mobility data to {MOBILITY_FILE}")
        
    except Exception as e:
        print(f"Error fetching mobility data: {e}")

# Execute Collection
fetch_mobility()

Fetching TfL Mobility Data...
Error fetching mobility data: HTTP Error 404: Not Found


In [8]:
print("Processing and Merging Data...")

# 1. Load Data
df_weather = pd.read_csv(WEATHER_FILE)
df_aq = pd.read_csv(AIR_QUALITY_FILE)
df_mobility = pd.read_csv(MOBILITY_FILE)

# 2. Date Conversion
df_weather['date'] = pd.to_datetime(df_weather['date'])
df_aq['date'] = pd.to_datetime(df_aq['date'])

# 3. Clean Mobility Data
# The TfL file is often summarized by "Period". We need to approximate or filter.
# We will create a helper column to merge on Month-Year.
df_mobility['period_start'] = pd.to_datetime(df_mobility['Period beginning'], errors='coerce')
# Filter for our range (roughly)
mask = (df_mobility['period_start'] >= '2023-12-01') & (df_mobility['period_start'] <= '2024-07-01')
df_mob_filtered = df_mobility.loc[mask].copy()

# Simplify: Aggregate total journeys per period (Bus + Tube + etc)
# Summing all transport modes (columns after specific indices usually hold the counts)
# We will just pick 'Bus journeys (m)' and 'Underground journeys (m)' as proxies
df_mob_filtered['total_trips'] = df_mob_filtered['Bus journeys (m)'] + df_mob_filtered['Underground journeys (m)']
df_mob_filtered = df_mob_filtered[['period_start', 'total_trips']]

# 4. Merging Strategy: Forward Fill Mobility
# Since mobility is monthly/4-weekly, we assign the same traffic value to every day in that period.
df_base = pd.merge(df_weather, df_aq, on='date', how='inner')

# Sort to ensure proper filling
df_base = df_base.sort_values('date')
df_mob_filtered = df_mob_filtered.sort_values('period_start')

# Merge using 'merge_asof' to find the closest previous mobility period
df_final = pd.merge_asof(
    df_base, 
    df_mob_filtered, 
    left_on='date', 
    right_on='period_start', 
    direction='backward'
)

# Final Cleanup
df_final = df_final.dropna()
df_final['day_of_week'] = df_final['date'].dt.day_name()

print(f"Final Data Shape: {df_final.shape}")
df_final.head()

Processing and Merging Data...


FileNotFoundError: [Errno 2] No such file or directory: 'london_weather_real.csv'

In [None]:
# 1. Correlations
plt.figure(figsize=(10, 6))
numeric_df = df_final.select_dtypes(include=[np.number])
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix: Weather, PM2.5, and Mobility")
plt.show()

# 2. Time Series
fig, ax1 = plt.subplots(figsize=(14, 6))

color = 'tab:red'
ax1.set_xlabel('Date')
ax1.set_ylabel('PM2.5 Level', color=color)
ax1.plot(df_final['date'], df_final['pm25'], color=color, label='PM2.5')
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  
color = 'tab:blue'
ax2.set_ylabel('Temperature (Â°C)', color=color)
ax2.plot(df_final['date'], df_final['temp'], color=color, linestyle='--', label='Temp')
ax2.tick_params(axis='y', labelcolor=color)

plt.title("PM2.5 Levels vs Temperature in London (2024)")
fig.tight_layout()
plt.show()

In [None]:
# Task: Predict PM2.5 using Weather and Mobility data
features = ['temp', 'humidity', 'wind_speed', 'total_trips']
target = 'pm25'

# Prepare Data
X = df_final[features]
y = df_final[target]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Models
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

print("Models Trained.")

In [None]:
# Evaluation Function
def print_metrics(model_name, y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"[{model_name}] RMSE: {rmse:.4f} | MAE: {mae:.4f} | R2: {r2:.4f}")

# Predictions
y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_rf = rf_model.predict(X_test_scaled)

print("--- Model Evaluation ---")
print_metrics("Linear Regression", y_test, y_pred_lr)
print_metrics("Random Forest    ", y_test, y_pred_rf)

# Feature Importance (Random Forest)
plt.figure(figsize=(8, 4))
importances = pd.Series(rf_model.feature_importances_, index=features).sort_values(ascending=False)
importances.plot(kind='bar')
plt.title("Feature Importance for Predicting PM2.5")
plt.ylabel("Importance Score")
plt.tight_layout()
plt.show()