# Solar Generation vs Cloud Cover Analysis

This notebook analyzes how cloud cover affects solar energy generation in California.

**Data Sources:**
- Solar generation: CAISO via GridStatus API
- Weather/cloud cover: NOAA Integrated Surface Database (ISD)

In [None]:
import gridstatus
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
import requests
import os

# Set up data directory
DATA_DIR = '../data/raw'
os.makedirs(DATA_DIR, exist_ok=True)

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
%matplotlib inline

print("✓ Setup complete")

## 1. Fetch Solar Generation Data (CAISO)

We'll use GridStatus to get solar generation data from California's grid operator.

In [None]:
# Initialize the CAISO client from gridstatus
caiso = gridstatus.CAISO()

# Analysis window: Jan 1 - Dec 30, 2024 (2024 matches the weather file
# downloaded later in this notebook)
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 30)
start_str = start_date.strftime('%Y-%m-%d')
end_str = end_date.strftime('%Y-%m-%d')

# Fetch or load cached data (one cache file per date range)
cache_file = f'{DATA_DIR}/solar_fuel_mix_{start_str}_{end_str}.csv'

if os.path.exists(cache_file):
    print(f"Loading from cache: {cache_file}")
    fuel_mix = pd.read_csv(cache_file)
    # A full-year window spans both standard and daylight time, so the cached
    # timestamp strings can carry two different UTC offsets. read_csv's
    # parse_dates leaves such a column as plain objects; parse via UTC instead
    # and convert to Pacific time so the cached path yields a proper
    # tz-aware datetime column.
    # NOTE(review): assumes the cached 'Time' strings include UTC offsets (i.e.
    # gridstatus returned tz-aware Pacific timestamps) - confirm.
    fuel_mix['Time'] = (
        pd.to_datetime(fuel_mix['Time'], utc=True).dt.tz_convert('US/Pacific')
    )
else:
    print(f"Fetching solar data from {start_str} to {end_str}...")
    fuel_mix = caiso.get_fuel_mix(start=start_str, end=end_str)
    fuel_mix.to_csv(cache_file, index=False)
    print(f"✓ Saved to cache")

print(f"\n✓ Loaded {len(fuel_mix):,} observations")
print(f"Date range: {fuel_mix['Time'].min()} to {fuel_mix['Time'].max()}")
fuel_mix.head()


## 2. Visualize Solar Generation

In [None]:
# Line chart of the CAISO solar series over the analysis window
chart_title = f'Solar Generation (CAISO) - {start_str} to {end_str}'
axis_labels = {'Solar': 'Megawatts', 'Time': 'Date/Time'}
fig = px.line(fuel_mix, x='Time', y='Solar', title=chart_title, labels=axis_labels)

# Layout tweaks: unified hover tooltip, explicit axis titles, fixed height
layout_overrides = dict(
    hovermode='x unified',
    xaxis_title='Date/Time',
    yaxis_title='Solar Generation (MW)',
    height=500,
)
fig.update_layout(**layout_overrides)
fig.show()

# Summary statistics; the "daytime" mean averages only rows where Solar > 0
solar_mw = fuel_mix['Solar']
peak_mw = solar_mw.max()
daytime_mean_mw = solar_mw[solar_mw > 0].mean()
peak_time = fuel_mix.loc[solar_mw.idxmax(), 'Time']

print("\nSolar Generation Statistics:")
print(f"  Peak: {peak_mw:,} MW")
print(f"  Mean (daytime): {daytime_mean_mw:.0f} MW")
print(f"  Peak time: {peak_time}")

## 3. Find California Weather Stations

Download the NOAA ISD station inventory to find weather stations in California.

In [None]:
# Download NOAA ISD station inventory (cached locally after the first run)
station_url = "https://www.ncei.noaa.gov/pub/data/noaa/isd-history.txt"
station_cache = f"{DATA_DIR}/isd_station_history.txt"

if not os.path.exists(station_cache):
    print("Downloading station inventory...")
    # Bounded wait so a stalled connection cannot hang the notebook
    response = requests.get(station_url, timeout=60)
    # Fail here instead of caching an HTTP error page as if it were the inventory
    response.raise_for_status()
    with open(station_cache, 'wb') as f:
        f.write(response.content)
    print("✓ Downloaded")
else:
    print("✓ Loading from cache")

# Parse fixed-width format: (start, end) character offsets for each column,
# in the same order as `names`
colspecs = [
    (0, 6), (7, 12), (13, 42), (43, 45), (48, 50), (51, 56),
    (57, 64), (65, 73), (74, 81), (82, 90), (91, 99)
]
names = ['USAF', 'WBAN', 'STATION_NAME', 'CTRY', 'ST', 'ICAO', 
         'LAT', 'LON', 'ELEV', 'BEGIN', 'END']

# skiprows drops the explanatory preamble at the top of the file
stations = pd.read_fwf(station_cache, colspecs=colspecs, names=names, skiprows=20)

# Filter for California with valid coordinates
ca_stations = stations[stations['ST'] == 'CA'].copy()
# The inventory may list LAT/LON either as signed decimal degrees (e.g. +38.507,
# which is what the 7- and 8-character slices above fit) or as integer
# thousandths of a degree. Rescale only when values fall outside the valid
# degree range, so decimal-degree input is not shrunk by a factor of 1000.
for coord, max_abs_degrees in (('LAT', 90), ('LON', 180)):
    degrees = pd.to_numeric(ca_stations[coord], errors='coerce')
    if degrees.abs().max() > max_abs_degrees:
        degrees = degrees / 1000
    ca_stations[coord] = degrees
ca_stations = ca_stations.dropna(subset=['LAT', 'LON'])

# Create station ID: zero-padded 6-character USAF + zero-padded 5-character WBAN
ca_stations['STATION_ID'] = (ca_stations['USAF'].astype(str).str.zfill(6) + 
                               ca_stations['WBAN'].astype(str).str.zfill(5))

# Filter for recent data: keep stations whose period of record ends on or
# after 2024-06-01 (unparseable END values become NaT and are excluded)
ca_stations['END'] = pd.to_datetime(ca_stations['END'], format='%Y%m%d', errors='coerce')
recent_stations = ca_stations[ca_stations['END'] >= '2024-06-01'].copy()

print(f"\n✓ Found {len(recent_stations)} California stations with 2024 data")
print("\nTop 10 stations (by latitude):")
print(recent_stations.sort_values('LAT', ascending=False)[
    ['STATION_ID', 'STATION_NAME', 'LAT', 'LON']
].head(10).to_string(index=False))


## 4. Download Weather Data

Download hourly weather observations including cloud cover from NOAA ISD.

**Station:** Sacramento International Airport (central CA, good solar proxy)


In [None]:
def download_isd_data(station_id, year, timeout=60):
    """Download ISD weather data for a station/year, using a local CSV cache.

    Args:
        station_id: Station identifier used in the NOAA file name
            (``{station_id}.csv``).
        year: Year of observations; selects the remote directory and is part
            of the cache file name.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        pandas.DataFrame read from the cached CSV (``low_memory=False``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    base_url = "https://www.ncei.noaa.gov/data/global-hourly/access"
    url = f"{base_url}/{year}/{station_id}.csv"
    cache_file = f"{DATA_DIR}/isd_{station_id}_{year}.csv"

    if os.path.exists(cache_file):
        print(f"✓ Loading from cache: {cache_file}")
    else:
        print(f"Downloading: {url}")
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Write under a temporary name and rename afterwards: an interrupted
        # write then never leaves a truncated file that later runs would
        # mistake for a complete cache entry.
        partial_file = f"{cache_file}.part"
        with open(partial_file, 'wb') as f:
            f.write(response.content)
        os.replace(partial_file, cache_file)
        print(f"✓ Saved to cache")

    # Both paths read from disk so cached and fresh runs parse identically
    return pd.read_csv(cache_file, low_memory=False)

# Sacramento International Airport
# The ID is the 6-digit USAF code followed by the 5-digit WBAN code.
# NOTE(review): 724830/23232 may be Sacramento Executive Airport (KSAC) rather
# than Sacramento International - confirm against the station inventory.
SACRAMENTO_STATION = '72483023232'

weather_data = download_isd_data(SACRAMENTO_STATION, 2024)

# Report the size of the loaded table, then preview the first rows
n_observations, n_columns = weather_data.shape
print(f"\n✓ Loaded {n_observations:,} weather observations")
print(f"Columns: {n_columns}")
weather_data.head()

## 5. Parse Cloud Cover Data

Extract cloud cover from the GD1 column (Sky Cover Summation).
- Cloud cover is measured in **oktas** (0-8 scale)
- 0 = clear sky, 8 = completely overcast, 9 = sky obscured (fog, snow, or another element blocking the view of the sky); values of 9 are treated as missing below


In [None]:
def parse_cloud_cover(gd1_value):
    """Extract cloud cover in oktas (0-8) from GD1 field.

    Reads the first comma-separated element of ``gd1_value`` as an integer.

    Args:
        gd1_value: Raw GD1 cell value (a comma-separated string, or NaN/None
            when the observation has no GD1 entry).

    Returns:
        The integer value when it lies in 0-8, otherwise ``None``. ``None`` is
        returned for missing input, for the code 9, for any other value
        outside 0-8, and for text that does not parse as an integer.

    NOTE(review): NOAA's ISD format document appears to define the first GD1
    element as a categorical coverage code (0 = clear ... 4 = overcast) and
    the second element as the okta count - confirm which element should be
    read before interpreting the result as oktas.
    """
    if pd.isna(gd1_value):
        return None
    try:
        oktas = int(str(gd1_value).split(',')[0])
    except ValueError:
        # Non-numeric or empty first element: treat as missing
        return None
    # Reject 9 and anything else outside the documented 0-8 scale
    return oktas if 0 <= oktas <= 8 else None

# Derive a timestamp column plus cloud cover in oktas and as a percentage
# of the 8-okta scale
weather_data['datetime'] = pd.to_datetime(weather_data['DATE'])
parsed_oktas = weather_data['GD1'].apply(parse_cloud_cover)
weather_data['cloud_cover_oktas'] = parsed_oktas
weather_data['cloud_cover_pct'] = weather_data['cloud_cover_oktas'] / 8 * 100

# Keep only observations inside the analysis window (both ends inclusive)
in_window = weather_data['datetime'].between(start_date, end_date)
weather_filtered = weather_data.loc[in_window].copy()

# Report how many rows were kept and how the okta values are distributed
okta_counts = weather_filtered['cloud_cover_oktas'].value_counts().sort_index()
mean_oktas = weather_filtered['cloud_cover_oktas'].mean()
mean_pct = weather_filtered['cloud_cover_pct'].mean()

print(f"✓ Parsed cloud cover for {len(weather_filtered):,} observations")
print("\nCloud cover distribution (oktas):")
print(okta_counts)
print(f"\nMean cloud cover: {mean_oktas:.1f} oktas ({mean_pct:.1f}%)")
