# Solar Power Prediction - Data Exploration

This notebook explores the solar power generation dataset and weather data to understand patterns and relationships.

## Objectives
- Load and examine the dataset structure
- Analyze power generation patterns
- Explore weather data correlations
- Visualize key relationships

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, deprecation notices included — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set plotting style
# Keep the style call ahead of set_palette: applying a style sheet presumably
# resets the colour cycle, which would discard the palette — TODO confirm.
plt.style.use('default')
sns.set_palette('husl')

print("Libraries imported successfully!")

## 1. Load Sample Data

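For this demonstration, we'll load a sample of the processed data. In a full implementation, you would load your raw data files. If the sample file is missing, the cell below falls back to synthetic, randomly generated data so the notebook can still run end to end; any patterns or correlations seen in that case are not meaningful.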

In [None]:
# Load processed sample data. If the file is missing, fall back to synthetic
# data so the remaining cells can still be executed end to end.
try:
    data = pd.read_csv('../data/processed_solar_sample.csv')
except FileNotFoundError:
    print("Sample data not found. Please run the preprocessing script first.")
    # Create sample data for demonstration. Every column is an independent
    # random draw, so the result carries no real relationship between features.
    np.random.seed(42)
    n_samples = 1000
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in pandas 2.2+.
    timestamps = pd.date_range('2023-01-01', periods=n_samples, freq='h')
    data = pd.DataFrame({
        'Time': timestamps,
        'Power_W': np.random.normal(2000, 500, n_samples),
        # Irradiance cannot be negative: clip the lower tail of the normal draw.
        'Irradiance': np.clip(np.random.normal(400, 200, n_samples), 0, None),
        'Temperature': np.random.normal(25, 5, n_samples),
        # Derive the hour from the timestamp so 'Hour' and 'Time' agree.
        'Hour': timestamps.hour.to_numpy(),
    })
    print("Created sample data for demonstration")
else:
    # read_csv returns timestamps as plain strings; convert them so the
    # time-based plots get a real datetime axis. Unparseable entries become NaT.
    if 'Time' in data.columns and pd.api.types.is_string_dtype(data['Time']):
        data['Time'] = pd.to_datetime(data['Time'], errors='coerce')
    print("Dataset loaded successfully!")
    print(f"Shape: {data.shape}")
    print(f"Columns: {list(data.columns)}")

In [None]:
# Display basic information about the dataset
print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
data.info()
print("\nFirst 5 rows:")
# Left as the last expression so the notebook renders it as a rich table.
data.head()

In [None]:
# Descriptive statistics computed by DataFrame.describe()
print("Statistical Summary:")
summary_stats = data.describe()
# Last expression of the cell, so the notebook renders it as a table.
summary_stats

## 2. Power Generation Analysis

In [None]:
# Power generation patterns: overall distribution, time series, and
# hour-of-day views arranged on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Power distribution
axes[0, 0].hist(data['Power_W'], bins=50, alpha=0.7, edgecolor='black')
axes[0, 0].set_title('Power Generation Distribution')
axes[0, 0].set_xlabel('Power (W)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].grid(True, alpha=0.3)

# Power vs Time (if Time column exists)
if 'Time' in data.columns:
    # sample() returns rows in shuffled order; a line drawn through shuffled
    # points zigzags across the axis, so restore chronological order first.
    # The fixed random_state keeps the figure identical between runs.
    sample_data = (
        data.sample(min(1000, len(data)), random_state=42)
        .sort_values('Time')
    )
    axes[0, 1].plot(sample_data['Time'], sample_data['Power_W'], alpha=0.7)
    axes[0, 1].set_title('Power Generation Over Time (Sample)')
    axes[0, 1].set_xlabel('Time')
    axes[0, 1].set_ylabel('Power (W)')
    axes[0, 1].grid(True, alpha=0.3)
    axes[0, 1].tick_params(axis='x', rotation=45)
else:
    # Hide the panel rather than leaving an empty, unlabeled frame.
    axes[0, 1].set_visible(False)

# Hourly patterns and per-hour spread (if Hour column exists)
if 'Hour' in data.columns:
    # Mean power for each hour of the day
    hourly_avg = data.groupby('Hour')['Power_W'].mean()
    axes[1, 0].bar(hourly_avg.index, hourly_avg.values)
    axes[1, 0].set_title('Average Power by Hour of Day')
    axes[1, 0].set_xlabel('Hour')
    axes[1, 0].set_ylabel('Average Power (W)')
    axes[1, 0].grid(True, alpha=0.3)

    # Box plot of power restricted to a handful of hours to keep it readable
    sample_hours = data[data['Hour'].isin([6, 9, 12, 15, 18])]
    sns.boxplot(data=sample_hours, x='Hour', y='Power_W', ax=axes[1, 1])
    axes[1, 1].set_title('Power Distribution by Hour')
    axes[1, 1].set_xlabel('Hour')
    axes[1, 1].set_ylabel('Power (W)')
    axes[1, 1].grid(True, alpha=0.3)
else:
    axes[1, 0].set_visible(False)
    axes[1, 1].set_visible(False)

plt.tight_layout()
plt.show()

## 3. Weather Data Analysis

In [None]:
# Histogram of every recognised weather variable that the dataset contains
known_weather = ['Irradiance', 'Temperature', 'RelativeHumidity', 'WindSpeed']
weather_cols = [name for name in data.columns if name in known_weather]

if not weather_cols:
    print("Weather columns not found in the dataset")
else:
    fig, grid = plt.subplots(2, 2, figsize=(15, 10))
    axes = grid.flatten()

    # zip() stops at the shorter sequence, so at most four panels are drawn
    for panel, name in zip(axes, weather_cols):
        values = data[name].dropna()
        panel.hist(values, bins=30, alpha=0.7, edgecolor='black')
        panel.set_title(f'{name} Distribution')
        panel.set_xlabel(name)
        panel.set_ylabel('Frequency')
        panel.grid(True, alpha=0.3)

    # Hide the panels that received no variable
    for panel in axes[len(weather_cols):]:
        panel.set_visible(False)

    plt.tight_layout()
    plt.show()

## 4. Correlation Analysis

In [None]:
# Pairwise correlations between all numeric columns
numeric_cols = data.select_dtypes(include=[np.number]).columns
correlation_matrix = data.loc[:, numeric_cols].corr()

# Mask the upper triangle (diagonal included) so each pair appears only once
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
heatmap_options = dict(
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
)

# Draw the heatmap on a fresh figure
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# List every feature's correlation with Power_W, strongest positive first
if 'Power_W' in correlation_matrix.columns:
    power_column = correlation_matrix['Power_W']
    power_correlations = power_column.sort_values(ascending=False)
    print("\nCorrelations with Power Generation:")
    print(power_correlations)

## 5. Key Relationships

In [None]:
# Scatter plots of key relationships: power against each candidate driver.
# Each entry is (column, x-axis label, panel title), in left-to-right order.
scatter_specs = [
    ('Irradiance', 'Irradiance (W/m²)', 'Power vs Irradiance'),
    ('Temperature', 'Temperature (°C)', 'Power vs Temperature'),
    ('Hour', 'Hour of Day', 'Power vs Hour'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One seeded sample shared by all panels: the figure is reproducible between
# runs and every panel shows the same observations.
sample_data = data.sample(min(1000, len(data)), random_state=42)

for ax, (column, x_label, title) in zip(axes, scatter_specs):
    if column not in data.columns:
        # Hide the panel rather than leaving an empty, unlabeled frame.
        ax.set_visible(False)
        continue
    ax.scatter(sample_data[column], sample_data['Power_W'], alpha=0.6)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Power (W)')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Summary and Insights

In [None]:
# Final summary: dataset size, power statistics, and the relationships that
# were actually measured in the loaded data.
print("DATA EXPLORATION SUMMARY")
print("=" * 50)
print(f"Dataset shape: {data.shape}")
print(f"Number of features: {len(data.columns)}")
print(f"Missing values: {data.isnull().sum().sum()}")

if 'Power_W' in data.columns:
    print("\nPower Generation Statistics:")
    print(f"  Mean: {data['Power_W'].mean():.2f} W")
    print(f"  Std: {data['Power_W'].std():.2f} W")
    print(f"  Min: {data['Power_W'].min():.2f} W")
    print(f"  Max: {data['Power_W'].max():.2f} W")

# Derive the insights from the data instead of printing fixed conclusions:
# hard-coded statements would be reported even when the data (for example the
# random demonstration fallback) does not support them.
print("\nKey Insights:")
numeric_data = data.select_dtypes(include=[np.number])
if 'Power_W' in numeric_data.columns and numeric_data.shape[1] > 1:
    # Correlation of every other numeric feature with power; constant
    # columns yield NaN and are dropped.
    power_corr = (
        numeric_data.drop(columns='Power_W')
        .corrwith(numeric_data['Power_W'])
        .dropna()
    )
    # Rank by absolute strength so strong negative relationships are kept.
    ranked = power_corr.abs().sort_values(ascending=False)
    if ranked.empty:
        print("- No measurable correlation between Power_W and other features")
    else:
        top_feature = ranked.index[0]
        print(f"- Strongest linear relationship with power: {top_feature} "
              f"(r = {power_corr[top_feature]:+.2f})")
        for feature in ranked.index[1:]:
            print(f"- {feature}: r = {power_corr[feature]:+.2f}")
else:
    print("- Not enough numeric data to measure relationships with Power_W")
print("\nNext steps: Data preprocessing and feature engineering")

## Next Steps

1. **Data Preprocessing**: Clean data, handle missing values, engineer features
2. **Feature Engineering**: Create lag features, time-based features, and derived metrics
3. **Model Training**: Train multiple machine learning models
4. **Model Evaluation**: Compare performance and select best model

Continue to the next notebook: `02_data_preprocessing.ipynb`