# Solar Power Prediction - Data Preprocessing

This notebook handles data cleaning, feature engineering, and preparation for machine learning.

## Objectives
- Load and merge multiple data sources
- Handle missing values and outliers
- Engineer time-based and lag features
- Prepare data for modeling

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import json
import warnings
# Suppress every warning category for the rest of the session so cell output
# stays compact. NOTE(review): this also hides deprecation warnings from
# pandas / scikit-learn - consider narrowing the filter when debugging.
warnings.filterwarnings('ignore')

# Visible confirmation that the import cell ran to completion
print("Libraries imported successfully!")

## 1. Data Loading and Integration

In a full implementation, you would load and merge multiple CSV files. For this demo, we load the processed sample file if it exists and otherwise generate a synthetic dataset with the same columns.

In [None]:
def load_sample_data():
    """Load the processed sample CSV, or synthesize a demo dataset if it is absent.

    Returns:
        pandas.DataFrame: The contents of ``../data/processed_solar_sample.csv``
        when that file exists. Otherwise 5000 hourly rows starting at
        2023-01-01 with the columns ``Time``, ``Power_W``, ``Irradiance``,
        ``Temperature``, ``RelativeHumidity``, ``WindSpeed`` and ``Station``.

    Note:
        The fallback path reseeds NumPy's global random generator with 42 so
        the synthetic data is reproducible from run to run.
    """
    try:
        # Keep the try body minimal: only the read can raise FileNotFoundError
        data = pd.read_csv('../data/processed_solar_sample.csv')
    except FileNotFoundError:
        print("Creating sample dataset for demonstration...")
    else:
        print(f"Loaded existing data: {data.shape}")
        return data

    # Create realistic sample data
    np.random.seed(42)
    n_samples = 5000

    # Hourly time axis. An explicit Timedelta step is used instead of the 'H'
    # alias, which recent pandas versions deprecate.
    dates = pd.date_range('2023-01-01', periods=n_samples, freq=pd.Timedelta(hours=1))
    hours = dates.hour.to_numpy()
    days = dates.dayofyear.to_numpy()

    # NOTE: the random draws below happen in a fixed order (irradiance noise,
    # temperature noise, power noise, humidity, wind, station) so that a given
    # seed always yields the same dataset.

    # Solar irradiance with daily and seasonal patterns, floored at zero
    irradiance = np.maximum(
        0,
        800 * np.sin(np.pi * (hours - 6) / 12) * np.sin(np.pi * days / 365)
        + np.random.normal(0, 100, n_samples),
    )

    # Temperature with seasonal variation
    temperature = (
        20
        + 10 * np.sin(2 * np.pi * days / 365)
        + np.random.normal(0, 3, n_samples)
    )

    # Power generation based on irradiance with some noise, floored at zero
    power = np.maximum(
        0,
        irradiance * 3 + temperature * 10 + np.random.normal(0, 200, n_samples),
    )

    # Relative humidity is a percentage: clip the normal draw to [0, 100]
    humidity = np.clip(np.random.normal(60, 15, n_samples), 0, 100)
    wind_speed = np.random.exponential(5, n_samples)
    station = np.random.choice(['Station_A', 'Station_B', 'Station_C'], n_samples)

    return pd.DataFrame({
        'Time': dates,
        'Power_W': power,
        'Irradiance': irradiance,
        'Temperature': temperature,
        'RelativeHumidity': humidity,
        'WindSpeed': wind_speed,
        'Station': station,
    })

# Build the working DataFrame (read from disk or synthesized) and report its layout
data = load_sample_data()
print(f"Dataset shape: {data.shape}", f"Columns: {data.columns.tolist()}", sep="\n")
# Last expression of the cell: renders a preview of the first rows
data.head()

## 2. Data Quality Assessment

In [None]:
# Report missing values, column dtypes and summary statistics for `data`
print("DATA QUALITY ASSESSMENT")
print("=" * 40)

# Per-column count of null entries; keep only the columns that have any
missing_values = data.isnull().sum()
columns_with_gaps = missing_values[missing_values > 0]
total_rows = len(data)

print("Missing values:")
for col, missing in columns_with_gaps.items():
    print(f"  {col}: {missing} ({missing/total_rows*100:.1f}%)")
if columns_with_gaps.empty:
    print("  No missing values found!")

# Column dtypes
print("\nData types:")
print(data.dtypes)

# Summary statistics (rendered as the cell's output)
print("\nBasic statistics:")
data.describe()

## 3. Outlier Detection and Handling

In [None]:
def detect_outliers_iqr(df, column, factor=1.5):
    """Flag rows whose value in ``column`` lies outside the IQR fences.

    Args:
        df: DataFrame to inspect; it is not modified.
        column: Name of the numeric column to test.
        factor: Multiple of the inter-quartile range added beyond Q1 and Q3
            to form the fences. Defaults to 1.5, the value that was
            previously hard-coded.

    Returns:
        tuple: ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is
        the subset of ``df`` rows strictly below ``lower_bound`` or strictly
        above ``upper_bound``.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1

    lower_bound = q1 - factor * spread
    upper_bound = q3 + factor * spread

    # Values exactly on a fence are not treated as outliers
    outliers = df[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

# Summarize IQR outliers for every numeric column of `data`
numeric_cols = data.select_dtypes(include=[np.number]).columns


def _iqr_summary(column):
    """Return the outlier count, share (in percent) and fences for one column."""
    flagged, low, high = detect_outliers_iqr(data, column)
    n_flagged = len(flagged)
    return {
        'count': n_flagged,
        'percentage': n_flagged / len(data) * 100,
        'bounds': (low, high),
    }


outlier_summary = {col: _iqr_summary(col) for col in numeric_cols}

print("OUTLIER ANALYSIS")
print("=" * 40)
for col, info in outlier_summary.items():
    print(f"{col}: {info['count']} outliers ({info['percentage']:.1f}%)")

# Box plot and histogram of Power_W, drawn side by side
if 'Power_W' in data.columns:
    power_values = data['Power_W'].dropna()

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    box_ax, hist_ax = axes

    # Left panel: box plot
    box_ax.boxplot(power_values)
    box_ax.set_title('Power Generation - Box Plot')
    box_ax.set_ylabel('Power (W)')

    # Right panel: histogram
    hist_ax.hist(power_values, bins=50, alpha=0.7, edgecolor='black')
    hist_ax.set_title('Power Generation - Distribution')
    hist_ax.set_xlabel('Power (W)')
    hist_ax.set_ylabel('Frequency')

    # Same light grid on both panels
    for panel in (box_ax, hist_ax):
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 4. Feature Engineering

In [None]:
def create_time_features(df, time_col='Time'):
    """Return a copy of ``df`` with calendar, cyclical and solar features added.

    Args:
        df: Input DataFrame; it is not modified.
        time_col: Name of the timestamp column. It is converted with
            ``pd.to_datetime``. If the column is absent, an unchanged copy of
            ``df`` is returned.

    Returns:
        pandas.DataFrame: Copy of ``df`` with the added columns ``Hour``,
        ``Day``, ``Month``, ``DayOfWeek``, ``DayOfYear``, ``Weekend``,
        ``Hour_sin``/``Hour_cos``, ``Month_sin``/``Month_cos``,
        ``DayOfYear_sin``/``DayOfYear_cos``, ``Season`` and ``SolarElevation``.
    """
    df = df.copy()

    if time_col not in df.columns:
        return df

    # Ensure time column is datetime
    df[time_col] = pd.to_datetime(df[time_col])
    stamps = df[time_col].dt

    # Basic time features
    df['Hour'] = stamps.hour
    df['Day'] = stamps.day
    df['Month'] = stamps.month
    df['DayOfWeek'] = stamps.dayofweek
    df['DayOfYear'] = stamps.dayofyear
    df['Weekend'] = (df['DayOfWeek'] >= 5).astype(int)  # dayofweek: Monday=0 ... Sunday=6

    # Cyclical encoding so that e.g. hour 23 and hour 0 end up close together
    for name, period in (('Hour', 24), ('Month', 12), ('DayOfYear', 365)):
        angle = 2 * np.pi * df[name] / period
        df[f'{name}_sin'] = np.sin(angle)
        df[f'{name}_cos'] = np.cos(angle)

    # Season
    df['Season'] = df['Month'].map({
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Autumn', 10: 'Autumn', 11: 'Autumn'
    })

    # Simple solar elevation (approximation): a daily half-sine that is
    # positive between 06:00 and 18:00, scaled by a yearly half-sine.
    # Each factor is clipped at zero on its own. A full-period yearly sine
    # would turn negative in the second half of the year, which zeroes the
    # daytime values and makes the night-time product positive.
    daily = np.sin(np.pi * (df['Hour'] - 6) / 12).clip(lower=0)
    seasonal = np.sin(np.pi * df['DayOfYear'] / 365).clip(lower=0)
    df['SolarElevation'] = daily * seasonal

    return df

# Derive the calendar/solar columns and list the ones that were added
data_with_time = create_time_features(data)
original_columns = set(data.columns)
new_time_columns = [name for name in data_with_time.columns if name not in original_columns]

print(f"Added time features. New shape: {data_with_time.shape}")
print(f"New columns: {new_time_columns}")

In [None]:
def create_lag_features(df, target_col='Power_W', lags=(1, 24)):
    """Return a copy of ``df`` with lagged and rolling features of ``target_col``.

    Rows are assumed to already be in chronological order; lags are counted in
    rows, not in time units.

    Args:
        df: Input DataFrame; it is not modified.
        target_col: Column to derive the features from. If it is absent, an
            unchanged copy of ``df`` is returned.
        lags: Iterable of row offsets; one ``{target_col}_lag_{lag}`` column
            is added per entry.

    Returns:
        pandas.DataFrame: Copy of ``df`` with the lag columns plus
        ``{target_col}_rolling_mean_6`` and ``{target_col}_rolling_std_6``.
        The rolling statistics cover up to the six rows *before* the current
        one, so the leading rows contain NaN.
    """
    df = df.copy()

    if target_col not in df.columns:
        return df

    series = df[target_col]
    for lag in lags:
        df[f'{target_col}_lag_{lag}'] = series.shift(lag)

    # Rolling statistics over past values only. Shifting by one row keeps the
    # current row's target out of its own features (no target leakage).
    history = series.shift(1).rolling(window=6, min_periods=1)
    df[f'{target_col}_rolling_mean_6'] = history.mean()
    df[f'{target_col}_rolling_std_6'] = history.std()

    return df

# Add the lag/rolling columns on top of the time features and list them
data_with_lags = create_lag_features(data_with_time)
history_columns = [
    name for name in data_with_lags.columns
    if any(tag in name for tag in ('lag', 'rolling'))
]

print(f"Added lag features. New shape: {data_with_lags.shape}")
print(f"Lag columns: {history_columns}")

In [None]:
def _safe_ratio(numerator, denominator, min_denominator=1e-6):
    """Divide two Series element-wise, returning 0 where the denominator is <= min_denominator."""
    near_zero = denominator <= min_denominator
    ratio = numerator / denominator.mask(near_zero)
    return ratio.mask(near_zero, 0.0)


def create_engineered_features(df):
    """Return a copy of ``df`` with domain-specific derived columns.

    Each feature is added only when its source columns are present:

    * ``Power_Density`` - ``Power_W / Irradiance``. This is computed from
      ``Power_W``; when ``Power_W`` is the prediction target the column must
      not be used as a model input.
    * ``Temp_Efficiency`` - ``1 - 0.004 * (Temperature - 25)``.
    * ``Clear_Sky_Index`` - ``Irradiance / (1000 * SolarElevation)``.
    * ``Station_encoded`` - integer code per ``Station`` label.

    Both ratios are set to 0 where the denominator is zero (for example
    night-time rows) rather than dividing by a tiny epsilon, which would
    produce values around 1e8 and distort later scaling.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        pandas.DataFrame: Copy of ``df`` with the applicable columns added.
    """
    df = df.copy()
    columns = set(df.columns)

    # Power density (efficiency metric)
    if {'Power_W', 'Irradiance'} <= columns:
        df['Power_Density'] = _safe_ratio(df['Power_W'], df['Irradiance'])

    # Temperature efficiency factor
    if 'Temperature' in columns:
        df['Temp_Efficiency'] = 1 - 0.004 * (df['Temperature'] - 25)  # Typical solar panel temp coefficient

    # Clear sky index (if we had theoretical max irradiance)
    if {'Irradiance', 'SolarElevation'} <= columns:
        theoretical_max = 1000 * df['SolarElevation']  # Simplified
        df['Clear_Sky_Index'] = _safe_ratio(df['Irradiance'], theoretical_max)

    # Station encoding (if categorical). Codes follow the sorted label order
    # so the mapping does not depend on which station appears first.
    if 'Station' in columns:
        labels = sorted(df['Station'].dropna().unique())
        station_mapping = {station: code for code, station in enumerate(labels)}
        df['Station_encoded'] = df['Station'].map(station_mapping)

    return df

# Add the domain-specific columns and report which of them are present
data_engineered = create_engineered_features(data_with_lags)
engineered_names = ('Power_Density', 'Temp_Efficiency', 'Clear_Sky_Index', 'Station_encoded')
present_engineered = [name for name in data_engineered.columns if name in engineered_names]

print(f"Added engineered features. Final shape: {data_engineered.shape}")
print(f"Engineered columns: {present_engineered}")

## 5. Data Cleaning and Preparation

In [None]:
# Drop every row that still contains a null (the lag features introduce them)
print("HANDLING MISSING VALUES")
print("=" * 40)

rows_before = len(data_engineered)
missing_before = data_engineered.isna().sum().sum()
print(f"Missing values before cleaning: {missing_before}")

# Any row with at least one null in any column is removed
data_clean = data_engineered.dropna()
rows_dropped = rows_before - len(data_clean)

missing_after = data_clean.isna().sum().sum()
print(f"Missing values after cleaning: {missing_after}")
print(f"Rows removed: {rows_dropped}")
print(f"Final dataset shape: {data_clean.shape}")

In [None]:
# Remove extreme outliers (optional)
def remove_outliers(df, columns, method='iqr', factor=1.5):
    """Return a copy of ``df`` without rows outside the IQR fences of ``columns``.

    Columns are processed one after another, so the quartiles of each column
    are computed on the rows that survived the previous columns. Names in
    ``columns`` that are not in ``df`` are skipped. Rows whose value is NaN
    fail the range check and are removed as well. One line per processed
    column is printed with the number of rows removed.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Iterable of column names to filter on.
        method: Outlier rule. Only ``'iqr'`` is implemented.
        factor: Multiple of the inter-quartile range that defines the fences.

    Returns:
        pandas.DataFrame: The filtered copy.

    Raises:
        ValueError: If ``method`` is anything other than ``'iqr'``. The
            argument used to be ignored silently.
    """
    if method != 'iqr':
        raise ValueError(
            f"Unsupported outlier method {method!r}; only 'iqr' is implemented"
        )

    df_clean = df.copy()

    for col in columns:
        if col not in df_clean.columns:
            continue

        q1 = df_clean[col].quantile(0.25)
        q3 = df_clean[col].quantile(0.75)
        spread = q3 - q1
        lower_bound = q1 - factor * spread
        upper_bound = q3 + factor * spread

        # Inclusive on both ends: values exactly on a fence are kept
        keep = df_clean[col].between(lower_bound, upper_bound)
        removed = len(df_clean) - int(keep.sum())
        df_clean = df_clean[keep]

        print(f"{col}: Removed {removed} outliers")

    return df_clean

# Filter the key measurement columns. factor=2.0 gives wider fences than the
# 1.5 default, so fewer rows are discarded.
outlier_columns = ['Power_W', 'Irradiance', 'Temperature']
data_final = remove_outliers(df=data_clean, columns=outlier_columns, factor=2.0)

print()
print(f"Final dataset shape after outlier removal: {data_final.shape}")

## 6. Feature Selection and Preparation

In [None]:
# Define features and target
target_column = 'Power_W'
exclude_columns = ['Time', 'Station', 'Season']  # Non-numeric or identifier columns

# Power_Density is computed as Power_W / Irradiance, i.e. directly from the
# target. Keeping it as an input would let a model recover the target
# (Power_Density * Irradiance), so it is excluded from the feature set.
leakage_columns = ['Power_Density']

# Select feature columns
not_features = {target_column, *exclude_columns, *leakage_columns}
feature_columns = [col for col in data_final.columns if col not in not_features]

print(f"Target variable: {target_column}")
print(f"Number of features: {len(feature_columns)}")
print(f"Features: {feature_columns}")

# Prepare feature matrix and target vector
X = data_final[feature_columns]
y = data_final[target_column]

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

In [None]:
# Split data into train and test sets.
# The rows form an hourly time series and the features include lagged and
# rolling values of the target. A shuffled split would put neighbouring hours
# into both sets and leak test information into training, so the split is
# chronological: the first 80% of rows train, the last 20% test.
# NOTE(review): this assumes the rows are in time order - true for the
# generated sample; confirm for data loaded from CSV.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Training target: {y_train.shape}")
print(f"Test target: {y_test.shape}")

In [None]:
# Feature scaling: learn mean/variance on the training split only, then apply
# the same transformation to both splits
scaler = StandardScaler().fit(X_train)


def _as_scaled_frame(features):
    """Scale `features` with the fitted scaler, keeping column labels and index."""
    return pd.DataFrame(
        scaler.transform(features),
        columns=features.columns,
        index=features.index,
    )


X_train_scaled = _as_scaled_frame(X_train)
X_test_scaled = _as_scaled_frame(X_test)

print("Feature scaling completed!")
print(f"Scaled training features shape: {X_train_scaled.shape}")
print(f"Scaled test features shape: {X_test_scaled.shape}")

## 7. Save Processed Data

In [None]:
# Save processed dataset, feature metadata and the scaled train/test splits
from pathlib import Path

output_dir = Path('../data')
# Neither DataFrame.to_csv nor open() creates missing parent directories
output_dir.mkdir(parents=True, exist_ok=True)

processed_data = data_final.copy()
processed_data.to_csv(output_dir / 'processed_solar_data.csv', index=False)

# Save feature information. The categorical list is derived from the actual
# feature set, so it stays correct when Station_encoded is not a feature.
categorical_features = [col for col in feature_columns if col == 'Station_encoded']
feature_info = {
    'target': target_column,
    'features': feature_columns,
    'categorical': categorical_features,  # Categorical features that were encoded
    'numerical': [col for col in feature_columns if col not in categorical_features],
    'n_samples': len(processed_data),
    'n_features': len(feature_columns)
}

with open(output_dir / 'feature_info.json', 'w', encoding='utf-8') as f:
    json.dump(feature_info, f, indent=2)

# Save train/test splits (scaled features with the unscaled target appended)
train_data = pd.concat([X_train_scaled, y_train], axis=1)
test_data = pd.concat([X_test_scaled, y_test], axis=1)

train_data.to_csv(output_dir / 'train_data.csv', index=False)
test_data.to_csv(output_dir / 'test_data.csv', index=False)

print("Data saved successfully!")
print("Files created:")
print("  - ../data/processed_solar_data.csv")
print("  - ../data/feature_info.json")
print("  - ../data/train_data.csv")
print("  - ../data/test_data.csv")

## 8. Data Summary

In [None]:
# Final report: dataset sizes, feature counts by name pattern, split sizes and
# target statistics


def _count_features(keywords):
    """Count feature columns whose name contains at least one of `keywords`."""
    return len([col for col in feature_columns if any(key in col for key in keywords)])


total_samples = len(processed_data)
train_share = len(X_train) / total_samples * 100
test_share = len(X_test) / total_samples * 100

print("DATA PREPROCESSING SUMMARY")
print("=" * 50)
print(f"Original dataset shape: {data.shape}")
print(f"Final dataset shape: {processed_data.shape}")
print(f"Features created: {len(feature_columns)}")
print(f"Target variable: {target_column}")

print("\nFeature categories:")
print(f"  - Time features: {_count_features(['Hour', 'Day', 'Month', 'sin', 'cos', 'Solar'])}")
print(f"  - Weather features: {_count_features(['Irradiance', 'Temperature', 'Humidity', 'Wind'])}")
print(f"  - Lag features: {_count_features(['lag', 'rolling'])}")
print(f"  - Engineered features: {_count_features(['Density', 'Efficiency', 'Index', 'encoded'])}")

print("\nData splits:")
print(f"  - Training: {len(X_train)} samples ({train_share:.1f}%)")
print(f"  - Testing: {len(X_test)} samples ({test_share:.1f}%)")

print("\nTarget variable statistics:")
for label, value in (('Mean', y.mean()), ('Std', y.std()), ('Min', y.min()), ('Max', y.max())):
    print(f"  - {label}: {value:.2f}")

print("\nData is ready for model training!")

## Next Steps

The data is now preprocessed and ready for machine learning. The next steps are:

1. **Model Training**: Train multiple algorithms (Random Forest, Gradient Boosting, etc.)
2. **Model Evaluation**: Compare performance using appropriate metrics
3. **Hyperparameter Tuning**: Optimize the best performing models
4. **Feature Importance**: Analyze which features are most predictive

Continue to the next notebook: `03_model_training.ipynb`