# Solar Power Generation Data Exploration

This notebook explores the solar farm data to understand the structure, quality, and relationships in the dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from datetime import datetime
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below; narrow the
# filter if those should stay visible.
warnings.filterwarnings('ignore')

# Set plotting style
# Applies to every figure drawn in later cells: a matplotlib style sheet plus the
# seaborn "husl" colour palette.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Data directory.
# Defaults to the original upload location; set the SOLAR_DATA_DIR environment
# variable to point the notebook at a different copy of the data without editing
# this cell.
DATA_DIR = os.environ.get('SOLAR_DATA_DIR', '/home/ubuntu/upload/')

## 1. Data File Inventory

In [None]:
# Get all data files.
# glob returns matches in arbitrary order, so the combined list is sorted: later
# cells take the "first N" files and the "first" station, and those picks should
# be the same on every run.
all_files = sorted(
    path
    for pattern in ('*.csv', '*.xlsx', '*.ttl')
    for path in glob.glob(os.path.join(DATA_DIR, pattern))
)

print(f"Total files: {len(all_files)}")

# Categorize files.
# Keywords are matched against the file name only (not the full path), so a
# directory name that happens to contain e.g. "Wind" cannot reclassify every file.
WEATHER_KEYWORDS = ('Temperature', 'Humidity', 'Irradiance', 'Wind', 'Visibility',
                    'SeaLevelPressure', 'RelativeHumidity', 'Rainfall')


def _has_weather_keyword(path):
    """Return True when the file name contains one of WEATHER_KEYWORDS."""
    name = os.path.basename(path)
    return any(keyword in name for keyword in WEATHER_KEYWORDS)


def _is_inverter_file(path):
    """Return True when the file name contains 'Inverter'."""
    return 'Inverter' in os.path.basename(path)


# Power generation = CSV files that are neither weather nor inverter data.
power_generation_files = [
    f for f in all_files
    if f.endswith('.csv') and not _has_weather_keyword(f) and not _is_inverter_file(f)
]

inverter_files = [f for f in all_files if _is_inverter_file(f)]

weather_files = [f for f in all_files if _has_weather_keyword(f)]

metadata_files = [f for f in all_files if f.endswith('.ttl')]

print(f"\nPower generation files: {len(power_generation_files)}")
print(f"Inverter files: {len(inverter_files)}")
print(f"Weather files: {len(weather_files)}")
print(f"Metadata files: {len(metadata_files)}")

## 2. Power Generation Data Analysis

In [None]:
# Sample a few power generation files to understand structure
sample_power_files = power_generation_files[:5]

for file in sample_power_files:
    print(f"\n=== {os.path.basename(file)} ===")
    df = pd.read_csv(file)
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    # A header-only file, or one without a 'Time' column, would otherwise raise
    # here and abort the preview of the remaining files.
    if 'Time' in df.columns and not df.empty:
        print(f"Date range: {df['Time'].iloc[0]} to {df['Time'].iloc[-1]}")
    else:
        print("Date range: n/a (no rows or no 'Time' column)")
    print(f"Sample data:")
    print(df.head(3))

In [None]:
# Load and combine all power generation data
def load_power_data(file_path):
    """Load one power generation CSV and tag each row with its station name.

    Parameters
    ----------
    file_path : str
        Path to a CSV file that contains a 'Time' column.

    Returns
    -------
    pandas.DataFrame
        The file's contents with 'Time' converted by ``pd.to_datetime`` and an
        added 'station' column holding the file name without its extension.

    Raises
    ------
    KeyError
        If the file has no 'Time' column.
    """
    df = pd.read_csv(file_path)
    df['Time'] = pd.to_datetime(df['Time'])
    # Strip only the trailing extension; a plain str.replace('.csv', '') would
    # also remove ".csv" occurring in the middle of the file name.
    df['station'] = os.path.splitext(os.path.basename(file_path))[0]
    return df

# Read the first ten power files; a file that fails to load is reported and skipped.
power_data_list = []
for csv_path in power_generation_files[:10]:
    try:
        station_frame = load_power_data(csv_path)
    except Exception as err:
        print(f"Error loading {csv_path}: {err}")
        continue
    power_data_list.append(station_frame)
    print(f"Loaded {os.path.basename(csv_path)}: {station_frame.shape[0]} records")

# Stack the per-station frames into one table (only when something was loaded).
if len(power_data_list) > 0:
    combined_power_data = pd.concat(power_data_list, ignore_index=True)
    print(f"\nCombined power data shape: {combined_power_data.shape}")
    time_column = combined_power_data['Time']
    print(f"Date range: {time_column.min()} to {time_column.max()}")
    print(f"Unique stations: {combined_power_data['station'].nunique()}")

## 3. Weather Data Analysis

In [None]:
# Analyze weather data structure
weather_data_dict = {}

for file in weather_files:
    # File names are expected to look like "<WeatherType>_<Year>.<ext>".
    stem = os.path.splitext(os.path.basename(file))[0]
    name_parts = stem.split('_')
    if len(name_parts) < 2:
        # Indexing name_parts[1] here would raise IndexError outside the try
        # block below and stop the whole cell; report the file and move on.
        print(f"Skipping {file}: file name is not of the form <type>_<year>")
        continue
    weather_type, year = name_parts[0], name_parts[1]

    try:
        if file.endswith('.xlsx'):
            df = pd.read_excel(file)
        else:
            df = pd.read_csv(file)

        df['Time'] = pd.to_datetime(df['Time'])

        key = f"{weather_type}_{year}"
        if key in weather_data_dict:
            # Two files mapped to the same key (e.g. a .csv and an .xlsx for the
            # same type/year); the later one wins, so make that visible.
            print(f"Note: {key} already loaded; replacing it with {os.path.basename(file)}")
        weather_data_dict[key] = df

        print(f"{key}: {df.shape[0]} records, columns: {list(df.columns)}")

    except Exception as e:
        print(f"Error loading {file}: {e}")

print(f"\nLoaded {len(weather_data_dict)} weather datasets")

In [None]:
# Build one time-ordered frame per weather type from the per-file datasets.
weather_types = ['Temperature', 'Irradiance', 'RelativeHumidity', 'Wind', 'Rainfall', 'SeaLevelPressure', 'Visibility']
combined_weather = {}

for weather_type in weather_types:
    # Substring match on the dataset key: every key containing the type name is
    # grouped under that type.
    type_data = [frame for key, frame in weather_data_dict.items() if weather_type in key]
    if not type_data:
        continue

    combined_df = (
        pd.concat(type_data, ignore_index=True)
        .sort_values('Time')
        .reset_index(drop=True)
    )
    combined_weather[weather_type] = combined_df
    print(f"{weather_type}: {combined_df.shape[0]} records from {combined_df['Time'].min()} to {combined_df['Time'].max()}")

## 4. Data Quality Assessment

In [None]:
# Report null counts, dtypes and summary statistics for the combined power data.
# The guard skips the report when no power file was loaded earlier.
if 'combined_power_data' in locals():
    print("=== Power Data Quality ===")
    # Each section is a heading plus a lazily built report, so the heading is
    # always printed before its report is computed.
    quality_sections = (
        ("Missing values:", lambda: combined_power_data.isnull().sum()),
        ("\nData types:", lambda: combined_power_data.dtypes),
        ("\nBasic statistics:", lambda: combined_power_data.describe()),
    )
    for heading, build_report in quality_sections:
        print(heading)
        print(build_report())

In [None]:
# Per weather type: total null count, covered time span, row count and the
# number of repeated timestamps.
print("=== Weather Data Quality ===")
for weather_type in combined_weather:
    frame = combined_weather[weather_type]
    timestamps = frame['Time']

    print(f"\n{weather_type}:")
    print(f"  Missing values: {frame.isnull().sum().sum()}")
    print(f"  Date range: {timestamps.min()} to {timestamps.max()}")
    print(f"  Records: {frame.shape[0]}")

    # Rows whose timestamp already appeared earlier in the frame
    duplicates = timestamps.duplicated().sum()
    print(f"  Duplicate timestamps: {duplicates}")

## 5. Data Visualization

In [None]:
# Four-panel overview of one station: hourly profile, first 672 rows as a time
# series, power histogram and monthly averages. The figure is also saved as a PNG.
if 'combined_power_data' in locals():
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    (ax_hourly, ax_series), (ax_hist, ax_monthly) = axes

    # Use the station of the first row as the example station
    sample_station = combined_power_data['station'].iloc[0]
    is_sample_station = combined_power_data['station'] == sample_station
    station_data = combined_power_data[is_sample_station].copy()

    # Top-left: mean power per hour of day
    station_data['hour'] = station_data['Time'].dt.hour
    hourly_avg = station_data.groupby('hour')['power(W)'].mean()
    ax_hourly.plot(hourly_avg.index, hourly_avg.values)
    ax_hourly.set(
        title=f'Average Hourly Power Generation - {sample_station}',
        xlabel='Hour of Day',
        ylabel='Power (W)',
    )

    # Top-right: first 672 rows (7 days * 24 h * 4 readings per hour at 15-min spacing)
    sample_week = station_data.head(672)
    ax_series.plot(sample_week['Time'], sample_week['power(W)'])
    ax_series.set(
        title='Power Generation Time Series (Sample Week)',
        xlabel='Time',
        ylabel='Power (W)',
    )
    ax_series.tick_params(axis='x', rotation=45)

    # Bottom-left: distribution of power readings
    ax_hist.hist(station_data['power(W)'], bins=50, alpha=0.7)
    ax_hist.set(
        title='Power Generation Distribution',
        xlabel='Power (W)',
        ylabel='Frequency',
    )

    # Bottom-right: mean power per calendar month
    station_data['month'] = station_data['Time'].dt.month
    monthly_avg = station_data.groupby('month')['power(W)'].mean()
    ax_monthly.bar(monthly_avg.index, monthly_avg.values)
    ax_monthly.set(
        title='Average Monthly Power Generation',
        xlabel='Month',
        ylabel='Power (W)',
    )

    plt.tight_layout()
    plt.savefig('/home/ubuntu/power_generation_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# One panel per weather type on a 2x3 grid; the figure is also saved as a PNG.
if combined_weather:
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes = axes.flatten()

    # zip stops after the six available panels, so at most six types are drawn
    for panel, (weather_type, df) in zip(axes, combined_weather.items()):
        # Keep every 60th row to thin out the line
        sample_df = df.iloc[::60].copy()

        # First column that is not the timestamp
        value_columns = [col for col in df.columns if col != 'Time']
        column_name = value_columns[0]

        panel.plot(sample_df['Time'], sample_df[column_name], alpha=0.7)
        panel.set(
            title=f'{weather_type} Over Time',
            xlabel='Time',
            ylabel=column_name,
        )
        panel.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.savefig('/home/ubuntu/weather_patterns.png', dpi=300, bbox_inches='tight')
    plt.show()

## 6. Correlation Analysis

In [None]:
# Create a merged dataset for correlation analysis
if 'combined_power_data' in locals() and combined_weather:
    # Sample one station. The station is picked here (first row's station) rather
    # than reusing a variable set inside the plotting cell, so this cell also
    # works when the plotting cell was skipped.
    sample_station = combined_power_data['station'].iloc[0]
    sample_station_data = combined_power_data[combined_power_data['station'] == sample_station].copy()

    # Start from the power readings; one weather column is left-joined per type.
    merged_data = sample_station_data[['Time', 'power(W)']].copy()

    for weather_type, weather_df in combined_weather.items():
        # First column that is not the timestamp holds the measurement.
        column_name = [col for col in weather_df.columns if col != 'Time'][0]

        # Resample to 15-minute means to match the power data.
        # - Only the measurement column is averaged, so any extra non-numeric
        #   column in the weather frame cannot make .mean() fail.
        # - '15min' is the supported spelling of the deprecated '15T' alias.
        weather_df_resampled = (
            weather_df.set_index('Time')[column_name]
            .resample('15min')
            .mean()
            .rename(weather_type)
            .reset_index()
        )

        # Merge with power data; unmatched timestamps stay as NaN.
        merged_data = pd.merge(merged_data, weather_df_resampled, on='Time', how='left')

    print(f"Merged dataset shape: {merged_data.shape}")
    print(f"Missing values in merged data:")
    print(merged_data.isnull().sum())

    # Calculate correlations
    correlation_matrix = merged_data.select_dtypes(include=[np.number]).corr()

    # Plot correlation heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, linewidths=0.5)
    plt.title('Correlation Matrix: Power Generation vs Weather Variables')
    plt.tight_layout()
    plt.savefig('/home/ubuntu/correlation_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Print correlations with power generation
    power_correlations = correlation_matrix['power(W)'].sort_values(ascending=False)
    print("\nCorrelations with Power Generation:")
    print(power_correlations)

## 7. Data Summary and Insights

In [None]:
# Text summary of the exploration. Sections 2-4 are printed only when the
# corresponding data was produced by earlier cells.
print("=== DATA EXPLORATION SUMMARY ===")
print(f"\n1. Dataset Overview:")
print(f"   - Power generation files: {len(power_generation_files)}")
print(f"   - Weather data types: {len(combined_weather)}")
# NOTE(review): the next three lines are fixed text, not computed from the data.
print(f"   - Time range: 2021-2023")
print(f"   - Power data frequency: 15 minutes")
print(f"   - Weather data frequency: 1 minute")

if 'combined_power_data' in locals():
    print(f"\n2. Power Generation Data:")
    print(f"   - Total records: {len(combined_power_data):,}")
    print(f"   - Unique stations: {combined_power_data['station'].nunique()}")
    print(f"   - Average power: {combined_power_data['power(W)'].mean():.2f} W")
    print(f"   - Max power: {combined_power_data['power(W)'].max():.2f} W")

if combined_weather:
    print(f"\n3. Weather Data:")
    for weather_type, df in combined_weather.items():
        column_name = [col for col in df.columns if col != 'Time'][0]
        print(f"   - {weather_type}: {len(df):,} records, avg: {df[column_name].mean():.2f}")

if 'correlation_matrix' in locals():
    print(f"\n4. Key Correlations with Power Generation:")
    # Read the column straight from correlation_matrix (the name the guard
    # checks) and rank by absolute value: a strong negative correlation is as
    # "key" as a strong positive one, and a signed sort would never list it.
    weather_correlations = correlation_matrix['power(W)'].drop('power(W)').dropna()
    strongest = weather_correlations.abs().sort_values(ascending=False).head(3).index
    top_correlations = weather_correlations.loc[strongest]
    for var, corr in top_correlations.items():
        print(f"   - {var}: {corr:.3f}")

print(f"\n5. Data Quality Issues:")
print(f"   - Weather data needs resampling to match power data frequency")
print(f"   - Some missing values in weather data")
print(f"   - Need to handle timezone and daylight patterns")
print(f"   - Multiple stations need aggregation strategy")