In [None]:
# Plot each numeric column from aew1 raw and clean data, then aew2 raw and clean data for comparison

# Install plotly and nbformat, upgrading them if they are already installed
!pip install plotly nbformat --upgrade

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Probe for plotly; when the import fails only the matplotlib figures are drawn
try:
    import plotly.express as px
    import plotly.graph_objects as go
except ImportError:
    print("Plotly not available. Falling back to matplotlib plots only.")
    plotly_available = False
else:
    plotly_available = True

# Directory that holds the processed CSV exports
processed_dir = "../Processed_Data"

# One entry per dataset: (dataset name, [(label, csv path) for raw and clean])
datasets = [
    (
        name,
        [
            (kind, os.path.join(processed_dir, f'{name}_{kind}_timeseries.csv'))
            for kind in ('raw', 'clean')
        ],
    )
    for name in ('aew1', 'aew2')
]

# Function to plot a single column from raw and clean data
def plot_single_column(raw_df, clean_df, dataset_name, time_col, col):
    """Plot one column of the raw and clean frames against time.

    Draws a static matplotlib figure and, when the module-level flag
    ``plotly_available`` is true, an interactive Plotly figure holding the
    same two traces.

    The input frames are never modified: timestamps are parsed into local
    Series rather than written back into ``raw_df`` / ``clean_df``, so the
    caller's data is unchanged no matter how many columns are plotted.

    Args:
        raw_df: DataFrame with the raw time series.
        clean_df: DataFrame with the cleaned time series.
        dataset_name: Label used in titles, legend entries and messages.
        time_col: Name of the timestamp column present in both frames.
        col: Name of the column to plot, present in both frames.

    Returns:
        None. If either frame is empty or lacks ``time_col`` / ``col`` a
        message is printed and nothing is drawn.
    """
    frames = (raw_df, clean_df)
    if any(df.empty or time_col not in df.columns or col not in df.columns
           for df in frames):
        print(f"Cannot plot {col} for {dataset_name}: Missing time column or data in raw/clean.")
        return

    def _time_and_values(df):
        # Parse into a new Series; unparseable timestamps become NaT
        times = pd.to_datetime(df[time_col], errors='coerce')
        # Keep only rows whose timestamp parsed (same rows dropna would keep)
        keep = times.notna().to_numpy()
        # 1313 / 1414 are placeholder codes (noted as E13 / E14) and must not
        # be drawn as real measurements
        values = df[col][keep].replace([1313, 1414], np.nan)
        return times[keep], values

    raw_time, raw_data = _time_and_values(raw_df)
    clean_time, clean_data = _time_and_values(clean_df)

    # Matplotlib static plot
    plt.figure(figsize=(12, 6))
    plt.plot(raw_time, raw_data, label=f'{dataset_name}_raw - {col}', color='blue', alpha=0.5)
    plt.plot(clean_time, clean_data, label=f'{dataset_name}_clean - {col}', color='orange', linestyle='--')
    plt.title(f'{dataset_name} - {col} (Raw vs Clean)')
    plt.xlabel('Time')
    plt.ylabel(col)
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Plotly interactive plot (if available)
    if plotly_available:
        # Best effort: a Plotly failure must not abort the remaining columns,
        # the matplotlib figure above has already been shown
        try:
            fig = go.Figure()
            fig.add_trace(go.Scatter(x=raw_time, y=raw_data, name=f'{dataset_name}_raw - {col}', line=dict(color='blue', width=2)))
            fig.add_trace(go.Scatter(x=clean_time, y=clean_data, name=f'{dataset_name}_clean - {col}', line=dict(color='orange', width=2, dash='dash')))
            fig.update_layout(
                title=f'Interactive {dataset_name} - {col} (Raw vs Clean)',
                xaxis_title='Time',
                yaxis_title=col,
                legend=dict(x=0, y=1),
                hovermode='x unified'
            )
            fig.show()
        except Exception as e:
            print(f"Failed to create Plotly plot for {dataset_name} ({col}): {e}. Using matplotlib only.")

# Walk through every configured dataset and plot raw vs clean per column
for dataset_name, (raw_info, clean_info) in datasets:
    raw_label, raw_file = raw_info
    clean_label, clean_file = clean_info

    # Load each CSV when present; a missing file is represented by an empty frame
    if os.path.exists(raw_file):
        raw_df = pd.read_csv(raw_file)
    else:
        print(f"{raw_file} not found. Skipping {dataset_name}_raw...")
        raw_df = pd.DataFrame()

    if os.path.exists(clean_file):
        clean_df = pd.read_csv(clean_file)
    else:
        print(f"{clean_file} not found. Skipping {dataset_name}_clean...")
        clean_df = pd.DataFrame()

    # Both sides are required for a comparison plot
    if raw_df.empty or clean_df.empty:
        print(f"Skipping {dataset_name} due to missing raw or clean data.")
        continue

    print(f"\nPlotting data for {dataset_name} (raw vs clean)...")

    # Collect columns shared by both frames whose dtype is float64/int64 on
    # both sides; the 'time' column is the x-axis and is left out
    plain_numeric = ('float64', 'int64')
    numeric_cols = []
    for candidate in raw_df.columns:
        if candidate not in clean_df.columns or candidate == 'time':
            continue
        if (raw_df[candidate].dtype in plain_numeric and
                clean_df[candidate].dtype in plain_numeric):
            numeric_cols.append(candidate)

    # One pair of figures per shared numeric column
    for col in numeric_cols:
        plot_single_column(raw_df, clean_df, dataset_name, 'time', col)


In [None]:
# Plot each numeric column from energy_overview_raw_timeseries and energy_overview_clean_timeseries for comparison

# Install plotly and nbformat, upgrading them if they are already installed
!pip install plotly nbformat --upgrade

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Probe for plotly; when the import fails only the matplotlib figures are drawn
try:
    import plotly.express as px
    import plotly.graph_objects as go
except ImportError:
    print("Plotly not available. Falling back to matplotlib plots only.")
    plotly_available = False
else:
    plotly_available = True

# Directory that holds the processed CSV exports
processed_dir = "../Processed_Data"

# One entry per dataset: (dataset name, [(label, csv path) for raw and clean])
datasets = [
    (
        'energy_overview',
        [
            (kind, os.path.join(processed_dir, f'energy_overview_{kind}_timeseries.csv'))
            for kind in ('raw', 'clean')
        ],
    )
]

# Function to plot a single column from raw and clean data
def plot_single_column(raw_df, clean_df, dataset_name, time_col, col):
    """Plot one column of the raw and clean frames against time.

    Draws a static matplotlib figure and, when the module-level flag
    ``plotly_available`` is true, an interactive Plotly figure holding the
    same two traces.

    The input frames are never modified: timestamps are parsed into local
    Series rather than written back into ``raw_df`` / ``clean_df``, so the
    caller's data is unchanged no matter how many columns are plotted.

    Args:
        raw_df: DataFrame with the raw time series.
        clean_df: DataFrame with the cleaned time series.
        dataset_name: Label used in titles, legend entries and messages.
        time_col: Name of the timestamp column present in both frames.
        col: Name of the column to plot, present in both frames.

    Returns:
        None. If either frame is empty or lacks ``time_col`` / ``col`` a
        message is printed and nothing is drawn.
    """
    frames = (raw_df, clean_df)
    if any(df.empty or time_col not in df.columns or col not in df.columns
           for df in frames):
        print(f"Cannot plot {col} for {dataset_name}: Missing time column or data in raw/clean.")
        return

    def _time_and_values(df):
        # Parse into a new Series; unparseable timestamps become NaT
        times = pd.to_datetime(df[time_col], errors='coerce')
        # Keep only rows whose timestamp parsed (same rows dropna would keep)
        keep = times.notna().to_numpy()
        # Values that cannot be read as numbers become NaN
        values = pd.to_numeric(df[col][keep], errors='coerce')
        return times[keep], values

    raw_time, raw_data = _time_and_values(raw_df)
    clean_time, clean_data = _time_and_values(clean_df)

    # Matplotlib static plot
    plt.figure(figsize=(12, 6))
    plt.plot(raw_time, raw_data, label=f'{dataset_name}_raw - {col}', color='blue', alpha=0.5)
    plt.plot(clean_time, clean_data, label=f'{dataset_name}_clean - {col}', color='orange', linestyle='--')
    plt.title(f'{dataset_name} - {col} (Raw vs Clean)')
    plt.xlabel('Time')
    plt.ylabel(col)
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Plotly interactive plot (if available)
    if plotly_available:
        # Best effort: a Plotly failure must not abort the remaining columns,
        # the matplotlib figure above has already been shown
        try:
            fig = go.Figure()
            fig.add_trace(go.Scatter(x=raw_time, y=raw_data, name=f'{dataset_name}_raw - {col}', line=dict(color='blue', width=2)))
            fig.add_trace(go.Scatter(x=clean_time, y=clean_data, name=f'{dataset_name}_clean - {col}', line=dict(color='orange', width=2, dash='dash')))
            fig.update_layout(
                title=f'Interactive {dataset_name} - {col} (Raw vs Clean)',
                xaxis_title='Time',
                yaxis_title=col,
                legend=dict(x=0, y=1),
                hovermode='x unified'
            )
            fig.show()
        except Exception as e:
            print(f"Failed to create Plotly plot for {dataset_name} ({col}): {e}. Using matplotlib only.")

# Walk through every configured dataset and plot raw vs clean per column
for dataset_name, (raw_info, clean_info) in datasets:
    raw_label, raw_file = raw_info
    clean_label, clean_file = clean_info

    # Load each CSV when present; a missing file is represented by an empty frame
    if os.path.exists(raw_file):
        raw_df = pd.read_csv(raw_file, low_memory=False)
    else:
        print(f"{raw_file} not found. Skipping {dataset_name}_raw...")
        raw_df = pd.DataFrame()

    if os.path.exists(clean_file):
        clean_df = pd.read_csv(clean_file, low_memory=False)
    else:
        print(f"{clean_file} not found. Skipping {dataset_name}_clean...")
        clean_df = pd.DataFrame()

    # Both sides are required for a comparison plot
    if raw_df.empty or clean_df.empty:
        print(f"Skipping {dataset_name} due to missing raw or clean data.")
        continue

    print(f"\nPlotting data for {dataset_name} (raw vs clean)...")

    # Collect columns shared by both frames whose dtype is float64/int64 on
    # both sides; 'Timestamp' (the x-axis) and 'Year' are left out
    skipped = ['Timestamp', 'Year']
    plain_numeric = ('float64', 'int64')
    numeric_cols = []
    for candidate in raw_df.columns:
        if candidate not in clean_df.columns or candidate in skipped:
            continue
        if (raw_df[candidate].dtype in plain_numeric and
                clean_df[candidate].dtype in plain_numeric):
            numeric_cols.append(candidate)

    # One pair of figures per shared numeric column
    for col in numeric_cols:
        plot_single_column(raw_df, clean_df, dataset_name, 'Timestamp', col)


In [None]:
# Plot each numeric column from PRL_SRL_TRL_Ergebnis raw and clean timeseries for comparison

# Install plotly and nbformat, upgrading them if they are already installed
!pip install plotly nbformat --upgrade

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Probe for plotly; when the import fails only the matplotlib figures are drawn
try:
    import plotly.express as px
    import plotly.graph_objects as go
except ImportError:
    print("Plotly not available. Falling back to matplotlib plots only.")
    plotly_available = False
else:
    plotly_available = True

# Directory that holds the processed CSV exports
processed_dir = "../Processed_Data"

# Dataset to plot and the CSV files holding its raw and clean versions
dataset_name = "PRL_SRL_TRL_Ergebnis"
raw_file, clean_file = (
    os.path.join(processed_dir, f'PRL_SRL_TRL_Ergebnis_{kind}_timeseries.csv')
    for kind in ('raw', 'clean')
)

# Function to plot a single column from raw or clean data
def plot_single_column(df, dataset_name, time_col, col, data_type, color, linestyle):
    """Plot one column of a single frame against its x-axis label column.

    Draws a static matplotlib figure and, when the module-level flag
    ``plotly_available`` is true, an interactive Plotly figure with the
    same trace.

    The input frame is never modified: the x-axis labels are converted to
    strings in a local Series instead of overwriting ``df[time_col]``, so
    the caller keeps the column in its original dtype.

    Args:
        df: DataFrame holding the data to plot.
        dataset_name: Label used in titles, legend entries and messages.
        time_col: Name of the column used for the x-axis; its values are
            rendered as strings.
        col: Name of the column to plot on the y-axis.
        data_type: Label for the variant being plotted (the callers in this
            cell pass 'raw' or 'clean'); appears in titles and messages.
        color: Line colour passed to both plotting libraries.
        linestyle: Matplotlib line style; '-' maps to a solid Plotly line,
            anything else to a dashed one.

    Returns:
        None. If the frame is empty or lacks ``time_col`` / ``col`` a
        message is printed and nothing is drawn.
    """
    if df.empty or time_col not in df.columns or col not in df.columns:
        print(f"Cannot plot {col} for {dataset_name}_{data_type}: Missing time column or data.")
        return

    # Treat the x-axis column as strings (Ausschreibung) without writing the
    # converted values back into the caller's frame
    x_labels = df[time_col].astype(str)

    # Values that cannot be read as numbers become NaN
    data = pd.to_numeric(df[col], errors='coerce')

    # Matplotlib static plot
    plt.figure(figsize=(12, 6))
    plt.plot(x_labels, data, label=f'{dataset_name}_{data_type} - {col}', color=color, linestyle=linestyle)
    plt.title(f'{dataset_name} - {col} ({data_type.capitalize()})')
    plt.xlabel('Ausschreibung')
    plt.ylabel(col)
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Plotly interactive plot (if available)
    if plotly_available:
        # Best effort: a Plotly failure must not abort the remaining columns,
        # the matplotlib figure above has already been shown
        try:
            fig = go.Figure()
            fig.add_trace(go.Scatter(x=x_labels, y=data, name=f'{dataset_name}_{data_type} - {col}',
                                   line=dict(color=color, width=2, dash='solid' if linestyle == '-' else 'dash')))
            fig.update_layout(
                title=f'Interactive {dataset_name} - {col} ({data_type.capitalize()})',
                xaxis_title='Ausschreibung',
                yaxis_title=col,
                legend=dict(x=0, y=1),
                hovermode='x unified',
                xaxis=dict(tickangle=45)
            )
            fig.show()
        except Exception as e:
            print(f"Failed to create Plotly plot for {dataset_name}_{data_type} ({col}): {e}. Using matplotlib only.")

# Load each CSV when present; a missing file is represented by an empty frame
if os.path.exists(raw_file):
    raw_df = pd.read_csv(raw_file, low_memory=False)
else:
    print(f"{raw_file} not found. Skipping {dataset_name}_raw...")
    raw_df = pd.DataFrame()

if os.path.exists(clean_file):
    clean_df = pd.read_csv(clean_file, low_memory=False)
else:
    print(f"{clean_file} not found. Skipping {dataset_name}_clean...")
    clean_df = pd.DataFrame()

# Plot only when both the raw and the clean frame hold data
if not (raw_df.empty or clean_df.empty):
    print(f"\nPlotting data for {dataset_name} (raw then clean)...")

    # Columns never plotted as values: 'Ausschreibung', 'Beschreibung', 'Year'
    text_cols = ['Ausschreibung', 'Beschreibung', 'Year']

    # Collect columns shared by both frames whose dtype is float64/int64 on
    # both sides, leaving out the excluded ones above
    plain_numeric = ('float64', 'int64')
    numeric_cols = []
    for candidate in raw_df.columns:
        if candidate not in clean_df.columns or candidate in text_cols:
            continue
        if (raw_df[candidate].dtype in plain_numeric and
                clean_df[candidate].dtype in plain_numeric):
            numeric_cols.append(candidate)

    # For every column draw the raw figure first, then the clean one
    for col in numeric_cols:
        plot_single_column(raw_df, dataset_name, 'Ausschreibung', col, 'raw', 'blue', '-')
        plot_single_column(clean_df, dataset_name, 'Ausschreibung', col, 'clean', 'orange', '--')
else:
    print(f"Skipping {dataset_name} due to missing raw or clean data.")
