In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import os


In [10]:
def import_solar_data(file_path):
  """Load one inverter CSV export and prepare it for analysis.

  Steps, in order:
    1. Read the CSV at ``file_path``.
    2. Copy each known raw column (e.g. ``'Pac(W)'``) to a normalised name
       (e.g. ``'AC_Power'``). The raw columns are kept alongside the copies.
    3. Derive ``Voltage_AC`` / ``Current_AC`` as the mean of the three phase
       columns, and ``Efficiency`` as ``AC_Power / DC_Power * 100``
       (0 wherever ``DC_Power`` is not strictly positive).
    4. Warn (print) about expected columns that are still missing.
    5. Convert ``Timestamp`` to datetime (best effort; failures are printed).
    6. Derive ``Has_Error`` (0/1) from ``Status_Flags`` when present.
    7. Fill missing values: power columns are interpolated, the other
       numeric columns are filled with their median.
    8. Report (print) how many ``AC_Power`` values fall outside 3 x IQR fences.

  Args:
    file_path: Path of the CSV file to read.

  Returns:
    pandas.DataFrame with the raw columns plus the normalised/derived ones.

  Raises:
    FileNotFoundError: If ``file_path`` does not exist.
  """
  if not os.path.exists(file_path):
    raise FileNotFoundError(f"The file {file_path} does not exist")

  print(f"Importing data from {file_path}...")
  df = pd.read_csv(file_path)

  # Raw CSV header -> normalised column name used by the rest of the function
  column_mapping = {
    'Serial number': 'Serial_Number',
    'Time': 'Timestamp',
    'Pac(W)': 'AC_Power',
    'Ppv(W)': 'DC_Power',
    'VacR(V)': 'Voltage_AC_R',
    'VacS(V)': 'Voltage_AC_S',
    'VacT(V)': 'Voltage_AC_T',
    'VacRS(V)': 'Voltage_AC_RS',
    'VacST(V)': 'Voltage_AC_ST',
    'VacTR(V)': 'Voltage_AC_TR',
    'IacR(A)': 'Current_AC_R',
    'IacS(A)': 'Current_AC_S',
    'IacT(A)': 'Current_AC_T',
    'Fac(Hz)': 'Frequency'
  }

  # Copy (not rename) every raw column that is present, so the original
  # headers remain available next to the normalised names.
  for src, dst in column_mapping.items():
    if src in df.columns:
      df[dst] = df[src]

  # Average the three phases when all of them are available
  voltage_phases = ['Voltage_AC_R', 'Voltage_AC_S', 'Voltage_AC_T']
  if all(name in df.columns for name in voltage_phases):
    df['Voltage_AC'] = df[voltage_phases].mean(axis=1)

  current_phases = ['Current_AC_R', 'Current_AC_S', 'Current_AC_T']
  if all(name in df.columns for name in current_phases):
    df['Current_AC'] = df[current_phases].mean(axis=1)

  # Calculate efficiency if not directly provided
  if 'Efficiency' not in df.columns and 'AC_Power' in df.columns and 'DC_Power' in df.columns:
    has_dc_input = df['DC_Power'] > 0
    # Mask non-positive (or missing) DC power before dividing so no division
    # by zero happens; those rows get an efficiency of 0. Rows with positive
    # DC power but missing AC power stay NaN and are filled further below.
    ratio = df['AC_Power'] / df['DC_Power'].where(has_dc_input) * 100
    df['Efficiency'] = ratio.where(has_dc_input, 0)

  # Columns the downstream analysis expects; only a warning is printed when
  # some are absent, the import still continues.
  expected_columns = [
    'Serial_Number', 'Timestamp', 'AC_Power', 'DC_Power',
    'Voltage_AC_R', 'Voltage_AC_S', 'Voltage_AC_T',
    'Voltage_AC_RS', 'Voltage_AC_ST', 'Voltage_AC_TR',
    'Current_AC_R', 'Current_AC_S', 'Current_AC_T',
    'Voltage_AC', 'Current_AC', 'Frequency', 'Efficiency'
  ]

  missing_columns = [col for col in expected_columns if col not in df.columns]
  if missing_columns:
    print(f"Warning: Missing expected columns: {missing_columns}")

  # Convert timestamp to datetime. This is deliberately best effort: if both
  # attempts fail the column is left as read and the errors are only printed.
  if 'Timestamp' in df.columns:
    try:
      df['Timestamp'] = pd.to_datetime(df['Timestamp'])
      print(f"Timestamp range: {df['Timestamp'].min()} to {df['Timestamp'].max()}")
    except Exception as e:
      print(f"Error converting timestamp: {e}")
      # Retry with an explicit format if the automatic parsing failed
      try:
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S')
        print(f"Timestamp range: {df['Timestamp'].min()} to {df['Timestamp'].max()}")
      except Exception as e2:
        print(f"Error with alternative timestamp format: {e2}")

  # Binary error indicator: 0 when the flag is 0 or missing, 1 otherwise
  if 'Status_Flags' in df.columns:
    flags = df['Status_Flags']
    df['Has_Error'] = (~(flags.eq(0) | flags.isna())).astype('int64')

  # Handle missing values
  numeric_cols = ['AC_Power', 'DC_Power', 'Voltage_AC', 'Current_AC', 'Frequency', 'Efficiency']
  available_cols = [col for col in numeric_cols if col in df.columns]

  for col in available_cols:
    missing = df[col].isna().sum()
    if missing > 0:
      print(f"Column {col} has {missing} missing values")
      if col in ['AC_Power', 'DC_Power']:
        # Power columns: interpolate between neighbours; bfill/ffill then
        # cover gaps at the very start/end that interpolation cannot reach.
        # (.bfill()/.ffill() replace the deprecated fillna(method=...).)
        df[col] = df[col].interpolate(method='linear').bfill().ffill()
      else:
        df[col] = df[col].fillna(df[col].median())

  # Count potential outliers in AC_Power using fences at 3 x IQR beyond the
  # quartiles; the rows are only reported, not removed.
  if 'AC_Power' in df.columns:
    q1 = df['AC_Power'].quantile(0.25)
    q3 = df['AC_Power'].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 3 * iqr
    upper_bound = q3 + 3 * iqr

    outlier_count = int(((df['AC_Power'] < lower_bound) | (df['AC_Power'] > upper_bound)).sum())
    if outlier_count > 0:
      print(f"Found {outlier_count} potential outliers in AC_Power")

  print(f"Successfully imported data with {df.shape[0]} rows and {df.shape[1]} columns")
  return df

In [None]:
def import_multiple_inverter_data(folder_path):
  """Read every CSV file in a folder and stack them into one DataFrame.

  Each file whose name ends in ``.csv`` is read with ``pd.read_csv`` as-is
  (no column renaming or cleaning is applied here) and the frames are
  concatenated row-wise with a fresh 0..n-1 index. Files are processed in
  sorted name order so the row order does not depend on the order in which
  the operating system lists the directory.

  Args:
    folder_path: Directory containing the per-inverter CSV files.

  Returns:
    pandas.DataFrame holding the rows of all CSV files in the folder.

  Raises:
    FileNotFoundError: If ``folder_path`` does not exist or contains no
      ``.csv`` files.
  """
  # Check if folder exists
  if not os.path.exists(folder_path):
    raise FileNotFoundError(f"The folder {folder_path} does not exist")

  # List all CSV files in the folder
  csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
  if not csv_files:
    raise FileNotFoundError(f"No CSV files found in the folder {folder_path}")

  # Read everything first and concatenate once; growing a frame file by file
  # would re-copy the accumulated data on every step.
  frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_files]
  combined_df = pd.concat(frames, ignore_index=True)

  print(f"Successfully combined data from {len(csv_files)} files")
  return combined_df

# Folder with the per-inverter CSV files (relative path, resolved against the
# current working directory), and the frame produced from it.
folder_path = './inverters/'
combined_data = import_multiple_inverter_data(folder_path=folder_path)

Successfully combined data from 12 files
Combined data saved to ./inverters/combined_inverter_data.csv
Serial_Number column not found in the combined data
