In [92]:
import numpy as np
import pandas as pd
import os

#### MetaData

In [93]:
# read and merge csv files (Weather)
read_weather = True
# process and save processed csv files (Weather)
process_weather = True
station_name = 'Brooks'


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# GENERATION
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# read and merge csv files (Generation)
read_generation = True
# read and merge csv files (Generation)
save_process_generation = True
asset_name = "BSC1 Brooks Solar"


In [94]:
def clean_df(file_path: os.PathLike):
    df = pd.read_csv(file_path, encoding='unicode_escape')  # Read CSV file with proper encoding
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]  # Remove any "Unnamed" columns that might be auto-generated
    return df  # Return the cleaned DataFrame

## Weather Data

#### Read csv files and merge them

In [95]:
# Read CSVs and drop any unnamed index column
if read_weather:
    # Create a new folder to save the merged CSV file
    os.makedirs('./Data/Merged', exist_ok=True)
    # Load cleaned CSVs (each file contains data for a specific time period)
    file_path = f"./Data/ACIS/{station_name}"
    all_weather_data = pd.concat([clean_df(f'{file_path}/{file}') for file in os.listdir(f'{file_path}')], ignore_index=True)
    # Save the merged dataframe as a new CSV file without index values
    all_weather_data.to_csv(f"./Data/Merged/{station_name}_weather_data.csv", index=False)  # `index=False` prevents writing index column to CSV
    print('Merged CSV file saved successfully!')
    print(f"Merged data has {all_weather_data.shape[0]} rows.")

Merged CSV file saved successfully!
Merged data has 43843 rows.


In [122]:
def process_weather_station_data(df: pd.DataFrame, save: bool = False, columns = ['Date', 'Air Temp. Inst. (Â°C)', 'Humidity Inst. (%)', 'Incoming Solar Rad. (W/m2)','Precip. (mm)', 'Wind Speed 10 m Syno. (km/h)', 'Wind Dir. 10 m Syno. (Â°)', 'Wind Speed 10 m Avg. (km/h)', 'Wind Dir. 10 m Avg. (Â°)',]) -> pd.DataFrame:
    """
    Get data for a specific asset.
    """
    try:
        print(df.columns)
        df['Date'] = pd.to_datetime(df['Date (Local Standard Time)'], errors='coerce')        
        columns_to_drop = [column for column in df.columns if column not in columns]
        df.drop(columns=columns_to_drop, axis=1, inplace=True)
        print(df.isna().sum())
        df.fillna(method='ffill', inplace=True)
        if save:
            df.to_csv(f'./Data/Merged/{station_name}_weather.csv', index=False)
            print('Processed CSV file saved successfully!')
        print("Miissing values filled")
        print(f"Data for {station_name} has {df.shape[0]} rows.")
        return df
    except Exception as e:
        print(f"No data found for {station_name}. Error: {e}")

#### Process Data

In [123]:
# Load the merged CSV file
weather_data = clean_df(f"./Data/Merged/{station_name}_weather_data.csv")
if process_weather:
    station_data = process_weather_station_data(weather_data, save=True)
    # Convert Data
    print(station_data)
    # Create a new folder to save the merged CSV file
    os.makedirs('./Data/Merged', exist_ok=True)
    # Fill missing values with the average of previous and next values
    # print(station_data.isna().sum())
    # print(f"Processed data has {station_data.shape[0]} rows.")

Index(['Station Name', 'Date (Local Standard Time)', 'Air Temp. Inst. (Â°C)',
       'Air Temp. Inst. Source Flag', 'Air Temp. Inst. Comment',
       'Humidity Inst. (%)', 'Humidity Inst. Source Flag',
       'Humidity Inst. Comment', 'Incoming Solar Rad. (W/m2)',
       'Incoming Solar Rad. Source Flag', 'Incoming Solar Rad. Comment',
       'Precip. (mm)', 'Precip. Source Flag', 'Precip. Comment',
       'Wind Speed 10 m Syno. (km/h)', 'Wind Speed 10 m Syno. Source Flag',
       'Wind Speed 10 m Syno. Comment', 'Wind Dir. 10 m Syno. (Â°)',
       'Wind Dir. 10 m Syno. Source Flag', 'Wind Dir. 10 m Syno. Comment',
       'Wind Speed 10 m Avg. (km/h)', 'Wind Speed 10 m Avg. Source Flag',
       'Wind Speed 10 m Avg. Comment', 'Wind Dir. 10 m Avg. (Â°)',
       'Wind Dir. 10 m Avg. Source Flag', 'Wind Dir. 10 m Avg. Comment'],
      dtype='object')
Air Temp. Inst. (Â°C)              0
Humidity Inst. (%)                 0
Incoming Solar Rad. (W/m2)         0
Precip. (mm)                 

  df.fillna(method='ffill', inplace=True)


Processed CSV file saved successfully!
Miissing values filled
Data for Brooks has 43843 rows.
       Air Temp. Inst. (Â°C)  Humidity Inst. (%)  Incoming Solar Rad. (W/m2)  \
0                       -1.3                83.0                         0.0   
1                       -0.6                79.0                         0.0   
2                       -1.9                82.0                         0.0   
3                       -2.3                86.0                         0.0   
4                       -1.8                83.0                         0.0   
...                      ...                 ...                         ...   
43838                  -11.4                89.0                         0.0   
43839                  -10.5                88.0                         0.0   
43840                  -10.6                87.0                         0.0   
43841                  -10.8                86.0                         0.0   
43842                  -11

## Generation Data

#### Read csv files and merge them

In [98]:
if read_generation:
    # Load data using a for loop and a list comprehension
    generation_data = pd.concat([clean_df(f'./Data/CSD/{file}') for file in os.listdir('./Data/CSD')], ignore_index=True)
    # Create a new folder to save the merged CSV file
    os.makedirs('./Data/Merged', exist_ok=True)
    # Save the merged dataframe as a new CSV file without index values
    generation_data.to_csv('./Data/Merged/Generation.csv', index=False)  # `index=False` prevents writing index column to CSV
    print('Merged CSV file saved successfully!')
    print(f"Merged data has {generation_data.shape[0]} rows.")

Merged CSV file saved successfully!
Merged data has 7252077 rows.


#### Data Processing

In [99]:
# Find assets with most data in the generation data
def get_assets_with_most_data(df: pd.DataFrame, fuel_type: str) -> pd.Series:
    """
    Get the assets with the most data points in the generation data.
    """
    try:
        df = df[df['Fuel Type'] == fuel_type.upper()]
        asset_list = df['Asset Name'].value_counts()
        most_valuable_asset = asset_list.where(asset_list == asset_list[0]).dropna()
        print(f"Total number of assets: {len(asset_list)}")
        print(f"{len(most_valuable_asset)} assets have the most data. They have {most_valuable_asset[0]} data points.")
        return most_valuable_asset
    except Exception as e:
        print(f"No data found for {fuel_type} assets.")

In [100]:
# Get data for a specific asset.
def get_asset_data(df: pd.DataFrame, asset_name: str, save: bool = False, columns = ['Volume', 'Maximum Capability', 'System Capability']) -> pd.DataFrame:
    """
    Get data for a specific asset.
    """
    try:
        asset_data = df[df['Asset Name'] == asset_name]
        columns_to_drop = [column for column in asset_data.columns if column not in columns]
        asset_data['Date'] = pd.to_datetime(asset_data['Date (MST)'], errors='coerce')
        asset_data.drop(columns=columns_to_drop, axis=1, inplace=True)
        print(asset_data.isna().sum())
        asset_data.fillna(method='ffill', inplace=True)
        if save:
            asset_data.to_csv(f'./Data/Merged/{asset_name}_generation.csv', index=False)
            print('Processed CSV file saved successfully!')
        print("Miissing values filled")
        print(f"Data for {asset_name} has {asset_data.shape[0]} rows.")
        return asset_data
    except Exception as e:
        print(f"No data found for {asset_name}.")

In [125]:
# Read merged data
generation_data = clean_df('./Data/Merged/Generation.csv')

In [102]:
# get_assets_with_most_data(generation_data,'wind')

In [126]:
asset_gen_data = get_asset_data(generation_data, asset_name, save_process_generation)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asset_data['Date'] = pd.to_datetime(asset_data['Date (MST)'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asset_data.drop(columns=columns_to_drop, axis=1, inplace=True)
  asset_data.fillna(method='ffill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asset_data.fillna(method='ffill', inplace=True)


Volume                0
Maximum Capability    0
System Capability     0
Date                  0
dtype: int64
Processed CSV file saved successfully!
Miissing values filled
Data for BSC1 Brooks Solar has 43848 rows.


## Merge Data

In [127]:
merged_df = pd.merge(asset_gen_data, weather_data, on='Date', how='inner')
merged_rows = merged_df.shape[0]
print(f"The merged file has {merged_rows} rows.")
print(merged_df.isna().sum())
merged_df.to_csv(f"./Data/Merged/{asset_name}_Processed_and_Data.csv", index=False)  # `index=False` prevents writing index column to CSV

The merged file has 43843 rows.
Volume                          0
Maximum Capability              0
System Capability               0
Date                            0
Air Temp. Inst. (Â°C)           0
Humidity Inst. (%)              0
Incoming Solar Rad. (W/m2)      0
Precip. (mm)                    0
Wind Speed 10 m Syno. (km/h)    0
Wind Dir. 10 m Syno. (Â°)       0
Wind Speed 10 m Avg. (km/h)     0
Wind Dir. 10 m Avg. (Â°)        0
dtype: int64
