# Preparing the data for classification

**Author:** Manuela Linke & Tobias Meßmer, HTWG Konstanz 

**Date:** 15.03.2024 

**Summary:** This Script reads in simulated grid data and converts the results into standardized, one-hot-encoded classes. The data gets split into train (70%), test (15%) and validation data (15%).

In [1]:
import numpy as np
import keras
from keras.models import Sequential
import tensorflow as tf
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.utils import to_categorical
import pandas as pd 
import os
import math
from collections import Counter
import pandapower as pp

# Select modus and define seed for random state

In [16]:
# Select which data should be considered as input data: load, volt or loadANDvolt
modus = 'volt'

# Set seed for random state
seed = 42

## Read Data

In [17]:
#define path
net = pp.from_pickle("cossmic_grid.p")
input_path = os.path.join('data', 'preprocessed', '')
date = '2024-03-25_16-56-21' # Replace this with the actual date in format yyyy-mm-dd_hh-mm-ss'

#Load data
X_load = np.load(input_path + date + "_Load_distribution.npy")
X_volt = np.load(input_path + date + "_voltage_distribution.npy")
y_changes = np.load(input_path + date + "__min_chg_from_base.npy")

## Take a look at the data

In [18]:
X_load.shape

(52558, 11)

In [19]:
X_load[:5]

array([[ 1.63e-01,  1.60e-03,  2.56e-01,  6.70e-03,  2.80e-04,  5.20e-03,
         5.40e-04, -3.70e-03, -3.50e-03, -2.00e-02,  1.92e-01],
       [ 1.63e-01,  1.12e-02,  7.68e-01,  6.70e-03,  2.80e-04,  1.56e-02,
         1.74e-02,  3.20e-04,  3.60e-04,  2.70e-03,  1.92e-01],
       [ 4.89e-01,  1.12e-02,  7.97e-02,  2.01e-02,  2.80e-04,  5.20e-03,
         5.80e-03,  3.20e-04,  3.50e-03,  2.70e-03,  5.76e-01],
       [ 4.89e-01,  1.12e-02,  2.56e-01,  2.01e-02,  2.80e-04,  4.40e-04,
         1.74e-02,  3.70e-03,  3.60e-04,  2.00e-02,  1.92e-01],
       [ 1.63e-01, -1.12e-02,  2.56e-01,  5.80e-04,  8.10e-03, -5.20e-03,
        -5.80e-03,  3.70e-03,  3.50e-03,  2.00e-02,  1.92e-01]])

In [20]:
# set X-data to variable X_data (X_load, X_volt or X_load and X_volt)
if modus == 'load':
    X_data = X_load
    
if modus == 'volt':
    X_data = X_volt
    
if modus == 'loadANDvolt':
    X_data = np.append(X_load, X_volt, axis=1)
    
# set y-data to variable y_data_matrix (changes)
y_data_matrix = y_changes

print("Lets take a look at the solutions for the " + str(len(X_data)) + " Problems:")

indikator = Counter(y_data_matrix[:,-1])
trafo2 = Counter(y_data_matrix[:,-2])
trafo1 = Counter(y_data_matrix[:,-3])

print("Not solvable (0) / Solvable (1) Problems: " + str(indikator))
print("Used steps for Transformer station 1: " + str(trafo1))
print("Used steps for Transformer station 1: " + str(trafo2))

Lets take a look at the solutions for the 52558 Problems:
Not solvable (0) / Solvable (1) Problems: Counter({0: 34720, 1: 17838})
Used steps for Transformer station 1: Counter({-1: 37176, 0: 14283, -2: 1099})
Used steps for Transformer station 1: Counter({-1: 29385, -2: 18247, 0: 4926})


## Prepare y-data
### One hot encoding of the tranformer tap positions and converting y-data to a vector

In [21]:
num_switches = len(net.switch[net.switch.et == 'l'])

# number of transformer stations
num_trafos = len(net.trafo)

# defining the colums belongig to the transformer stations
trafo_columns_start = num_switches
trafo_columns_end = num_switches + num_trafos
trafo_columns_indices = list(range(trafo_columns_start, trafo_columns_end))

In [22]:
tap_pos_ranges = [range(row['tap_min'], row['tap_max']+1) for index, row in net.trafo.iterrows()]
categories = [list(tap_range) for tap_range in tap_pos_ranges]

# Initialisation of the OneHotEncoder
encoder = OneHotEncoder(categories=categories)

# Encoding the transformer tap changer position columns to one hot format
encoded_columns = encoder.fit_transform(y_data_matrix[:, trafo_columns_indices]).toarray()

# Remove the old columns and add the encoded columns
y_data_matrix_extended = np.hstack((y_data_matrix[:, :trafo_columns_start], encoded_columns, y_data_matrix[:, trafo_columns_end:]))

In [23]:
# Converting Boolean values to integers and calculating decimal values
y_data = np.dot(y_data_matrix_extended.astype(int), 2**np.arange(y_data_matrix_extended.shape[1])[::-1])

## Prepare for Classification: Changing the large decimal values to smaller numbers

In [24]:
# Find unique values in y_data and assign them to a new index
unique_values, y_data_result = np.unique(y_data, return_inverse=True)
print('Amount of y_data for training, validation and test:', len(y_data_result))
print('Number of occuring results:', max(y_data_result))

Amount of y_data for training, validation and test: 52558
Number of occuring results: 34


## shuffle and split data to trainings, validation and test data

In [25]:
#shuffle and split 
X_train, X_tobesplittet, y_train, y_tobesplittet = train_test_split(X_data, y_data_result, test_size=0.3, random_state=seed)
X_val, X_test, y_val, y_test = train_test_split(X_tobesplittet, y_tobesplittet, test_size=0.5, random_state=seed)
print('Große des Datensatzes: ' + str(len(X_train)) + ' (Training) + ' + str(len(X_val)) + ' (Validation) + ' + str(len(X_test)) + ' (Test) = ' + str(len(X_train)+len(X_val)+len(X_test)))

Große des Datensatzes: 36790 (Training) + 7884 (Validation) + 7884 (Test) = 52558


## Standardize X-data 

In [26]:
# Using StandardScaler 
sc = StandardScaler()

# to be tested if necessary .. 
if modus == 'loadANDvolt':
    # Adapt the scaler to the training data and its transformation
    X_train_std_ = sc.fit_transform(X_train[:,:X_load.shape[1]])
    # Transformation of the test data with the same parameters as the training data
    X_val_std_ = sc.transform(X_val[:,:X_load.shape[1]])
    X_test_std_ = sc.transform(X_test[:,:X_load.shape[1]])
    # appending voltage values without standardisation
    X_train_std = np.append(X_train_std_, X_train[:,X_load.shape[1]:],1)
    X_val_std = np.append(X_val_std_, X_val[:,X_load.shape[1]:],1)
    X_test_std = np.append(X_test_std_, X_test[:,X_load.shape[1]:],1)

else:
    # Adapt the scaler to the training data and its transformation
    X_train_std = sc.fit_transform(X_train)
    # Transformation of the test data with the same parameters as the training data
    X_val_std = sc.transform(X_val)
    X_test_std = sc.transform(X_test)

## Classify y-data

In [27]:
# Turning y data into one hot encoding
Y_train = to_categorical(y_train, max(y_data_result) + 1)
Y_val = to_categorical(y_val, max(y_data_result) + 1)
Y_test = to_categorical(y_test, max(y_data_result) + 1)

## Save data

In [28]:
import os

# Define the path you want to ensure exists
savePath = os.path.join('data', 'prepared', '')

# Check if the path exists
if not os.path.exists(savePath):
    # If it doesn't exist, create it
    os.makedirs(savePath)
    print(f"Path '{savePath}' was created.")
else:
    print(f"Path '{savePath}' already exists.")

Path 'data\prepared\' already exists.


In [29]:
np.save(savePath + date + "_X_train_std_" + modus + ".npy", X_train_std)
np.save(savePath + date + "_X_val_std_" + modus + ".npy", X_val_std)
np.save(savePath + date + "_X_test_std_" + modus + ".npy", X_test_std)

np.save(savePath + date + "_Y_train.npy", Y_train) 
np.save(savePath + date + "_Y_val.npy", Y_val)
np.save(savePath + date + "_Y_test.npy", Y_test)

In [21]:
# When making sure to use the rigth columns in the grid, its better to have the names of the loads stored. Feather-format is right for this:
# converting to df to store in feather format
colum_names = net.load.name
if modus != 'loadANDvolt':
    X_train_std_df = pd.DataFrame(X_train_std, columns = colum_names)
    X_train_std_df.to_feather(savePath + date + "_X_train_std_" + modus + ".feather")
    X_val_std_df = pd.DataFrame(X_val_std, columns = colum_names)
    X_val_std_df.to_feather(savePath + date + "_X_val_std_" + modus + ".feather")
    X_test_std_df = pd.DataFrame(X_test_std, columns = colum_names)
    X_test_std_df.to_feather(savePath + date + "_X_test_std_" + modus + ".feather")