### Import libraries

In [None]:
import pandas as pd
from pathlib import Path
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler

### Load options

In [None]:
# Parse "key=value" lines from options.txt into the `options` dict.
with open("options.txt", 'r') as f:
    options = {
        key.strip(): value.strip()
        # Split on the first "=" only, so values may themselves contain "=".
        for key, separator, value in (line.partition("=") for line in f)
        # Lines without "=" (e.g. blank lines) are skipped instead of
        # raising IndexError.
        if separator
    }
print(options)

### Load CSV files from the scenario directory

In [None]:
# Root directory of the scenario, taken from the parsed options.
scenario_dir = options["hanoi_scenario_dir"]
# print(scenario_dir)
# The CSV files are read from the "Demands", "Flows" and "Pressures" subfolders of this directory.

def read_files_dataframe(scenario_dir):
    """Load every scenario CSV into one wide DataFrame.

    Each ``*.csv`` file found in the ``Demands``, ``Flows`` and ``Pressures``
    subfolders of *scenario_dir* is read as a single data column, indexed by
    the file's first column and named ``"<subfolder>_<file stem>"``.

    Args:
        scenario_dir: Path (str or ``Path``) of the scenario root directory.

    Returns:
        A DataFrame with one column per CSV file, ordered by subfolder and
        then by sorted file path.

    Raises:
        ValueError: If no CSV file is found in any of the three subfolders.
    """
    frames = []
    for subfolder in ["Demands", "Flows", "Pressures"]:
        # Path.glob yields entries in arbitrary filesystem order; sorting
        # makes the resulting column order reproducible across machines.
        for file in sorted(Path(scenario_dir).glob(f"{subfolder}/*.csv")):
            frames.append(
                pd.read_csv(
                    file,
                    index_col=0,
                    header=0,  # drop the file's own header row ...
                    names=["Index", f"{subfolder}_{file.stem}"],  # ... and use these names
                )
            )
    if not frames:
        raise ValueError(f"No CSV files found under {str(scenario_dir)!r}")
    return pd.concat(frames, axis=1)

# Combine all scenario CSV files into a single dataframe and display it
df = read_files_dataframe(scenario_dir)
df

In [None]:
def read_files_scaled(scenario_dir):
    """Load the scenario CSVs and return them scaled with ``StandardScaler``.

    For each of the ``Demands``, ``Flows`` and ``Pressures`` subfolders of
    *scenario_dir*, every ``*.csv`` file is read as one column (named
    ``"<subfolder>_<file stem>"``), the columns are joined into one frame and
    a fresh ``StandardScaler`` is fitted on and applied to that frame. The
    scaled blocks are then concatenated column-wise.

    Args:
        scenario_dir: Path (str or ``Path``) of the scenario root directory.

    Returns:
        The column-wise concatenation (``np.concatenate(..., axis=1)``) of
        the scaled blocks, in subfolder order and then sorted file path
        order.

    Raises:
        ValueError: If one of the subfolders contains no CSV file.
    """
    scaled_blocks = []
    for subfolder in ["Demands", "Flows", "Pressures"]:
        # The result carries no column labels, so the file order must be
        # deterministic; Path.glob alone gives arbitrary filesystem order.
        columns = [
            pd.read_csv(file, index_col=0, header=0, names=["Index", f"{subfolder}_{file.stem}"])
            for file in sorted(Path(scenario_dir).glob(f"{subfolder}/*.csv"))
        ]
        if not columns:
            raise ValueError(
                f"No CSV files found in {subfolder!r} under {str(scenario_dir)!r}"
            )
        subfolder_frame = pd.concat(columns, axis=1)

        # One scaler per subfolder, fitted and applied in a single step.
        scaled_blocks.append(StandardScaler().fit_transform(subfolder_frame))

    return np.concatenate(scaled_blocks, axis=1)
# Scaled copy of the scenario data (a NumPy array, not a DataFrame); displayed below
data_scaled = read_files_scaled(scenario_dir)
data_scaled

In [None]:
# Plot the first 1000 rows of the Demands_Node_1 column together with its
# exponentially weighted moving average (span=12) on the same axes.
demand_head = df.Demands_Node_1.iloc[:1000]
ax = demand_head.plot()
demand_head.ewm(span=12).mean().plot(ax=ax)

### Check for any NaN values

In [None]:
# for i in dfs:
#     if i.isna().any():
#         display(i)

In [None]:
# Count how many columns contain at least one NaN (True) versus none (False)
df.isna().any().value_counts()


In [None]:
# Reduce the combined dataframe to 30 principal components and report the
# shapes before and after the projection.
# NOTE(review): PCA is fitted on the unscaled `df`, not on `data_scaled` —
# confirm this is intended.
pca = PCA(n_components=30)
pca_features = pca.fit_transform(df)
print('Shape before PCA: ', df.shape)
print('Shape after PCA: ', pca_features.shape)

In [None]:
# Print the ten largest explained-variance values, largest first.
top_variances = sorted(pca.explained_variance_, reverse=True)[:10]
for variance in top_variances:
    print(variance)

In [None]:
# Bar chart of the explained variance per PCA component (numbered from 1).
component_numbers = range(1, len(pca.explained_variance_) + 1)
plt.bar(component_numbers, pca.explained_variance_)

plt.xlabel('PCA Feature')
plt.ylabel('Explained variance')
plt.title('Feature Explained Variance')
plt.show()