# Environment Setting Preparation

In [None]:
!pip install git+https://github.com/Quantmetry/qolmat

# Importing Packages

In [111]:
from qolmat.benchmark import comparator, missing_patterns
from qolmat.imputations import imputers
from qolmat.utils import plot



In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import scienceplots


import sklearn
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.preprocessing import *
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer



import missingno
import warnings
import gc

# Global Configuration Setting Controlling Randomness, Trials, etc.

In [113]:
# --- Experiment constants ---
# Seed handed to the stochastic imputers (used as `random_state`).
SEED = 42
# NOTE(review): not referenced in the cells visible here — presumably consumed
# by a later step; confirm before removing.
n_trials = 50

# --- Library behaviour ---
# Make scikit-learn transformers return pandas objects instead of arrays.
sklearn.set_config(transform_output="pandas")
# Do not report floating-point underflow.
np.seterr(under="ignore")
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Read the Data

In [115]:
# Load the training set. The "Date" column becomes the index and is parsed
# as dates, reading ambiguous values day-first.
df = pd.read_csv(
    "X_train.csv",
    index_col="Date",
    parse_dates=True,
    dayfirst=True,
)


In [None]:
# Inspect the column labels of the training frame.
df.columns

In [None]:
# Inspect the (rows, columns) size of the training frame.
df.shape

# Missing Value Imputation

In [118]:
# Min-max scale the training frame; the scaled copy feeds the imputer
# benchmark, which reads its `.columns`.
scaler = MinMaxScaler()

# fit() hands back the fitted estimator, so `df_scaling` names the fitted
# scaler (not a DataFrame, despite the prefix).
df_scaling = scaler.fit(df)
df_scaled = scaler.transform(df)

In [None]:
# Benchmark of candidate imputers on the scaled training data, using holes
# produced by a UniformHoleGenerator (UHG) and scored with MAE.

# --- Candidate imputers ---
mean_imputer = imputers.ImputerSimple(strategy="mean")
median_imputer = imputers.ImputerSimple(strategy="median")
knn_imputer = imputers.KNNImputer(n_neighbors=3)
em_imputer = imputers.ImputerEM(
    model="VAR",
    method="mle",
    max_iter_em=500,
    n_iter_ou=20,
    dt=1e-3,
    p=1,
    random_state=SEED,
)
inter_imputer = imputers.ImputerInterpolation(method="linear")
LOCF_imputer = imputers.ImputerLOCF()
# Time interpolation and TSA decomposition (additive model, period of 365).
TSA_imputor = imputers.ImputerResiduals(period=365, model_tsa="additive")

# Display label -> imputer instance; insertion order is kept as declared.
dict_imputers = {
    "Mean": mean_imputer,
    "Median": median_imputer,
    "K-nearest neighbors": knn_imputer,
    "EM sampler": em_imputer,
    "interpolation": inter_imputer,
    "LOCF": LOCF_imputer,
    "TSA": TSA_imputor,
}

# --- Hole generation: 10 splits, 25% of values masked, all columns eligible ---
generator_holes = missing_patterns.UniformHoleGenerator(
    n_splits=10,
    subset=df_scaled.columns,
    ratio_masked=0.25,
)

# --- Comparison ---
comparison = comparator.Comparator(
    dict_imputers,
    df_scaled.columns,
    generator_holes=generator_holes,
    # Only MAE is evaluated; the wider set considered earlier was
    # ["mae", "wmape", "KL_columnwise", "energy"].
    metrics=["mae"],
)

results = comparison.compare(df_scaled)


In [121]:
# Persist the imputer comparison results to disk.
results.to_csv("X_train_imputation_results.csv")


In [122]:
# Column groups routed to the EM imputer.
em1_imputer_columns = ["DS of Influent Primary Sludge (%)"]
em2_imputer_columns = [
    "DS of Influent Waste Sludge (%)",
    "VS of Influent Waste Sludge (%)",
]
em3_imputer_columns = [
    "DS of effluent Sludge (%)",
    "VS of effluent Sludge (%)",
]
em4_imputer_columns = ["Influent Primary to Waste Sludge flowrate Ratio"]
em5_imputer_columns = [
    "Influent Waste Sludge flowrate (m3/d)",
    "Total Effluent Sludge flowrate (m3/d)",
]

# Column groups routed to the KNN imputer.
knn1_imputer_columns = ["VS of Influent Primary Sludge (%)"]
knn2_imputer_columns = ["DS in Digesters (%)"]
knn3_imputer_columns = [
    "Alkalinity (mg CaCO3/L)",
    "Fatty Acid (mg/L)",
    "pH",
    "Temperature (Degrees Celsius)",
]
knn4_imputer_columns = ["Influent Primary Sludge flowrate (m3/d)"]

# Imputer instances. Only `em_imputer` and `knn_imputer` are wired into the
# ColumnTransformer below; the others are (re)defined but not used in this cell.
mean_imputer = imputers.ImputerSimple(strategy="mean")
median_imputer = imputers.ImputerSimple(strategy="median")
knn_imputer = imputers.KNNImputer(n_neighbors=5)
em_imputer = imputers.ImputerEM(
    model="VAR",
    method="mle",
    max_iter_em=50,
    n_iter_ou=15,
    dt=1e-3,
    p=1,
    random_state=SEED,
)
inter_imputer = imputers.ImputerInterpolation(method="linear")
LOCF_imputer = imputers.ImputerLOCF()
TSA_imputor = imputers.ImputerResiduals(period=365, model_tsa="additive")

# (name, imputer, columns) triples, alternating EM and KNN groups.
# NOTE(review): the KNN entries are named "...imputer_columns" while the EM
# entries are named "...imputer"; names are kept as-is because they are
# runtime identifiers of the transformer — confirm before renaming.
transformers_for_imputing = [
    ("em1_imputer", em_imputer, em1_imputer_columns),
    ("knn1_imputer_columns", knn_imputer, knn1_imputer_columns),
    ("em2_imputer", em_imputer, em2_imputer_columns),
    ("knn2_imputer_columns", knn_imputer, knn2_imputer_columns),
    ("em3_imputer", em_imputer, em3_imputer_columns),
    ("knn3_imputer_columns", knn_imputer, knn3_imputer_columns),
    ("em4_imputer", em_imputer, em4_imputer_columns),
    ("knn4_imputer_columns", knn_imputer, knn4_imputer_columns),
    ("em5_imputer", em_imputer, em5_imputer_columns),
]

# Fit on the training frame, then impute that same frame.
# NOTE(review): the ColumnTransformer uses default options; confirm that the
# resulting column names/order and the handling of any column not listed
# above match what downstream steps expect.
column_imputers = ColumnTransformer(transformers_for_imputing)
column_imputers.fit(df)

df_filled_na = column_imputers.transform(df)


In [None]:
# Sanity check: count of remaining missing values per column after imputation.
print(df_filled_na.isnull().sum())


In [124]:
# Rebind `df` to the imputed training frame; the cells below use `df` in that
# sense (the frame originally read from X_train.csv is no longer under this name).
df = df_filled_na

In [125]:
# Persist the imputed training data.
df.to_csv("XtrainImputed.csv")


# Outlier Reconstruction

In [None]:
# --- Outlier reconstruction using the z-score method for train data ---
# A row is flagged when ANY of its columns lies more than `z_threshold`
# standard deviations from that column's mean. Flagged rows are then rebuilt
# in every column by linear interpolation over the row position, using only
# the non-flagged rows as support points.
from scipy.interpolate import interp1d

# Z-score threshold above which a value counts as an outlier.
z_threshold = 3

# Column-wise z-scores of the training frame.
z_scores = (df - df.mean()) / df.std()

# Boolean row mask: True where at least one column exceeds the threshold.
outliers = (np.abs(z_scores) > z_threshold).any(axis=1)

# Positional x-axis (0 .. len(df) - 1) used for interpolation.
indices = np.arange(len(df))

# One linear interpolant per column, fitted on the non-outlier rows only.
# Positions outside the range of the kept rows are linearly extrapolated.
interp_funcs = {
    column: interp1d(
        indices[~outliers],
        df.loc[~outliers, column],
        kind="linear",
        fill_value="extrapolate",
    )
    for column in df.columns
}

# Evaluate each interpolant at every row position and restore the original index.
data_interp = pd.DataFrame(
    {column: interp_funcs[column](indices) for column in df.columns},
    index=df.index,
)

df_ro = data_interp

# Number of rows flagged as outliers.
num_outliers = np.sum(outliers)
print(f"Number of outliers detected: {num_outliers}")

# Summary statistics of the reconstructed data. This is deliberately the last
# expression of the cell: a notebook only renders the final expression, so the
# table was never shown when it sat before the print statement.
df_ro_sum = df_ro.describe().round(2)
df_ro_sum.transpose()


In [127]:
# Name the imputed, outlier-reconstructed training frame for export.
X_train_Imputed_ROutlier = df_ro

# Saving the Imputed and Outlier Reduced Train Data

In [128]:
# Write the final training frame to disk (note the upper-case ".CSV" extension).
X_train_Imputed_ROutlier.to_csv("X_train_Imputed_ROutlier.CSV")

# Preprocessing of Test Data

In [130]:
# Load the test set with the same options as the training set: "Date" is the
# index and is parsed as dates, reading ambiguous values day-first.
df_test = pd.read_csv(
    "X_test.csv",
    index_col="Date",
    parse_dates=True,
    dayfirst=True,
)

In [131]:
# Impute the test data with the already-fitted `column_imputers`
# (transform only; nothing is re-fitted on the test frame here).
df_filled_na_test = column_imputers.transform(df_test)


In [None]:
# Sanity check: count of remaining missing values per column in the imputed test data.
print(df_filled_na_test.isnull().sum())


In [133]:
# Persist the imputed test data.
df_filled_na_test.to_csv("XtestImputed.csv")


In [None]:
# --- Outlier reconstruction using the z-score method for test data ---
# A test row is flagged when ANY of its columns lies more than `z_threshold`
# standard deviations from the corresponding column mean of `df`. Flagged
# rows are rebuilt in every column by linear interpolation over the row
# position, using only the non-flagged test rows as support points.
from scipy.interpolate import interp1d

# Z-score threshold above which a value counts as an outlier.
z_threshold = 3

# Z-scores of the test data, using mean and standard deviation from the
# training data (`df`), not from the test data itself.
z_scores = (df_filled_na_test - df.mean()) / df.std()

# Boolean row mask: True where at least one column exceeds the threshold.
outliers = (np.abs(z_scores) > z_threshold).any(axis=1)

# Positional x-axis (0 .. len(df_filled_na_test) - 1) used for interpolation.
indices = np.arange(len(df_filled_na_test))

# One linear interpolant per column, fitted on the non-outlier rows only.
# Positions outside the range of the kept rows are linearly extrapolated.
interp_funcs = {
    column: interp1d(
        indices[~outliers],
        df_filled_na_test.loc[~outliers, column],
        kind="linear",
        fill_value="extrapolate",
    )
    for column in df_filled_na_test.columns
}

# Evaluate each interpolant at every row position and restore the original index.
data_interp = pd.DataFrame(
    {column: interp_funcs[column](indices) for column in df_filled_na_test.columns},
    index=df_filled_na_test.index,
)

df_ro = data_interp

# Number of rows flagged as outliers.
num_outliers = np.sum(outliers)
print(f"Number of outliers detected: {num_outliers}")

# Summary statistics of the reconstructed data. This is deliberately the last
# expression of the cell: a notebook only renders the final expression, so the
# table was never shown when it sat before the print statement.
df_ro_sum = df_ro.describe().round(2)
df_ro_sum.transpose()


In [135]:
# Name the imputed, outlier-reconstructed test frame for export.
X_test_Imputed_ROutlier = df_ro

In [136]:
# Write the final test frame to disk (note the upper-case ".CSV" extension).
X_test_Imputed_ROutlier.to_csv("X_test_Imputed_ROutlier.CSV")