# Importing Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scienceplots


import sklearn
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.preprocessing import *
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from scipy import stats

import tensorflow as tf
import keras_tuner


import missingno
import warnings
import gc

# Plotting Configuration

In [2]:
# Start from matplotlib's built-in defaults so re-running this cell always
# yields the same configuration.
plt.rcdefaults()

# Apply the style sheets FIRST. `plt.style.use` writes each style's values
# into rcParams, so any override set before this call (figure size, font
# sizes, line widths, ...) would be replaced by the style's own values.
# The 'science'/'nature'/... styles are presumably registered by the
# `scienceplots` import at the top of the notebook.
plt.style.use(['science', 'nature', 'high-contrast', "no-latex"])

# Project-wide overrides, applied on top of the style sheets so they win.
mpl_global_config = {
    'figure.figsize': (7, 7),
    'figure.dpi': 1000,
    'font.size': 16,
    'axes.labelsize': 14,
    'axes.titlesize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 8,
    'lines.linewidth': 2,
    'lines.markersize': 3,
    'grid.linewidth': 0.75,
    'savefig.dpi': 1000,
    'savefig.transparent': False,
    'savefig.bbox': 'tight',
    'pdf.compression': 9,
    'axes.axisbelow': True
}
plt.rcParams.update(mpl_global_config)


# Named hex colour codes for use in individual plot calls.
colors = {
    "yellow": "#DDAA33",
    "red": "#BB5566",
    "blue": "#004488",
    "black": "#000000",
    "white": "#FFFFFF"
}

# Global Configuration Settings Controlling Randomness, Trials, etc.

In [3]:
# --- Tunable constants -----------------------------------------------------
SEED = 42
n_trials = 50  # presumably the hyper-parameter search budget -- confirm

# --- Reproducibility -------------------------------------------------------
# Seed through the Keras helper so a single call fixes the run's randomness.
tf.keras.utils.set_random_seed(SEED)

# --- Library behaviour -----------------------------------------------------
# Make scikit-learn transformers return pandas objects instead of arrays.
sklearn.set_config(transform_output="pandas")
# Do not report floating-point underflow from NumPy operations.
np.seterr(under='ignore')
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation and convergence warnings.
warnings.filterwarnings('ignore')

# Read the Original (Raw) Data

In [None]:
# Load the raw dataset: the "Date" column becomes the index and is parsed
# as dates with the day written first.
Originaldf = pd.read_csv(
    "OriginalData.csv", index_col="Date", parse_dates=True, dayfirst=True
)

# NOTE(review): later cells address the target column with the literal
# "Total Biogas Flowrate (m3/d)" rather than this constant -- confirm which
# name is the intended one.
TARGET = "Total Biogas Flowrate"

print(f"shape after reading data: {Originaldf.shape}")

In [None]:
# Preview the first rows of the raw frame as a quick sanity check.
Originaldf.head()

In [None]:
# List the column names available in the raw data.
Originaldf.columns

# Missing Values

In [None]:
# Count missing entries per column and show them as a bar chart.
missing_values = Originaldf.isnull().sum()

# Explicit figure/axes instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(8, 6))
missing_values.plot(kind='bar', ax=ax)

# Title and axis labels with explicit font sizes.
ax.set_title('Missing Values by Column', fontsize=16)
ax.set_xlabel('Column', fontsize=10)
ax.set_ylabel('Number of Missing Values', fontsize=14)

# Tick label size and orientation.
ax.tick_params(axis='x', labelrotation=90, labelsize=12)
ax.tick_params(axis='y', labelsize=12)

# Finalise the layout BEFORE saving so the file on disk matches what is
# displayed; previously the figure was saved before the layout pass ran.
fig.tight_layout()
fig.savefig("Missing Data.svg", format='svg', dpi=1000, bbox_inches='tight')

plt.show()

# Shapiro-Wilk and Kolmogorov-Smirnov Tests

In [None]:
# Normality screening: run the Shapiro-Wilk and Kolmogorov-Smirnov tests on
# every column of the raw data and write one summary row per column to CSV.

alpha = 0.05  # significance level shared by both tests

# A p-value >= alpha only means normality could not be rejected, so both
# tests use the same cautious wording.
not_gaussian_msg = "data distribution is not Gaussian"
maybe_gaussian_msg = "data distribution may be Gaussian"

# One list per column; every row has exactly seven entries so it lines up
# with the seven output columns below.
results = []

for column in Originaldf.columns:
    # The tests need numeric input; record a placeholder row otherwise.
    if not pd.api.types.is_numeric_dtype(Originaldf[column]):
        results.append([column, None, None, None, None, "Not numeric", "Not numeric"])
        continue

    # Drop NaN values for analysis
    data = Originaldf[column].dropna()

    # Skip columns with fewer than three observations.
    if len(data) < 3:
        results.append([column, None, None, None, None, "Not enough data", "Not enough data"])
        continue

    # Parameters of the reference normal distribution for the K-S test.
    mean_value = data.mean()
    std_value = data.std(ddof=0)  # population std, same value np.std(data) gives

    # A zero spread makes the reference normal degenerate (scale == 0).
    if std_value == 0:
        results.append([column, None, None, None, None, "Constant data", "Constant data"])
        continue

    # Shapiro-Wilk Test
    shapiro_stat, shapiro_p = stats.shapiro(data)

    # Kolmogorov-Smirnov Test against N(mean_value, std_value).
    # NOTE(review): mean and std are estimated from the same sample being
    # tested, which biases the standard K-S p-value; a Lilliefors-type
    # correction may be more appropriate -- confirm.
    ks_stat, ks_p = stats.kstest(data, 'norm', args=(mean_value, std_value))

    # Interpretation of p-values
    shapiro_result = not_gaussian_msg if shapiro_p < alpha else maybe_gaussian_msg
    ks_result = not_gaussian_msg if ks_p < alpha else maybe_gaussian_msg

    results.append([column, shapiro_stat, shapiro_p, ks_stat, ks_p, shapiro_result, ks_result])

# Create a DataFrame from the results
results_df = pd.DataFrame(
    results,
    columns=[
        'Column',
        'Shapiro Statistic',
        'Shapiro p-value',
        'K-S Statistic',
        'K-S p-value',
        'Shapiro Result',
        'K-S Result',
    ],
)

# Save the results to a CSV file
results_df.to_csv('statistical_tests_results.csv', index=False)

print("Results saved to statistical_tests_results.csv")

# Visualize Original Data

In [None]:
# Plot every column of the raw data against the Date index in a
# two-column grid of line plots.
num_variables = len(Originaldf.columns)  # Number of variables to plot
num_rows = (num_variables + 1) // 2  # Rows needed for a 2-column grid

# squeeze=False keeps `axes` two-dimensional even when only one row is
# needed, so the [row, col] indexing below always works.
fig, axes = plt.subplots(num_rows, 2, figsize=(13, 3 * num_rows), squeeze=False)

for i, column in enumerate(Originaldf.columns):
    ax = axes[i // 2, i % 2]  # Row-major position of the i-th panel
    ax.plot(Originaldf[column])
    ax.set_title(column)
    ax.set_xlabel('Date')
    ax.set_ylabel(column)
    ax.tick_params(axis='x', rotation=45)  # Rotate x-axis labels by 45 degrees

# With an odd number of columns the last grid cell is unused; hide it
# instead of showing an empty frame.
for unused_ax in axes.flat[num_variables:]:
    unused_ax.set_visible(False)

# tight_layout computes the spacing between panels (it overrides any manual
# subplots_adjust), and it must run BEFORE saving so the file on disk
# matches the displayed figure.
fig.tight_layout()
fig.savefig("Original Data.svg", format='svg', dpi=1000, bbox_inches='tight')
plt.show()

# Splitting Dataset for Preprocessing

In [None]:
# Re-display the column names before the target column is selected.
Originaldf.columns

In [11]:
# Remove rows whose target value is missing. This rebinds `Originaldf`, so
# the unfiltered frame is no longer available after this cell runs.
Originaldf = Originaldf.dropna(subset=["Total Biogas Flowrate (m3/d)"])


In [None]:
# Frame dimensions after rows with a missing target were removed.
Originaldf.shape

In [None]:
# Separate the target column from the feature columns.
# `pop` removes the target from `Originaldf` in place, so after this cell
# `Originaldf` holds the same columns as `X`.
# NOTE(review): because of the in-place `pop`, re-running this cell without
# re-running the cells above raises KeyError (the column is already gone).
y = Originaldf.pop("Total Biogas Flowrate (m3/d)")

# Independent copy of the remaining (feature) columns.
X = Originaldf.copy()

print(
    f"dataframe shape: {Originaldf.shape}\n"
    f"features shape: {X.shape}\n"
    f"target shape: {y.shape}"
)

In [None]:
# Separate train samples from test samples: 70 % of the rows go to the
# training set, the remainder to the test set; SEED fixes the assignment.
# NOTE(review): the frame is indexed by Date and this split does not keep
# chronological order unless shuffling is disabled -- confirm that a random
# split is intended for this data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.70,
    random_state=SEED
)

# Report the resulting shapes once (the statement was previously duplicated).
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [15]:
# Persist the feature splits and the full target series as CSV files
# (index included, as with the default `to_csv` settings).
# NOTE(review): y_train / y_test are not written; presumably downstream code
# re-aligns `y.csv` with the feature files through the index -- confirm.
for csv_name, frame in (
    ("X_train.csv", X_train),
    ("X_test.csv", X_test),
    ("y.csv", y),
):
    frame.to_csv(csv_name)
