In [None]:
# !pip install spektral
# !pip install tensorflow_probability
# !pip uninstall -y tensorflow
# !pip install --user tensorflow==2.14.0


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# import tensorflow as tf
from spektral.layers import ChebConv, GCNConv
from tensorflow.keras.layers import LSTM, Dense, Input, Concatenate, TimeDistributed, Conv1D, LayerNormalization
from tensorflow.keras.models import Model

In [None]:
import sys
print(sys.version)

In [None]:
import pandas as pd
import os

def read_all_csv(folder_path):
    """Load every ``.csv`` file directly inside ``folder_path`` into one DataFrame.

    Parameters
    ----------
    folder_path : str or path-like
        Directory to scan. Sub-directories are not searched.

    Returns
    -------
    pandas.DataFrame
        The rows of all CSV files stacked in sorted file-name order with a
        fresh 0..n-1 index. An empty DataFrame when the folder has no CSV files.
    """
    # Sort so the row order does not depend on the OS directory-listing order.
    csv_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))
    if not csv_names:
        # pd.concat raises on an empty list; keep the "empty frame" result instead.
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
    # every iteration); read all files first, then concatenate once.
    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]
    return pd.concat(frames, ignore_index=True)

# Directory that holds the metrics CSV files; change this path to point at your own data.
folder_path = '/metrics'
df = read_all_csv(folder_path)

In [None]:
df.columns,df.shape

In [None]:
# errors='ignore' keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:


# Columns to standardise: the 18 workload metrics plus the two utilisation columns.
_columns_to_scale = [
    'providerrpc_rt', 'providerrpc_mcr',
    'consumerrpc_rt', 'consumerrpc_mcr',
    'writemc_rt', 'writemc_mcr',
    'readmc_rt', 'readmc_mcr',
    'writedb_rt', 'writedb_mcr',
    'readdb_rt', 'readdb_mcr',
    'consumermq_rt', 'consumermq_mcr',
    'providermq_rt', 'providermq_mcr',
    'http_mcr', 'http_rt',
    'cpu_utilization', 'memory_utilization',
]

# Fit a zero-mean / unit-variance scaler on those columns and keep the scaled matrix.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[_columns_to_scale])

In [None]:
len(df.columns)

In [None]:
class CPUUtilizationEstimator:
    """Small Keras network that outputs a Normal distribution over CPU utilization.

    The network has one shared hidden layer and two single-unit heads: one for
    the distribution's location (mean) and one for its unconstrained scale.
    The model output is a ``tfd.MultivariateNormalDiag`` with event size 1, so
    it can be trained with a negative log-likelihood loss.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    hidden_dim : int
        Width of the shared hidden layer.
    """

    def __init__(self, input_dim, hidden_dim):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.model = self.build_model()

    def build_model(self):
        """Build and return the Keras ``Model`` mapping features to a distribution."""
        # Input layer
        inputs = Input(shape=(self.input_dim,))

        # Shared fully-connected layer
        x = Dense(self.hidden_dim, activation='relu')(inputs)

        # Separate heads for the mean and the *unconstrained* scale.
        mean = Dense(1, activation='linear')(x)
        # Kept linear on purpose: positivity is enforced exactly once, by the
        # softplus inside the distribution below. Applying softplus here as
        # well would floor the scale at softplus(0) = ln 2 ~= 0.69, so the model
        # could never express a narrow predictive distribution.
        raw_scale = Dense(1, activation='linear')(x)

        # MultivariateNormalDiag layer
        distribution_params = tf.keras.layers.Concatenate()([mean, raw_scale])
        distribution = tfp.layers.DistributionLambda(
            make_distribution_fn=lambda t: tfd.MultivariateNormalDiag(
                loc=t[..., :1],
                # softplus maps the raw head to a positive standard deviation;
                # epsilon guards against an exactly-zero scale.
                scale_diag=tf.math.softplus(t[..., 1:]) + tf.keras.backend.epsilon()
            ),
            # When Keras needs a plain tensor (e.g. in predict()) use the
            # distribution mean; the default is a random sample, which would
            # make estimate() non-deterministic.
            convert_to_tensor_fn=tfd.Distribution.mean,
        )(distribution_params)

        return Model(inputs=inputs, outputs=distribution)

    def estimate(self, workload_metrics):
        """Return the predicted mean CPU utilization for each input row.

        Parameters
        ----------
        workload_metrics : array-like, shape (n_samples, input_dim)
            Feature matrix prepared the same way as the training inputs.

        Returns
        -------
        numpy.ndarray
            Output of ``model.predict`` (one value per row).
        """
        return self.model.predict(workload_metrics)

# Usage
# Input features for the estimator. 'cpu_utilization' is the prediction target,
# so it is deliberately NOT among the inputs: feeding the target in as a feature
# lets the model copy the answer and makes every reported metric meaningless.
feature_columns = ['providerrpc_rt',
       'providerrpc_mcr', 'consumerrpc_rt', 'consumerrpc_mcr', 'writemc_rt',
       'writemc_mcr', 'readmc_rt', 'readmc_mcr', 'writedb_rt', 'writedb_mcr',
       'readdb_rt', 'readdb_mcr', 'consumermq_rt', 'consumermq_mcr',
       'providermq_rt', 'providermq_mcr', 'http_mcr', 'http_rt',
       'memory_utilization']
input_dim = len(feature_columns)  # Derived from the list so the two cannot drift apart
hidden_dim = 128  # This can be tuned

estimator = CPUUtilizationEstimator(input_dim, hidden_dim)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
# Negative log-likelihood of the targets under the predicted distribution.
loss_fn = lambda y, model: -model.log_prob(y)

target = df['cpu_utilization']
y = target.values
X_unscaled = df[feature_columns].values
X_train, X_val, y_train, y_val = train_test_split(X_unscaled, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it everywhere, so no
# statistics from the validation rows leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X = scaler.transform(X_unscaled)

# Estimate from the still-untrained model on the full scaled feature matrix.
workload_metrics = np.array(X)
estimated_cpu_utilization = estimator.estimate(workload_metrics)

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train.astype(np.float32), y_train.astype(np.float32))).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val.astype(np.float32), y_val.astype(np.float32))).batch(32)

# Training Loop
# Each step minimises the mean negative log-likelihood of the batch; after every
# epoch the per-sample mean NLL over the whole validation set is reported.
epochs = 10  # Number of epochs
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    # Training
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            logits = estimator.model(x_batch_train, training=True)
            # Targets must be (batch, 1) to match the distribution's event shape.
            y_batch_train_reshaped = tf.reshape(y_batch_train, [-1, 1])

            # Reduce to a scalar *inside* the tape. Differentiating the
            # per-example vector would implicitly sum it, so the step size
            # would depend on the batch size and the optimised quantity would
            # differ from the mean that is logged below.
            loss_value = tf.reduce_mean(loss_fn(y_batch_train_reshaped, logits))
        grads = tape.gradient(loss_value, estimator.model.trainable_weights)
        optimizer.apply_gradients(zip(grads, estimator.model.trainable_weights))

        if step % 200 == 0:
            print("Training loss (for one batch) at step %d: %.4f" % (step, float(loss_value)))


    # Validation
    val_loss = []      # per-batch mean NLL, kept for inspection
    val_nll_sum = 0.0  # running sum of per-sample NLL
    val_count = 0      # number of validation samples seen
    for x_batch_val, y_batch_val in val_dataset:
        val_logits = estimator.model(x_batch_val, training=False)

        # Reshape y_batch_val to match val_logits shape
        y_batch_val_reshaped = tf.reshape(y_batch_val, [-1, 1])

        val_loss_value = loss_fn(y_batch_val_reshaped, val_logits)
        val_loss.append(float(tf.reduce_mean(val_loss_value)))
        val_nll_sum += float(tf.reduce_sum(val_loss_value))
        val_count += int(tf.size(val_loss_value))

    # Average over samples rather than over batches so a short final batch is
    # not over-weighted; max(..., 1) avoids dividing by zero on an empty set.
    val_loss_mean = val_nll_sum / max(val_count, 1)
    print("Validation loss: %.4f" % val_loss_mean)

In [None]:
# Evaluate the in-memory `estimator` on the held-out validation split.
import matplotlib.pyplot as plt

y_eval = y_val
# The model expects float32 inputs; targets become a column to match the predictions.
X_eval = tf.convert_to_tensor(X_val, dtype=tf.float32)
y_eval_reshaped = tf.reshape(y_eval, [-1, 1])

# Calling the model returns a distribution; its mean is used as the point prediction.
predictions = estimator.model(X_eval)
predicted_cpu_utilization = predictions.mean()

# Point-prediction error metrics.
mse = tf.keras.losses.MeanSquaredError()
mae = tf.keras.losses.MeanAbsoluteError()
mse_value = mse(y_eval_reshaped, predicted_cpu_utilization)
mae_value = mae(y_eval_reshaped, predicted_cpu_utilization)

print("Mean Squared Error on Evaluation Data:", mse_value.numpy())
print("Mean Absolute Error on Evaluation Data:", mae_value.numpy())

# Overlay actual and predicted values, in validation-row order.
plt.figure(figsize=(10, 6))
for series, series_label in (
    (y_eval, 'Actual CPU Utilization'),
    (predicted_cpu_utilization.numpy(), 'Predicted CPU Utilization'),
):
    plt.plot(series, label=series_label)
plt.title('CPU Utilization: Actual vs Predicted')
plt.xlabel('Samples')
plt.ylabel('CPU Utilization')
plt.legend()
plt.show()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
def _plot_series(series, series_name, series_index=0, x_col='timestamp', y_col='providerrpc_rt'):
  """Draw one line of ``series[y_col]`` against ``series[x_col]`` on the current axes.

  Parameters
  ----------
  series : pandas.DataFrame
      Frame holding both columns; rows are plotted in their current order.
  series_name : str
      Legend label for the line.
  series_index : int, default 0
      Position in the 'Dark2' palette (wraps around) used for the line colour.
  x_col, y_col : str
      Column names for the x and y values. The defaults keep the previous
      hard-coded behaviour ('timestamp' vs 'providerrpc_rt').
  """
  from matplotlib import pyplot as plt
  import seaborn as sns
  palette = list(sns.palettes.mpl_palette('Dark2'))
  xs = series[x_col]
  ys = series[y_col]

  plt.plot(xs, ys, label=series_name, color=palette[series_index % len(palette)])

# Plot providerrpc_rt over time.
# `_df_8` was a Colab output-cache name that does not exist on a fresh kernel;
# the metrics frame loaded above (`df`) is used instead.
fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')
df_sorted = df.sort_values('timestamp', ascending=True)
_plot_series(df_sorted, '')
sns.despine(fig=fig, ax=ax)
plt.xlabel('timestamp')
_ = plt.ylabel('providerrpc_rt')

In [None]:
from matplotlib import pyplot as plt
# `_df_15` was a Colab output-cache name that does not exist on a fresh kernel;
# plot the column straight from the metrics frame instead.
df['consumerrpc_rt'].plot(kind='line', figsize=(8, 4), title='consumerrpc_rt')
plt.gca().spines[['top', 'right']].set_visible(False)

In [None]:
num_features = 20  # Number of workload metrics
num_graph_nodes = 17303  # Number of nodes in your graph
seq_length = 30  # Length of the temporal sequence


def create_stgnn_model(num_features, num_graph_nodes, seq_length):
    """Assemble a Keras model that maps (node features, adjacency) to one value.

    Parameters
    ----------
    num_features : int
        Number of features per node per time step.
    num_graph_nodes : int
        Number of graph nodes (also the side length of the adjacency matrix).
    seq_length : int
        Number of time steps in the node-feature input.

    Returns
    -------
    tensorflow.keras.Model
        Model with inputs ``[node_features, adjacency]`` (in that order) and a
        single-unit dense output.
    """
    # Inputs: dense adjacency matrix and the node-feature time series.
    adjacency_in = Input(shape=(num_graph_nodes, num_graph_nodes))
    node_features_in = Input(shape=(seq_length, num_graph_nodes, num_features))

    # Per-time-step dense embedding, then an average over the time axis (axis 1).
    embedded = TimeDistributed(Dense(32, activation='relu'))(node_features_in)
    time_averaged = tf.reduce_mean(embedded, axis=1)

    # Graph convolution over the averaged node embeddings.
    gcn_out = GCNConv(32, activation='relu')([time_averaged, adjacency_in])

    # NOTE(review): the time axis has already been averaged away above, so by
    # the declared input shapes the Conv1D and LSTM below run along the node
    # axis, not along time — confirm this is the intended architecture.
    conv_layer = Conv1D(filters=16, kernel_size=3, activation='relu')
    conv_out = conv_layer(gcn_out)
    normalized = LayerNormalization()(conv_out)

    # Collapse the remaining sequence axis to a 64-unit summary.
    summary = LSTM(64, return_sequences=False)(normalized)

    # Single regression output.
    prediction = Dense(1)(summary)

    return Model(inputs=[node_features_in, adjacency_in], outputs=prediction)


# Build the model with the constants above and compile it for MSE regression.
model = create_stgnn_model(num_features, num_graph_nodes, seq_length)
model.compile(optimizer='adam', loss='mse')  # Adjust based on your needs


In [None]:
num_nodes = 10  # Assuming you have 10 nodes
A = np.eye(num_nodes)  # Identity matrix as a placeholder

# Columns used as node features
features = ['providerrpc_rt',
       'providerrpc_mcr', 'consumerrpc_rt', 'consumerrpc_mcr', 'writemc_rt',
       'writemc_mcr', 'readmc_rt', 'readmc_mcr', 'writedb_rt', 'writedb_mcr',
       'readdb_rt', 'readdb_mcr', 'consumermq_rt', 'consumermq_mcr',
       'providermq_rt', 'providermq_mcr', 'http_mcr', 'http_rt',
       'cpu_utilization', 'memory_utilization']  # Adjust this based on your actual data columns

# Normalize features (select via `features` so the list and the matrix cannot drift apart)
scaler = StandardScaler()
X = scaler.fit_transform(df[features])

num_features = len(features)

# You need to define seq_length based on your dataset's structure
seq_length = 1  # Time steps per sample
# Derive the sample count from the data instead of a hard-coded row count, so
# the cell keeps working when the dataset changes size.
num_graph_nodes = X.shape[0] // seq_length

# Verify that the reshape is feasible (fails when the row count is not a
# multiple of seq_length).
total_elements = X.size
required_elements = seq_length * num_graph_nodes * num_features


print(total_elements,required_elements)
if total_elements != required_elements:
    raise ValueError("Cannot reshape: The total number of elements doesn't match the target shape.")

# Keep each original row together as one (seq_length, num_features) sample.
# Reshaping to (num_features, num_graph_nodes) would merely re-chunk the
# row-major buffer (it is not a transpose), mixing different features and rows
# on one axis, and the split below would then run over that scrambled axis.
X_reshaped = X.reshape(num_graph_nodes, seq_length, num_features)

# Split the dataset into training and validation sets (the targets are the inputs themselves)

X_train, X_val,y_train, y_val= train_test_split(X_reshaped, X_reshaped, test_size=0.2, random_state=42)

In [None]:
df.columns

In [None]:
X_reshaped.shape

In [None]:
len(df['msinstanceid'].unique())

In [None]:
df.head()

In [None]:
df.shape

In [None]:
new_df = pd.DataFrame()

In [None]:
new_df['msinstanceid'] = df['msinstanceid'].unique()

In [None]:
# index=False: the positional index is not data. Writing it makes the file come
# back from read_csv with a stray 'Unnamed: 0' column.
new_df.to_csv('ReqMSInstanceIDs.csv', index=False)
combinedDf = pd.DataFrame()

In [None]:
# Instance-id column of the metrics frame. The previous empty-list
# initialisation was immediately overwritten, so it has been removed.
msInstaces = df['msinstanceid']

In [None]:
call_df0 = pd.read_csv('CallGraph_0.csv')

In [None]:
# Keep rows whose downstream ('dminstance') and upstream ('uminstance') instances both appear in the msInstance list.
# NOTE(review): `df_data` and `df_msInstance` are not defined anywhere in the visible notebook, and the
# first mask is built from `call_df0` while the frame being indexed is `df_data` — confirm which
# frame and column names were intended before relying on this cell.
filtered_df = df_data[call_df0['dminstance'].isin(df_msInstance['msInstance']) & df_data['uminstance'].isin(df_msInstance['msInstance'])]

In [None]:
call_df0.columns

In [None]:
call_df = pd.read_csv("FinalCombinedDf.csv")
reqMS_df = pd.read_csv("ReqMSInstanceIDs.csv")

In [None]:
(call_df['uminstanceid'].value_counts())

In [None]:
(call_df['dminstanceid'].value_counts())

In [None]:
reqMS_df['msinstanceid']

In [None]:
# `input_features` is not defined in this notebook; the feature-name list is called `features`.
len(features),len(df),X.shape

In [None]:
# View both splits as (samples, seq_length, num_features); -1 lets NumPy infer the sample count.
# NOTE(review): this raises ValueError when a split's element count is not a multiple of
# seq_length * num_features — verify against the shapes produced by the split cell.
X_train = X_train.reshape(-1, seq_length, num_features)
X_val = X_val.reshape(-1, seq_length, num_features)


In [None]:
df.columns

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.model_selection import train_test_split
# Assuming you have a Pandas DataFrame called 'df' with your dataset

# Interpret the timestamp column as epoch seconds and order the rows chronologically.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df = df.sort_values(by=['timestamp'])

# Seconds elapsed since the previous row; the first row has no predecessor and gets 0.
# NOTE(review): the difference is taken across all rows regardless of instance — confirm that is intended.
df['time_diff'] = df['timestamp'].diff().dt.total_seconds().fillna(0)

# Define the STGNN model
class STGNN(nn.Module):
    """LSTM encoder followed by a linear read-out layer.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Size of the LSTM hidden state.
    output_size : int
        Number of values produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the LSTM over ``x`` and project the result.

        For a 3-D LSTM output only the last time step of each sequence is
        projected; for a 2-D output every row is projected.
        NOTE(review): per the ``nn.LSTM`` documentation a 2-D input is treated
        as one unbatched sequence, so rows of a 2-D batch are processed as
        consecutive time steps — confirm that is what callers intend.
        """
        lstm_out = self.lstm(x)[0]
        encoded = lstm_out[:, -1, :] if lstm_out.dim() == 3 else lstm_out
        return self.fc(encoded)

# Convert data to PyTorch tensors
metric_columns = ['providerrpc_rt', 'providerrpc_mcr', 'consumerrpc_rt', 'consumerrpc_mcr',
                  'writemc_rt', 'writemc_mcr', 'readmc_rt', 'readmc_mcr',
                  'writedb_rt', 'writedb_mcr', 'readdb_rt', 'readdb_mcr',
                  'consumermq_rt', 'consumermq_mcr', 'providermq_rt', 'providermq_mcr',
                  'http_mcr', 'http_rt']
X = torch.tensor(df[metric_columns].values, dtype=torch.float32)
# The target currently uses the same columns as the input; change the target metric as needed.
y = torch.tensor(df[metric_columns].values, dtype=torch.float32)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X.numpy(), y.numpy(), test_size=0.2, random_state=42)

# Convert back to PyTorch tensors
X_train, X_test, y_train, y_test = map(torch.tensor, (X_train, X_test, y_train, y_test))

# Create a DataLoader for training and testing sets
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Instantiate the model, loss function, and optimizer for THIS cell's tensors.
# Without this the loop below picks up whatever `model` / `optimizer` happen to
# be left in the kernel (earlier cells bind those names to Keras/TensorFlow
# objects) and `criterion` is not defined at all.
model = STGNN(input_size=X.shape[1], hidden_size=64, output_size=y.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Evaluate the model on the test set
    model.eval()
    with torch.no_grad():
        test_loss = 0.0
        for inputs, labels in test_dataloader:
            outputs = model(inputs)
            test_loss += criterion(outputs, labels).item()

        # Mean of the per-batch losses; the printed train loss is the last training batch's loss.
        average_test_loss = test_loss / len(test_dataloader)
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {loss.item()}, Test Loss: {average_test_loss}')


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Assuming you have already defined your model, criterion, and optimizer

# Convert data to PyTorch tensors
metric_columns = ['providerrpc_rt', 'providerrpc_mcr', 'consumerrpc_rt', 'consumerrpc_mcr',
                  'writemc_rt', 'writemc_mcr', 'readmc_rt', 'readmc_mcr',
                  'writedb_rt', 'writedb_mcr', 'readdb_rt', 'readdb_mcr',
                  'consumermq_rt', 'consumermq_mcr', 'providermq_rt', 'providermq_mcr',
                  'http_mcr', 'http_rt']
X = torch.tensor(df[metric_columns].values, dtype=torch.float32)

# Single-column target.
# NOTE(review): 'providerrpc_rt' is also one of the input columns, so the
# target is directly visible to the model — confirm this is intended.
y = torch.tensor(df[['providerrpc_rt']].values, dtype=torch.float32)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X.numpy(), y.numpy(), test_size=0.1, random_state=42)

# Convert back to PyTorch tensors
X_train, X_test, y_train, y_test = map(torch.tensor, (X_train, X_test, y_train, y_test))

# Create a DataLoader for training and testing sets
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Build a model sized for this cell's tensors instead of reusing whatever
# `model` is left in the kernel: the previous cell's target has 18 columns,
# whereas this cell's target has one, so a leftover model would produce
# outputs whose shape does not match the labels.
model = STGNN(input_size=X.shape[1], hidden_size=64, output_size=y.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
all_actual_values = []     # one array of test labels per epoch
all_predicted_values = []  # one array of test predictions per epoch

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Evaluate the model on the test set
    model.eval()
    with torch.no_grad():
        test_loss = 0.0
        batch_actual_values = []
        batch_predicted_values = []

        for inputs, labels in test_dataloader:
            outputs = model(inputs)

            # Store actual and predicted values for plotting
            batch_actual_values.extend(labels.numpy())
            batch_predicted_values.extend(outputs.numpy())

            test_loss += criterion(outputs, labels).item()

        # Mean of the per-batch losses; the printed train loss is the last training batch's loss.
        average_test_loss = test_loss / len(test_dataloader)
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {loss.item()}, Test Loss: {average_test_loss}')

        # Store actual and predicted values for each epoch
        all_actual_values.append(np.array(batch_actual_values))
        all_predicted_values.append(np.array(batch_predicted_values))

# Stack the per-epoch arrays into single arrays of actual and predicted values.
# NOTE(review): the lists hold one entry per epoch, so this mixes predictions
# from every epoch (including early, barely-trained ones) — use only the last
# entry if the plot should reflect the final model.
actual_values = np.concatenate(all_actual_values, axis=0)
predicted_values = np.concatenate(all_predicted_values, axis=0)
# Report the size only after the array exists (printing it first raised
# NameError on a fresh kernel).
print(len(actual_values))

# Sampling without replacement cannot draw more points than are available.
subsample_size = min(100, len(actual_values))
indices = np.random.choice(len(actual_values), size=subsample_size, replace=False)

# Plot actual and predicted values as line graphs
plt.figure(figsize=(10, 6))
plt.plot(actual_values[indices], label='Actual', marker='o')
plt.plot(predicted_values[indices], label='Predicted', marker='o')
plt.title('Actual vs Predicted Values (Subsampled)')
plt.xlabel('Index')
plt.ylabel('Values')
plt.legend()
plt.show()
# Plot actual vs predicted values
# plt.figure(figsize=(10, 6))
# plt.plot(actual_values, label='Actual', marker='o')
# plt.plot(predicted_values, label='Predicted', marker='o', linestyle='--')
# plt.title('Actual vs Predicted Values')
# plt.xlabel('Index')
# plt.ylabel('Values')
# plt.legend()
# plt.show()


In [None]:
import pandas as pd

# Parse the timestamp column as datetimes.
# NOTE(review): an earlier cell parses this column with unit='s'; without a unit,
# pandas reads numeric values as nanoseconds by default — confirm the column is
# already datetime (or a date string) when this cell runs.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Order the rows chronologically.
df = df.sort_values(by=['timestamp'])

# Seconds elapsed since the previous row; the first row has no predecessor and gets 0.
df['time_diff'] = df['timestamp'].diff().dt.total_seconds().fillna(0)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Works on the Pandas DataFrame `df` that holds the dataset.

# Parse the timestamp column as datetimes.
# NOTE(review): an earlier cell parses this column with unit='s'; without a unit,
# pandas reads numeric values as nanoseconds by default — confirm the column is
# already datetime (or a date string) when this cell runs.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Order the rows chronologically.
df = df.sort_values(by=['timestamp'])

# Seconds elapsed since the previous row; the first row has no predecessor and gets 0.
df['time_diff'] = df['timestamp'].diff().dt.total_seconds().fillna(0)

# Define the STGNN model
class STGNN(nn.Module):
    """LSTM encoder followed by a linear read-out layer.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Size of the LSTM hidden state.
    output_size : int
        Number of values produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the LSTM over ``x`` and project the result.

        For a 3-D LSTM output only the last time step of each sequence is
        projected; for a 2-D output every row is projected.
        NOTE(review): per the ``nn.LSTM`` documentation a 2-D input is treated
        as one unbatched sequence, so rows of a 2-D batch are processed as
        consecutive time steps — confirm that is what callers intend.
        """
        lstm_out = self.lstm(x)[0]
        encoded = lstm_out[:, -1, :] if lstm_out.dim() == 3 else lstm_out
        return self.fc(encoded)

# Build the input matrix (time gap + 18 workload metrics) and the single-column target.
# NOTE(review): 'providerrpc_rt' appears in both the inputs and the target, and the
# target is not shifted in time — confirm this matches the intended prediction task.
input_columns = [
    'time_diff',
    'providerrpc_rt', 'providerrpc_mcr', 'consumerrpc_rt', 'consumerrpc_mcr',
    'writemc_rt', 'writemc_mcr', 'readmc_rt', 'readmc_mcr',
    'writedb_rt', 'writedb_mcr', 'readdb_rt', 'readdb_mcr',
    'consumermq_rt', 'consumermq_mcr', 'providermq_rt', 'providermq_mcr',
    'http_mcr', 'http_rt',
]
X = torch.tensor(df[input_columns].values, dtype=torch.float32)
y = torch.tensor(df[['providerrpc_rt']].values, dtype=torch.float32)  # Change the target metric as needed

# Shuffled mini-batches over the full dataset (no hold-out split in this cell).
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model sized from the input width, MSE loss, Adam optimizer.
model = STGNN(input_size=X.shape[1], hidden_size=64, output_size=1)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Fit for a fixed number of epochs; the printed loss is that of the epoch's last batch.
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# Now you can use the trained model to make predictions for the next timestamp


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Assuming you have already defined your model, criterion, and optimizer

# Build the input matrix (time gap + 18 workload metrics) and the single-column target.
input_columns = [
    'time_diff',
    'providerrpc_rt', 'providerrpc_mcr', 'consumerrpc_rt', 'consumerrpc_mcr',
    'writemc_rt', 'writemc_mcr', 'readmc_rt', 'readmc_mcr',
    'writedb_rt', 'writedb_mcr', 'readdb_rt', 'readdb_mcr',
    'consumermq_rt', 'consumermq_mcr', 'providermq_rt', 'providermq_mcr',
    'http_mcr', 'http_rt',
]
X = torch.tensor(df[input_columns].values, dtype=torch.float32)
y = torch.tensor(df[['providerrpc_rt']].values, dtype=torch.float32)

# 90/10 random split done on NumPy arrays, then converted back to tensors.
X_train, X_test, y_train, y_test = (
    torch.tensor(part)
    for part in train_test_split(X.numpy(), y.numpy(), test_size=0.1, random_state=42)
)

# Data loaders: shuffled for training, fixed order for testing.
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Continue training the existing `model` with the existing `criterion` and
# `optimizer`, recording test labels and predictions after every epoch.
num_epochs = 100
all_actual_values = []     # one array of test labels per epoch
all_predicted_values = []  # one array of test predictions per epoch

for epoch in range(num_epochs):
    # --- training pass ---
    model.train()
    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # --- evaluation pass (no gradients) ---
    model.eval()
    with torch.no_grad():
        test_loss = 0.0
        batch_actual_values = []
        batch_predicted_values = []

        for inputs, labels in test_dataloader:
            outputs = model(inputs)

            # Collect row-wise labels and predictions for plotting.
            batch_actual_values.extend(labels.numpy())
            batch_predicted_values.extend(outputs.numpy())

            test_loss += criterion(outputs, labels).item()

        # Mean of the per-batch test losses for this epoch.
        average_test_loss = test_loss / len(test_dataloader)

        # Keep this epoch's labels and predictions.
        all_actual_values.append(np.array(batch_actual_values))
        all_predicted_values.append(np.array(batch_predicted_values))

# Stack the per-epoch arrays into single arrays of actual and predicted values.
# NOTE(review): the lists hold one entry per epoch, so this mixes predictions
# from every epoch (including early, barely-trained ones) — use only the last
# entry if the plot should reflect the final model.
actual_values = np.concatenate(all_actual_values, axis=0)
predicted_values = np.concatenate(all_predicted_values, axis=0)
# Report the size only after the array exists (printing it first raised
# NameError on a fresh kernel).
print(len(actual_values))

# Sampling without replacement cannot draw more points than are available.
subsample_size = min(100, len(actual_values))
indices = np.random.choice(len(actual_values), size=subsample_size, replace=False)

# Plot actual and predicted values as line graphs
plt.figure(figsize=(10, 6))
plt.plot(actual_values[indices], label='Actual', marker='o')
plt.plot(predicted_values[indices], label='Predicted', marker='o')
plt.title('Actual vs Predicted Values (Subsampled)')
plt.xlabel('Index')
plt.ylabel('Values')
plt.legend()
plt.show()
# Plot actual vs predicted values
# plt.figure(figsize=(10, 6))
# plt.plot(actual_values, label='Actual', marker='o')
# plt.plot(predicted_values, label='Predicted', marker='o', linestyle='--')
# plt.title('Actual vs Predicted Values')
# plt.xlabel('Index')
# plt.ylabel('Values')
# plt.legend()
# plt.show()
