In [None]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the datasets
patterns_df = pd.read_csv('patterns.csv')
requests_df = pd.read_csv('requests.csv')

# Preprocess the data
# Each 'Pattern' cell holds the textual form of a Python sequence; parse it
# back into a real sequence so it can be encoded token by token.
# NOTE(review): eval executes whatever expression is stored in the CSV cell.
# If patterns.csv is not fully trusted, switch to ast.literal_eval.
patterns_df['Pattern'] = patterns_df['Pattern'].apply(eval)

# Convert categorical data to numerical data.
# The vocabulary is fitted ONCE over the tokens of every row, so a given token
# maps to the same integer code in all sequences. (Re-fitting the encoder per
# row would renumber the tokens independently in each sequence.)
pattern_encoder = LabelEncoder()
pattern_encoder.fit([token for sequence in patterns_df['Pattern'] for token in sequence])
patterns_df['Pattern'] = patterns_df['Pattern'].apply(
    lambda sequence: pattern_encoder.transform(sequence)
)

# Normalize the data.
# The scaler is likewise fitted once on all codes, so every sequence shares
# the same [0, 1] scale and `scaler` stays fitted on the full code range for
# the later scaler.transform call on the prediction input.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(np.concatenate(patterns_df['Pattern'].tolist()).reshape(-1, 1))
patterns_df['Pattern'] = patterns_df['Pattern'].apply(
    lambda codes: scaler.transform(np.asarray(codes).reshape(-1, 1))
)

# Pad the sequences to have the same length (zeros are appended at the end).
# NOTE(review): the smallest code also scales to 0.0, so padding cannot be
# told apart from that token - confirm this is acceptable.
patterns_list = patterns_df['Pattern'].tolist()
patterns_list = pad_sequences(patterns_list, dtype='float32', padding='post')

# Reshape the input into the form [samples, time steps, features]
patterns = np.array(patterns_list).reshape(-1, patterns_list.shape[1], 1)

# Encode the 'Rack_id' and 'Datanode' labels with one shared encoder fitted on
# the union of both columns. This is a separate encoder from the pattern
# vocabulary above; `encoder` keeps the label classes for later transform
# calls on rack IDs.
encoder = LabelEncoder()
encoder.fit(list(patterns_df['Rack_id'].unique()) + list(patterns_df['Datanode'].unique()))
patterns_df['Rack_id'] = encoder.transform(patterns_df['Rack_id']).astype(np.int32)
patterns_df['Datanode'] = encoder.transform(patterns_df['Datanode']).astype(np.int32)

# Define the LSTM Model: one LSTM layer over the padded pattern sequences
# (time steps = padded sequence length, one feature per step) followed by a
# single-unit dense layer that emits one regression value per sequence.
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(patterns_list.shape[1], 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# The 'Frequency' column is the target variable: it is the quantity the
# predictions are scored against in the evaluation step, so the model must be
# trained on it rather than on the label-encoded 'Rack_id' codes.
target = patterns_df['Frequency'].values.astype(np.float32)

# Train the model (one target value per row of patterns_df / per sequence)
model.fit(patterns, target, epochs=200, verbose=0)

# Preprocess the test data
# Distinct rack and datanode labels seen in the requests, as a single column.
# The combined list is built once so num_samples and requests always agree.
request_labels = list(set(requests_df['Rack_id'])) + list(set(requests_df['Datanode']))
num_samples = len(request_labels)
requests = np.array(request_labels).reshape(-1, 1)

# At this point patterns_df['Rack_id'] holds the integer codes produced by
# `encoder`, not the original labels. Map the codes back to the labels before
# comparing with requests_df; prefixing the codes with 'R' would yield strings
# that only coincidentally match a real rack ID.
patterns_df['Rack_id'] = encoder.inverse_transform(patterns_df['Rack_id'])

unique_rack_ids_requests = np.unique(requests_df['Rack_id']).astype(str)
unique_rack_ids_patterns = np.unique(patterns_df['Rack_id']).astype(str)

# Mean 'Frequency' per rack, restricted below to racks that also appear in
# the requests.
patterns_df_grouped = patterns_df.groupby('Rack_id')['Frequency'].mean().reset_index()
common_rack_ids = np.intersect1d(unique_rack_ids_requests, patterns_df_grouped['Rack_id'].astype(str))

if len(common_rack_ids) == 0:
    raise ValueError("No common rack IDs present in both requests_df and patterns_df_grouped.")

# Create a dummy input for the model to predict: each common rack's encoded ID
# repeated across every time step. The number of time steps is taken from the
# training tensor so it always matches the model's input shape.
time_steps = patterns.shape[1]
rack_codes = encoder.transform(common_rack_ids)
dummy_input = np.repeat(rack_codes[:, None, None], time_steps, axis=1)

# Normalize the input data with the scaler fitted during preprocessing
# (MinMaxScaler expects a 2-D column, hence the flatten-to-column reshape).
dummy_input = scaler.transform(dummy_input.reshape(-1, 1))

# Reshape the input data for the model: [samples, time steps, features]
dummy_input = dummy_input.reshape(len(common_rack_ids), time_steps, 1)

# Run the trained model on the prepared input and show the raw output
predictions = model.predict(dummy_input)
print("Predictions:")
print(predictions)

# Reference values: the per-rack mean 'Frequency' for the racks that were
# selected for prediction.
# NOTE(review): the metrics below pair actual_values and predictions by
# position - confirm both follow the same rack order.
is_common_rack = patterns_df_grouped['Rack_id'].astype(str).isin(common_rack_ids)
actual_values = patterns_df_grouped.loc[is_common_rack, 'Frequency'].values

# The model output is 2-D; the metric functions are given a flat vector
predicted_values = predictions.flatten()

# Error metrics: MSE, MAE, and RMSE derived from the MSE
mse = mean_squared_error(actual_values, predicted_values)
mae = mean_absolute_error(actual_values, predicted_values)
rmse = np.sqrt(mse)

print("MEAN SQUARED ERROR",mse)
print("ROOT MEAN SQUARED ERROR: ",rmse)
print("MEAN ABSOLUTE ERROR: ",mae)

Predictions:
[[18.849411]]
MEAN SQUARED ERROR 1066.4988675572083
ROOT MEAN SQUARED ERROR:  32.65729424733789
MEAN ABSOLUTE ERROR:  32.65729424733789
