### Step 1: Load and Combine Datasets

In [27]:
import pandas as pd

logon_df = pd.read_csv("../../autonomous-cybersecurity/datasets/r1/logon.csv", nrows=1000)
device_df = pd.read_csv("../../autonomous-cybersecurity/datasets/r1/device.csv", nrows=1000)
http_df = pd.read_csv("../../autonomous-cybersecurity/datasets/r1/http.csv", nrows=1000)

In [28]:
# Display the first few rows of each dataframe
print("Logon DataFrame:")
print(logon_df.head())

print("Device DataFrame:")
print(device_df.head())

print("HTTP DataFrame:")
print(http_df.head())

Logon DataFrame:
                         id                 date          user       pc  \
0  {Y6O4-A7KC67IN-0899AOZK}  01/04/2010 00:10:37  DTAA/KEE0997  PC-1914   
1  {O5Y6-O7CJ02JC-6704RWBS}  01/04/2010 00:52:16  DTAA/KEE0997  PC-1914   
2  {D2D1-C6EB14QJ-2100RSZO}  01/04/2010 01:17:20  DTAA/KEE0997  PC-3363   
3  {H9W1-X0MC70BT-6065RPAT}  01/04/2010 01:28:34  DTAA/KEE0997  PC-3363   
4  {H3H4-S5AZ00AZ-9560IYHC}  01/04/2010 01:57:30  DTAA/BJM0992  PC-3058   

  activity  
0    Logon  
1   Logoff  
2    Logon  
3   Logoff  
4    Logon  
Device DataFrame:
                         id                 date          user       pc  \
0  {S7A7-Y8QZ65MW-8738SAZP}  01/04/2010 07:12:31  DTAA/RES0962  PC-3736   
1  {G7A8-G1OB94NR-3006NTXH}  01/04/2010 07:35:40  DTAA/BJC0569  PC-2588   
2  {R3L8-N0LW95FR-8358LLXS}  01/04/2010 08:00:38  DTAA/EMZ0196  PC-1479   
3  {I2F1-B5FB51FL-3128HBUL}  01/04/2010 08:02:14  DTAA/ZKH0388  PC-1021   
4  {P7R6-C5TV18CT-1677DWWM}  01/04/2010 08:20:17  DTAA/RES096

### Step 2: Data Preprocessing

- Convert dates to datetime objects.
- Encode categorical variables.
- Handle missing values if any.
- Combine datasets into a single dataframe.

In [29]:
print(logon_df.head(2))
logon_df.info()

                         id                 date          user       pc  \
0  {Y6O4-A7KC67IN-0899AOZK}  01/04/2010 00:10:37  DTAA/KEE0997  PC-1914   
1  {O5Y6-O7CJ02JC-6704RWBS}  01/04/2010 00:52:16  DTAA/KEE0997  PC-1914   

  activity  
0    Logon  
1   Logoff  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        1000 non-null   object
 1   date      1000 non-null   object
 2   user      1000 non-null   object
 3   pc        1000 non-null   object
 4   activity  1000 non-null   object
dtypes: object(5)
memory usage: 39.2+ KB


In [30]:
# Convert 'date' to datetime

In [31]:
logon_df['date'] = pd.to_datetime(logon_df['date'])

In [32]:
print(logon_df.head(2))
logon_df.info()

                         id                date          user       pc  \
0  {Y6O4-A7KC67IN-0899AOZK} 2010-01-04 00:10:37  DTAA/KEE0997  PC-1914   
1  {O5Y6-O7CJ02JC-6704RWBS} 2010-01-04 00:52:16  DTAA/KEE0997  PC-1914   

  activity  
0    Logon  
1   Logoff  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        1000 non-null   object        
 1   date      1000 non-null   datetime64[ns]
 2   user      1000 non-null   object        
 3   pc        1000 non-null   object        
 4   activity  1000 non-null   object        
dtypes: datetime64[ns](1), object(4)
memory usage: 39.2+ KB


In [33]:
device_df['date'] = pd.to_datetime(device_df['date'])
http_df['date'] = pd.to_datetime(http_df['date'])

In [34]:
# Encode categorical variables

In [35]:
logon_df['activity'] = logon_df['activity'].map({'Logon': 1, 'Logoff': 0})
device_df['activity'] = device_df['activity'].map({'Connect': 1, 'Disconnect': 0})

In [36]:
logon_df.head(), device_df.head()

(                         id                date          user       pc  \
 0  {Y6O4-A7KC67IN-0899AOZK} 2010-01-04 00:10:37  DTAA/KEE0997  PC-1914   
 1  {O5Y6-O7CJ02JC-6704RWBS} 2010-01-04 00:52:16  DTAA/KEE0997  PC-1914   
 2  {D2D1-C6EB14QJ-2100RSZO} 2010-01-04 01:17:20  DTAA/KEE0997  PC-3363   
 3  {H9W1-X0MC70BT-6065RPAT} 2010-01-04 01:28:34  DTAA/KEE0997  PC-3363   
 4  {H3H4-S5AZ00AZ-9560IYHC} 2010-01-04 01:57:30  DTAA/BJM0992  PC-3058   
 
    activity  
 0         1  
 1         0  
 2         1  
 3         0  
 4         1  ,
                          id                date          user       pc  \
 0  {S7A7-Y8QZ65MW-8738SAZP} 2010-01-04 07:12:31  DTAA/RES0962  PC-3736   
 1  {G7A8-G1OB94NR-3006NTXH} 2010-01-04 07:35:40  DTAA/BJC0569  PC-2588   
 2  {R3L8-N0LW95FR-8358LLXS} 2010-01-04 08:00:38  DTAA/EMZ0196  PC-1479   
 3  {I2F1-B5FB51FL-3128HBUL} 2010-01-04 08:02:14  DTAA/ZKH0388  PC-1021   
 4  {P7R6-C5TV18CT-1677DWWM} 2010-01-04 08:20:17  DTAA/RES0962  PC-3736   
 
    a

In [37]:
logon_df.info(), device_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        1000 non-null   object        
 1   date      1000 non-null   datetime64[ns]
 2   user      1000 non-null   object        
 3   pc        1000 non-null   object        
 4   activity  1000 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 39.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        1000 non-null   object        
 1   date      1000 non-null   datetime64[ns]
 2   user      1000 non-null   object        
 3   pc        1000 non-null   object        
 4   activity  1000 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 39.2+ KB


(None, None)

In [38]:
# df.rename(columns={'old_name': 'new_name'}, inplace=True)
# device_df['activity'] = device_df.rename(columns={'activity':'logon'})
device_df.rename(columns={'activity': 'logon'}, inplace=True)


In [39]:
device_df.head()

Unnamed: 0,id,date,user,pc,logon
0,{S7A7-Y8QZ65MW-8738SAZP},2010-01-04 07:12:31,DTAA/RES0962,PC-3736,1
1,{G7A8-G1OB94NR-3006NTXH},2010-01-04 07:35:40,DTAA/BJC0569,PC-2588,1
2,{R3L8-N0LW95FR-8358LLXS},2010-01-04 08:00:38,DTAA/EMZ0196,PC-1479,1
3,{I2F1-B5FB51FL-3128HBUL},2010-01-04 08:02:14,DTAA/ZKH0388,PC-1021,1
4,{P7R6-C5TV18CT-1677DWWM},2010-01-04 08:20:17,DTAA/RES0962,PC-3736,0


In [40]:
# Combine datasets into a single dataframe
combined_df = pd.concat([logon_df, device_df, http_df], ignore_index=True)

In [41]:
http_df.head()

Unnamed: 0,id,date,user,pc,url
0,{M8H9-W9NL75TH-1322KOLO},2010-01-04 07:08:47,DTAA/AMA0606,PC-1514,http://cnet.com
1,{V0E1-R0FE91SC-2381GTDZ},2010-01-04 07:35:19,DTAA/DBM0698,PC-1444,http://force.open.com
2,{V7C8-H6KA39YG-3352HMOW},2010-01-04 07:40:40,DTAA/HBF0035,PC-0991,http://tendencystem.org
3,{O9P7-D9DL54YQ-9903OMYE},2010-01-04 07:52:25,DTAA/SVC0175,PC-3384,http://bent.cup.org
4,{L3J0-E4RF05KN-7341OQVW},2010-01-04 07:55:50,DTAA/CMU0028,PC-4253,http://taxpotato.east.org


In [42]:
combined_df.sort_values(by='date')

Unnamed: 0,id,date,user,pc,activity,logon,url
0,{Y6O4-A7KC67IN-0899AOZK},2010-01-04 00:10:37,DTAA/KEE0997,PC-1914,1.0,,
1,{O5Y6-O7CJ02JC-6704RWBS},2010-01-04 00:52:16,DTAA/KEE0997,PC-1914,0.0,,
2,{D2D1-C6EB14QJ-2100RSZO},2010-01-04 01:17:20,DTAA/KEE0997,PC-3363,1.0,,
3,{H9W1-X0MC70BT-6065RPAT},2010-01-04 01:28:34,DTAA/KEE0997,PC-3363,0.0,,
4,{H3H4-S5AZ00AZ-9560IYHC},2010-01-04 01:57:30,DTAA/BJM0992,PC-3058,1.0,,
...,...,...,...,...,...,...,...
1995,{Y2Z6-S3BV67MD-4672RZTR},2010-01-11 09:26:05,DTAA/HSD0519,PC-2047,,1.0,
1996,{G5Q8-B4BL73BX-8970DNYT},2010-01-11 09:27:09,DTAA/VKB0605,PC-2644,,0.0,
1997,{Y4Y2-U8UO01KX-1646FXTY},2010-01-11 09:31:37,DTAA/JAM0392,PC-0209,,0.0,
1998,{J5P8-C6AZ60QE-4450PKDY},2010-01-11 09:37:02,DTAA/KKW0676,PC-4445,,1.0,


In [43]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        3000 non-null   object        
 1   date      3000 non-null   datetime64[ns]
 2   user      3000 non-null   object        
 3   pc        3000 non-null   object        
 4   activity  1000 non-null   float64       
 5   logon     1000 non-null   float64       
 6   url       1000 non-null   object        
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 164.2+ KB


In [44]:
combined_df['url'] = combined_df['url'].fillna('http://example.com')

In [45]:
combined_df['logon'] = combined_df['logon'].fillna(1)

In [46]:
combined_df['activity'] = combined_df['activity'].fillna(1)

In [47]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        3000 non-null   object        
 1   date      3000 non-null   datetime64[ns]
 2   user      3000 non-null   object        
 3   pc        3000 non-null   object        
 4   activity  3000 non-null   float64       
 5   logon     3000 non-null   float64       
 6   url       3000 non-null   object        
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 164.2+ KB


In [48]:
combined_df

Unnamed: 0,id,date,user,pc,activity,logon,url
0,{Y6O4-A7KC67IN-0899AOZK},2010-01-04 00:10:37,DTAA/KEE0997,PC-1914,1.0,1.0,http://example.com
1,{O5Y6-O7CJ02JC-6704RWBS},2010-01-04 00:52:16,DTAA/KEE0997,PC-1914,0.0,1.0,http://example.com
2,{D2D1-C6EB14QJ-2100RSZO},2010-01-04 01:17:20,DTAA/KEE0997,PC-3363,1.0,1.0,http://example.com
3,{H9W1-X0MC70BT-6065RPAT},2010-01-04 01:28:34,DTAA/KEE0997,PC-3363,0.0,1.0,http://example.com
4,{H3H4-S5AZ00AZ-9560IYHC},2010-01-04 01:57:30,DTAA/BJM0992,PC-3058,1.0,1.0,http://example.com
...,...,...,...,...,...,...,...
2995,{O7R6-P2YK82WK-2869VXDR},2010-01-06 11:00:59,DTAA/ALW0694,PC-0415,1.0,1.0,http://google.com
2996,{E4U0-K1QA90RO-7645BJGP},2010-01-06 11:01:03,DTAA/CGR0202,PC-2454,1.0,1.0,http://warriorforum.com
2997,{M1R4-B3IJ08VN-5158JGQA},2010-01-06 11:01:07,DTAA/CPP0761,PC-2665,1.0,1.0,http://yahoo.com
2998,{Q7Q0-B1UL35ZP-4354ZDKZ},2010-01-06 11:01:19,DTAA/AUM0365,PC-0338,1.0,1.0,http://google.com


### Step 3: Feature Engineering
Create features for the model:
- Aggregate actions per user over a time window.
- Extract additional features if necessary.


In [49]:
# Aggregate actions per user over a time window (e.g., daily)
combined_df['date'] = combined_df['date'].dt.date  # Extract date part only

In [50]:
# Aggregate data
aggregated_df = combined_df.groupby(['date', 'user', 'pc']).agg({
    'logon': 'sum',
    'activity': 'sum',  # Count of actions
    'url': 'count'  # Number of HTTP requests
}).reset_index()

In [51]:
aggregated_df

Unnamed: 0,date,user,pc,logon,activity,url
0,2010-01-04,DTAA/AAA0371,PC-1689,2.0,2.0,2
1,2010-01-04,DTAA/AAC0344,PC-2282,4.0,4.0,4
2,2010-01-04,DTAA/AAC0599,PC-1800,2.0,2.0,2
3,2010-01-04,DTAA/AAH0734,PC-0280,1.0,1.0,1
4,2010-01-04,DTAA/AAK0658,PC-3233,1.0,1.0,1
...,...,...,...,...,...,...
1791,2010-01-11,DTAA/RBC0879,PC-1261,1.0,1.0,1
1792,2010-01-11,DTAA/RCW0436,PC-3171,1.0,1.0,1
1793,2010-01-11,DTAA/SKW0871,PC-1529,1.0,1.0,1
1794,2010-01-11,DTAA/VFP0669,PC-1717,1.0,1.0,1


In [52]:
aggregated_df.rename(columns={'activity': 'total_activity', 'url': 'total_http_requests'}, inplace=True)

In [53]:
aggregated_df

Unnamed: 0,date,user,pc,logon,total_activity,total_http_requests
0,2010-01-04,DTAA/AAA0371,PC-1689,2.0,2.0,2
1,2010-01-04,DTAA/AAC0344,PC-2282,4.0,4.0,4
2,2010-01-04,DTAA/AAC0599,PC-1800,2.0,2.0,2
3,2010-01-04,DTAA/AAH0734,PC-0280,1.0,1.0,1
4,2010-01-04,DTAA/AAK0658,PC-3233,1.0,1.0,1
...,...,...,...,...,...,...
1791,2010-01-11,DTAA/RBC0879,PC-1261,1.0,1.0,1
1792,2010-01-11,DTAA/RCW0436,PC-3171,1.0,1.0,1
1793,2010-01-11,DTAA/SKW0871,PC-1529,1.0,1.0,1
1794,2010-01-11,DTAA/VFP0669,PC-1717,1.0,1.0,1


### Step 4: Data Preparation for Model
Prepare the data for the LSTM and CNN models.

In [54]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed, Dense, Conv1D, MaxPooling1D, Flatten


2024-07-05 10:41:15.760155: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Step 5: Train and Evaluate Models
Use the prepared data to train and evaluate the LSTM and CNN models.

In [55]:
# Define LSTM Autoencoder model
def create_lstm_autoencoder(input_shape):
    model = Sequential()
    model.add(LSTM(64, activation='relu', input_shape=(input_shape[1], input_shape[2]), return_sequences=True))
    model.add(LSTM(32, activation='relu', return_sequences=False))
    model.add(RepeatVector(input_shape[1]))
    model.add(LSTM(32, activation='relu', return_sequences=True))
    model.add(LSTM(64, activation='relu', return_sequences=True))
    model.add(TimeDistributed(Dense(input_shape[2])))
    model.compile(optimizer='adam', loss='mse')
    return model

# Define CNN model
def create_cnn(input_shape):
    model = Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(input_shape[1], input_shape[2], input_shape[3])))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [56]:
# Scale features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(aggregated_df[['total_activity', 'total_http_requests']])

# Debug: Check the shape of scaled_features
print("Shape of scaled_features:", scaled_features.shape)

# Reshape for LSTM
timesteps = 10  # Number of timesteps for LSTM
n_features = scaled_features.shape[1]

X_lstm = []
y_lstm = []

for i in range(timesteps, len(scaled_features)):
    X_lstm.append(scaled_features[i-timesteps:i])
    y_lstm.append(scaled_features[i])

X_lstm = np.array(X_lstm)
y_lstm = np.array(y_lstm)

# Debug: Check the shapes of X_lstm and y_lstm
print("Shape of X_lstm:", X_lstm.shape)
print("Shape of y_lstm:", y_lstm.shape)

# Ensure that X_lstm and y_lstm have sufficient data
if X_lstm.shape[0] == 0 or X_lstm.shape[1] == 0 or X_lstm.shape[2] == 0:
    raise ValueError("Insufficient data for LSTM sequences. Check the preprocessing steps and data sufficiency.")

# Reshape for CNN (if needed)
X_cnn = X_lstm.reshape((X_lstm.shape[0], X_lstm.shape[1], n_features, 1))

Shape of scaled_features: (1796, 2)
Shape of X_lstm: (1786, 10, 2)
Shape of y_lstm: (1786, 2)


In [68]:
shape = (X_lstm.shape[1], X_lstm.shape[2])
shape[1]
shape[2]

IndexError: tuple index out of range

In [66]:
# Train LSTM autoencoder
lstm_autoencoder = create_lstm_autoencoder((X_lstm.shape))
lstm_autoencoder.fit(X_lstm, y_lstm, epochs=50, batch_size=32, validation_split=0.2)

# Train CNN classifier
cnn_model = create_cnn((X_cnn.shape[1], X_cnn.shape[2], X_cnn.shape[3]))
cnn_model.fit(X_cnn, y_lstm, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50


2024-07-05 10:46:09.048752: I tensorflow/core/common_runtime/executor.cc:1197] [/job:localhost/replica:0/task:0/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: Incompatible shapes: [32,2] vs. [32,10,2]
	 [[{{node mean_squared_error/SquaredDifference}}]]


InvalidArgumentError: Graph execution error:

Detected at node 'mean_squared_error/SquaredDifference' defined at (most recent call last):
    File "<frozen runpy>", line 198, in _run_module_as_main
    File "<frozen runpy>", line 88, in _run_code
    File "/home/windass/.local/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "/home/windass/.local/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
      app.start()
    File "/home/windass/.local/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
      self.io_loop.start()
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/asyncio/base_events.py", line 607, in run_forever
      self._run_once()
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/asyncio/base_events.py", line 1922, in _run_once
      handle._run()
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/home/windass/.local/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "/home/windass/.local/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "/home/windass/.local/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell
      await result
    File "/home/windass/.local/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 362, in execute_request
      await super().execute_request(stream, ident, parent)
    File "/home/windass/.local/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "/home/windass/.local/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 449, in do_execute
      res = shell.run_cell(
    File "/home/windass/.local/lib/python3.11/site-packages/ipykernel/zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell
      result = self._run_cell(
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell
      result = runner(coro)
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_3349/4037518324.py", line 3, in <module>
      lstm_autoencoder.fit(X_lstm, y_lstm, epochs=50, batch_size=32, validation_split=0.2)
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/keras/engine/training.py", line 1685, in fit
      tmp_logs = self.train_function(iterator)
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/keras/engine/training.py", line 1284, in train_function
      return step_function(self, iterator)
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/keras/engine/training.py", line 1268, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/keras/engine/training.py", line 1249, in run_step
      outputs = model.train_step(data)
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/keras/engine/training.py", line 1051, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/keras/engine/training.py", line 1109, in compute_loss
      return self.compiled_loss(
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/keras/engine/compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/keras/losses.py", line 142, in __call__
      losses = call_fn(y_true, y_pred)
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/keras/losses.py", line 268, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/windass/miniconda3/envs/autoencoder/lib/python3.11/site-packages/keras/losses.py", line 1470, in mean_squared_error
      return backend.mean(tf.math.squared_difference(y_pred, y_true), axis=-1)
Node: 'mean_squared_error/SquaredDifference'
Incompatible shapes: [32,2] vs. [32,10,2]
	 [[{{node mean_squared_error/SquaredDifference}}]] [Op:__inference_train_function_14395]

In [44]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(aggregated_df[['total_activity', 'total_http_requests']])

# Debug: Check the shape of scaled_features
print("Shape of scaled_features:", scaled_features.shape)

# Reshape for LSTM
timesteps = 10  # Number of timesteps for LSTM
n_features = scaled_features.shape[1]

X_lstm = []
y_lstm = []

for i in range(timesteps, len(scaled_features)):
    X_lstm.append(scaled_features[i-timesteps:i])
    y_lstm.append(scaled_features[i])

X_lstm = np.array(X_lstm)
y_lstm = np.array(y_lstm)

# Debug: Check the shapes of X_lstm and y_lstm
print("Shape of X_lstm:", X_lstm.shape)
print("Shape of y_lstm:", y_lstm.shape)

# Ensure that X_lstm and y_lstm have sufficient data
if X_lstm.shape[0] == 0 or X_lstm.shape[1] == 0 or X_lstm.shape[2] == 0:
    raise ValueError("Insufficient data for LSTM sequences. Check the preprocessing steps and data sufficiency.")

# Reshape for CNN (if needed)
X_cnn = X_lstm.reshape((X_lstm.shape[0], X_lstm.shape[1], n_features, 1))

# Define LSTM Autoencoder model
def create_lstm_autoencoder(input_shape):
    model = Sequential()
    model.add(LSTM(64, activation='relu', input_shape=(input_shape[1], input_shape[2]), return_sequences=True))
    model.add(LSTM(32, activation='relu', return_sequences=False))
    model.add(RepeatVector(input_shape[1]))
    model.add(LSTM(32, activation='relu', return_sequences=True))
    model.add(LSTM(64, activation='relu', return_sequences=True))
    model.add(TimeDistributed(Dense(input_shape[2])))
    model.compile(optimizer='adam', loss='mse')
    return model

# Define CNN model
def create_cnn(input_shape):
    model = Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(input_shape[1], input_shape[2], input_shape[3])))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train LSTM autoencoder
lstm_autoencoder = create_lstm_autoencoder((X_lstm.shape[1], X_lstm.shape[2]))
lstm_autoencoder.fit(X_lstm, y_lstm, epochs=50, batch_size=32, validation_split=0.2)

# Train CNN classifier
cnn_model = create_cnn((X_cnn.shape[1], X_cnn.shape[2], X_cnn.shape[3]))
cnn_model.fit(X_cnn, y_lstm, epochs=50, batch_size=32, validation_split=0.2)


Shape of scaled_features: (1796, 2)
Shape of X_lstm: (1786, 10, 2)
Shape of y_lstm: (1786, 2)


IndexError: tuple index out of range