In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Assume df contains the full dataset with the 'timestamp' column and 'value' column
url = "https://raw.githubusercontent.com/IKRAMJAAFAR/DeeL/refs/heads/main/dataset.csv?token=GHSAT0AAAAAACZZJM4A75DJZ7QS6RRPDJOOZZDRB5A"
df = pd.read_csv(url)

# Extract the target variable (passenger count) and features (timestamp or time-related features)
values = df['value'].values.reshape(-1, 1)

# Normalize the values for better LSTM training
scaler = MinMaxScaler(feature_range=(0, 1))
values_scaled = scaler.fit_transform(values)

# Function to create sequences of data for LSTM input
def create_sequences(data, sequence_length):
    X, y = [], []
    for i in range(len(data) - sequence_length):
        X.append(data[i:i+sequence_length, 0])
        y.append(data[i+sequence_length, 0])
    return np.array(X), np.array(y)

# Define the sequence length (number of previous time steps to consider for prediction)
sequence_length = 30 

# Create sequences of data
X, y = create_sequences(values_scaled, sequence_length)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Reshape X_train and X_test for LSTM (samples, time steps, features)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

In [3]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))  # LSTM layer
model.add(Dense(1))  # Output layer for regression
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model without hyperparameter tuning (Preliminary)
model.fit(X_train, y_train, epochs=1, batch_size=64, validation_data=(X_test, y_test))

# Predict on the test data
predictions = model.predict(X_test)

# Compute preliminary evaluation metrics (MSE for this example)
mse = mean_squared_error(y_test, predictions)
print(f"Preliminary MSE: {mse}")

Preliminary MSE: 0.006273112107557844


In [4]:
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed

# Define LSTM Autoencoder
model = Sequential()
# Encoder part
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], 1), return_sequences=False))
model.add(RepeatVector(X_train.shape[1]))
# Decoder part
model.add(LSTM(64, activation='relu', return_sequences=True))
model.add(TimeDistributed(Dense(1)))

# Compile model
model.compile(optimizer='adam', loss='mse')

# Train the autoencoder
model.fit(X_train, X_train, epochs=10, batch_size=64, validation_data=(X_test, X_test))

# Predict and compute reconstruction error
reconstruction = model.predict(X_test)
reconstruction_error = np.mean(np.abs(reconstruction - X_test), axis=1)

# Set threshold for anomaly detection
threshold = np.percentile(reconstruction_error, 95)
anomalies = reconstruction_error > threshold
print(f"Anomalies detected: {np.sum(anomalies)}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Anomalies detected: 103


In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import ConvLSTM2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.layers import Reshape

# Define ConvLSTM model
model = Sequential()
model.add(ConvLSTM2D(filters=64, kernel_size=(3, 3), padding='same', input_shape=(sequence_length, 1, 1, 1), return_sequences=True))
# Optional: Add more ConvLSTM layers if needed
model.add(ConvLSTM2D(filters=64, kernel_size=(3, 3), padding='same', return_sequences=False))
model.add(Flatten())
model.add(Dense(1))  # Regression output (for anomaly score or classification)

model.compile(optimizer='adam', loss='mean_squared_error')

# Reshape input data for ConvLSTM
X_train_reshaped = X_train.reshape((X_train.shape[0], sequence_length, 1, 1, 1))  # Shape (batch, timesteps, height, width, channels)
X_test_reshaped = X_test.reshape((X_test.shape[0], sequence_length, 1, 1, 1))  # Shape (batch, timesteps, height, width, channels)

# Train the model
model.fit(X_train_reshaped, X_train[:, -1], epochs=10, batch_size=64)

# Predict on test data
predictions = model.predict(X_test_reshaped)

# Calculate the reconstruction error
reconstruction_error = np.mean(np.abs(X_test_reshaped[:, :, 0, 0, 0] - predictions), axis=1)

# Set a threshold for anomaly detection
threshold = np.percentile(reconstruction_error, 95)
anomalies = reconstruction_error > threshold
print(f"Anomalies detected: {np.sum(anomalies)}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Anomalies detected: 103


In [6]:
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt

url = "https://raw.githubusercontent.com/IKRAMJAAFAR/DeeL/refs/heads/main/dataset.csv?token=GHSAT0AAAAAACZZJM4A75DJZ7QS6RRPDJOOZZDRB5A"
df = pd.read_csv(url)
# Step 1: Convert the 'timestamp' column to datetime format
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Step 2: Set 'timestamp' as the index
df.set_index('timestamp', inplace=True)

# Step 3: Ensure pandas understands the frequency of the time series
# For 30-minute frequency, use '30T' (30 minutes)
df = df.asfreq('30T')

# Step 4: Check for missing values and fill them if necessary
if df.isnull().any().any():
    print("Missing values detected, filling with forward fill...")
    df = df.fillna(method='ffill')  # Or use 'bfill' or interpolation based on the context

# Step 5: Apply seasonal decomposition
decomposition = seasonal_decompose(df['value'], model='additive', period=48)  # Adjust 'period' based on your data

# Get the residuals (this part helps with anomaly detection)
residual = decomposition.resid

# Set threshold for anomaly detection based on residuals
threshold = residual.std()  # Use the standard deviation of residuals as a threshold
anomalies = residual.abs() > threshold  # Detect anomalies

# Count the anomalies
num_anomalies = np.sum(anomalies)
print(f'Anomalies detected: {num_anomalies}')

Anomalies detected: 2830


In [7]:
from sklearn.svm import OneClassSVM
import numpy as np

# Assume X_train and X_test are already prepared (flattened if necessary)
X_train_reshaped = X_train.reshape((X_train.shape[0], -1))  # Flatten data if necessary
X_test_reshaped = X_test.reshape((X_test.shape[0], -1))

# Train an SVM model for anomaly detection
svm = OneClassSVM(nu=0.05, kernel='rbf', gamma='auto')
svm.fit(X_train_reshaped)

# Predict on the test data
predictions = svm.predict(X_test_reshaped)
anomalies = predictions == -1  # SVM labels -1 as anomalies and 1 as normal

# Print results
print(f"Anomalies detected: {np.sum(anomalies)}")


Anomalies detected: 383
