In [1]:
import pandas as pd
from imblearn.over_sampling import ADASYN
from sklearn.metrics import precision_recall_curve

In [2]:
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed

In [24]:
# Load the dataset from ais_data.csv (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="ais_data.csv")

In [25]:
# Discard every row that has a missing value in at least one column
# (all columns are considered, not only the model features).
df = df.dropna(axis=0, how='any')

In [26]:
# Add an integer code per distinct 'shiptype' value. Nothing happens when the
# source column is absent or when the encoded column already exists.
needs_encoding = 'shiptype' in df.columns and 'shiptype_encoded' not in df.columns
if needs_encoding:
    shiptype_as_category = df['shiptype'].astype('category')
    df['shiptype_encoded'] = shiptype_as_category.cat.codes

In [27]:
# Columns used by the scaler, the IsolationForest and the LSTM autoencoder.
final_features = ['sog', 'cog', 'shiptype_encoded']

# Independent copy of just those columns, so later changes to `df` do not
# affect the feature matrix.
df_selected = df.loc[:, final_features].copy()

In [28]:
# Fit the min-max scaler on the full feature matrix (anomalous rows included),
# then apply it. The fitted `scaler` is reused later for the normal-only subset.
scaler = MinMaxScaler()
scaler.fit(df_selected)
df_scaled = scaler.transform(df_selected)

In [29]:
# Fit the tree-based outlier detector on the unscaled feature columns and store
# its raw prediction for every row. The values are still in IsolationForest's
# own convention here (1 / -1); they are converted to 0 / 1 in a later cell.
isolation_forest = IsolationForest(contamination=0.05, random_state=42)  # Adjust contamination as needed
isolation_forest.fit(df_selected)
df['Anomaly'] = isolation_forest.predict(df_selected)

In [30]:
# Turn the IsolationForest convention (1 = inlier, -1 = outlier) into a 0/1
# flag where 1 means "anomaly".
#
# The flag is derived from the fitted model rather than by remapping the
# existing column in place: remapping {1: 0, -1: 1} on its own output is not
# idempotent (running the cell a second time would turn 0 into NaN and 1 into
# 0). Predicting on the same rows the model was fitted on yields the same
# labels that fit_predict produced, so a first run gives identical results.
df['Anomaly'] = (isolation_forest.predict(df_selected) == -1).astype(int)

In [31]:
# Fail early with a clear message if the anomaly-labelling cells were skipped.
if 'Anomaly' not in df.columns:
    raise KeyError("Error: 'Anomaly' column not found in dataset. Ensure anomaly detection is applied.")

# Keep only rows flagged as normal (Anomaly == 0) and scale them with the
# scaler that was already fitted on the full feature matrix.
# NOTE(review): flagged rows are removed before windowing, so windows built
# from `normal_scaled` can join rows that were not adjacent in `df`.
is_normal = df['Anomaly'] == 0
normal_data = df.loc[is_normal, final_features].copy()
normal_scaled = scaler.transform(normal_data)

In [32]:
TIME_STEPS = 10  # Define time step window for LSTM


def create_sequences(data, time_steps=TIME_STEPS):
    """Slice ``data`` into overlapping windows of ``time_steps`` consecutive rows.

    Window ``i`` covers rows ``i`` through ``i + time_steps - 1``. Every complete
    window is returned, including the one that ends on the last row, so an input
    of ``n`` rows yields ``n - time_steps + 1`` windows.

    Args:
        data: Array-like whose first axis is the row/time axis.
        time_steps: Number of consecutive rows per window (must be >= 1).

    Returns:
        numpy.ndarray of shape ``(n_windows, time_steps, *data.shape[1:])``.
        When ``data`` has fewer than ``time_steps`` rows the result is an empty
        array that still carries the window and feature dimensions.

    Raises:
        ValueError: If ``time_steps`` is smaller than 1.
    """
    if time_steps < 1:
        raise ValueError("time_steps must be a positive integer")

    data = np.asarray(data)
    # "+ 1" keeps the final window; without it the last row is never part of
    # any sequence.
    n_windows = len(data) - time_steps + 1
    if n_windows <= 0:
        return np.empty((0, time_steps) + data.shape[1:], dtype=data.dtype)

    return np.stack([data[start:start + time_steps] for start in range(n_windows)])

# Window the scaled normal-only rows; these windows are the training data for
# the autoencoder.
train_sequences = create_sequences(normal_scaled, time_steps=TIME_STEPS)

In [33]:
# LSTM autoencoder: two LSTM layers compress each window to a 32-unit vector,
# RepeatVector copies that vector once per time step, and two further LSTM
# layers plus a per-time-step Dense layer map it back to the input feature
# count.
#
# The input shape is declared with an explicit Input object instead of the
# `input_shape=` argument on the first layer; passing `input_shape` to a layer
# inside a Sequential model is the deprecated form that makes Keras emit a
# UserWarning when the model is built.
model = Sequential([
    tf.keras.Input(shape=(TIME_STEPS, len(final_features))),
    LSTM(64, activation='relu', return_sequences=True),
    LSTM(32, activation='relu', return_sequences=False),
    RepeatVector(TIME_STEPS),
    LSTM(32, activation='relu', return_sequences=True),
    LSTM(64, activation='relu', return_sequences=True),
    TimeDistributed(Dense(len(final_features))),
])

  super().__init__(**kwargs)


In [34]:
def weighted_mse(y_true, y_pred):
    """Mean squared error with a 10x weight on elements whose target equals 1.

    Each squared difference between ``y_true`` and ``y_pred`` is multiplied by
    10.0 where the corresponding ``y_true`` element is exactly 1 and by 1.0
    everywhere else; the mean of the weighted values is returned.

    NOTE(review): this notebook fits the model with the same array as input
    and target, so ``y_true`` holds feature values rather than anomaly labels.
    The extra weight therefore lands on target elements equal to 1, not on
    anomalies. Confirm whether that is the intended weighting.
    """
    squared_error = tf.square(y_true - y_pred)
    element_weights = tf.where(tf.equal(y_true, 1), 10.0, 1.0)
    return tf.reduce_mean(element_weights * squared_error)


In [None]:
# Train the autoencoder to reproduce its own input: the normal-only windows
# are passed as both features and target. Keras holds out the last 10% of the
# windows for validation.
model.compile(loss=weighted_mse, optimizer='adam')
model.fit(
    x=train_sequences,
    y=train_sequences,
    batch_size=32,
    epochs=20,
    validation_split=0.1,
)

Epoch 1/20
[1m8727/8727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 12ms/step - loss: 0.0598 - val_loss: 0.0248
Epoch 2/20
[1m8727/8727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 12ms/step - loss: 0.0199 - val_loss: 0.0091
Epoch 3/20
[1m8727/8727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 12ms/step - loss: 0.0078 - val_loss: 0.0040
Epoch 4/20
[1m8727/8727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 12ms/step - loss: 0.0036 - val_loss: 0.0026
Epoch 5/20
[1m8727/8727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 12ms/step - loss: 0.0023 - val_loss: 0.0022
Epoch 6/20
[1m8727/8727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 12ms/step - loss: 0.0020 - val_loss: 0.0019
Epoch 7/20
[1m8727/8727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 12ms/step - loss: 0.0013 - val_loss: 8.7720e-04
Epoch 8/20
[1m8727/8727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1205s[0m 138ms/step - loss: 9.2020e-04 - val_los

In [None]:
# Window the complete scaled dataset (normal and flagged rows alike) and let
# the trained autoencoder reconstruct every window.
all_sequences = create_sequences(df_scaled, time_steps=TIME_STEPS)
predictions = model.predict(all_sequences)

In [None]:
# Per-window reconstruction error, averaged over the time-step and feature
# axes. Despite the variable name, this is a mean *absolute* error, not a mean
# squared error.
abs_error = np.abs(predictions - all_sequences)
mse = abs_error.mean(axis=(1, 2))

# Initial anomaly threshold: the 95th percentile of the window errors.
# A later cell overwrites `threshold` with an F1-tuned value.
threshold = np.percentile(mse, 95)

In [None]:
# Proxy labels: the IsolationForest flags for the last len(mse) rows, matched
# to the window errors by position.
# NOTE(review): these are another detector's output, not ground truth, so the
# tuned threshold only maximises agreement with IsolationForest.
y_true = df['Anomaly'].to_numpy()[-len(mse):]

# Sweep every candidate threshold and compute F1 for each one; the small
# epsilon avoids division by zero when precision and recall are both 0.
precision, recall, thresholds = precision_recall_curve(y_true, mse)
f1_scores = (2 * precision * recall) / (precision + recall + 1e-10)

# Replace the percentile threshold with the candidate that has the highest F1.
optimal_idx = int(f1_scores.argmax())
threshold = thresholds[optimal_idx]

In [None]:
# Assign LSTM-based anomaly labels to the last len(mse) rows; earlier rows have
# no window error and are left as NaN.
#
# The comparison is inclusive because precision_recall_curve evaluates each
# candidate threshold as `score >= threshold`. A strict `>` would always drop
# the sample(s) sitting exactly on the selected threshold and so would not
# reproduce the precision/recall the threshold was chosen for.
df.loc[df.index[-len(mse):], 'LSTM_Anomaly'] = (mse >= threshold).astype(int)

In [None]:
# Combine the IsolationForest flag and the LSTM flag into one decision.
n_scored = len(mse)

# Only the last `n_scored` rows have a window error, so trim the
# IsolationForest flags to the same length and give them a fresh 0..n-1 index
# that lines up with the positional LSTM flags.
trimmed_anomaly = df['Anomaly'].iloc[-n_scored:].reset_index(drop=True)
lstm_flags = (mse > threshold).astype(int)

df_final = pd.DataFrame({
    'Anomaly': trimmed_anomaly,
    'LSTM_Anomaly': lstm_flags,
})

# Weighted vote: 0.7 for IsolationForest, 0.3 for the LSTM, flagged when the
# sum exceeds 0.5.
# NOTE(review): both inputs are 0/1, so the sum is >= 0.7 whenever Anomaly is 1
# and <= 0.3 whenever it is 0. Final_Anomaly is therefore always identical to
# Anomaly and the LSTM vote can never change the outcome. Confirm the intended
# weights / cut-off.
weighted_vote = 0.7 * df_final['Anomaly'] + 0.3 * df_final['LSTM_Anomaly']
df_final['Final_Anomaly'] = weighted_vote > 0.5

# Keep only the scored rows in `df` and attach the fused flag by position.
df = df.iloc[-n_scored:].copy()
df['Final_Anomaly'] = df_final['Final_Anomaly'].values

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Rows without both labels cannot be compared, so drop them first.
df = df.dropna(subset=['Anomaly', 'LSTM_Anomaly'])

# The IsolationForest flag is used as the reference and the LSTM flag as the
# prediction, so the numbers below measure agreement between the two detectors.
reference_labels = df['Anomaly']
lstm_labels = df['LSTM_Anomaly']

accuracy = accuracy_score(reference_labels, lstm_labels)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 breakdown.
print(classification_report(reference_labels, lstm_labels))

In [None]:
# Write the labelled rows to disk; the DataFrame index is not exported.
output_csv = 'anomaly_detected_data.csv'
df.to_csv(output_csv, index=False)

In [None]:
# Save the trained LSTM autoencoder to an HDF5 file in the working directory.
# NOTE(review): the model is compiled with the custom `weighted_mse` loss, so
# loading this file elsewhere will presumably need that function supplied
# (or loading without compiling) - confirm before relying on the artifact.
model_path = "lstm_autoencoder.h5"
model.save(model_path)
print("Model saved successfully.")