# In [None]:
# Autoencoders for Anomaly Detection

# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers
import tensorflow as tf

# Step 2: Load the Diabetes Dataset
# Wrap the raw feature matrix in a DataFrame so columns can be picked by name,
# and keep the regression target alongside it for reference.
diabetes = load_diabetes()
data = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
data["target"] = diabetes.target

# Step 3: Select Features for Analysis
# Only these three columns are fed to the autoencoder; copy them so later
# operations never write through to `data`.
selected_features = ["age", "bmi", "bp"]
df = data.loc[:, selected_features].copy()

# Step 4: Split Data into Training and Test Sets
# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# held-out rows influence the mean/variance used to transform the training
# data (data leakage). The fixed random_state keeps the split reproducible.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Step 5: Standardize the Features
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(df_train)
X_test = scaler.transform(df_test)

# Full scaled matrix, kept so any code that still refers to `df_scaled`
# continues to work; it uses the train-fitted scaler.
df_scaled = scaler.transform(df)

# Step 6: Define the Autoencoder Model
# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so
# weight initialisation and batch shuffling are repeatable, matching the
# seeded train/test split.
tf.keras.utils.set_random_seed(42)

input_dim = X_train.shape[1]  # one input unit per selected feature
encoding_dim = 2  # Size of the encoded (bottleneck) representation

input_layer = Input(shape=(input_dim,))
# The L1 activity penalty pushes the bottleneck activations towards sparsity.
# 1e-4 is the same value as the original literal 10e-5, written unambiguously.
encoder = Dense(encoding_dim, activation="relu",
                activity_regularizer=regularizers.l1(1e-4))(input_layer)
# Linear output because the standardized inputs are unbounded real values.
decoder = Dense(input_dim, activation='linear')(encoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

# Step 7: Compile the Model
# MSE between input and reconstruction is the quantity later used as the
# per-sample anomaly score.
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Step 8: Train the Autoencoder
# The inputs double as the targets: the network is trained to reproduce its
# own input. The held-out set is passed only to record a validation loss in
# `history`; progress output is silenced.
training_options = {
    "epochs": 100,
    "batch_size": 16,
    "shuffle": True,
    "validation_data": (X_test, X_test),
    "verbose": 0,
}
history = autoencoder.fit(X_train, X_train, **training_options)

# Step 9: Calculate Reconstruction Error
# Per-sample mean squared error between each row and its reconstruction.
# Training errors describe what "normal" reconstruction looks like; test
# errors are the scores being judged.
X_train_pred = autoencoder.predict(X_train)
train_mse = np.mean(np.power(X_train - X_train_pred, 2), axis=1)

X_test_pred = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - X_test_pred, 2), axis=1)

# Step 10: Determine Threshold for Anomaly Detection
# Derive the cut-off from the TRAINING errors. Taking the 95th percentile of
# the test errors themselves would flag ~5% of the test set by construction,
# regardless of whether any point is actually unusual.
threshold = np.percentile(train_mse, 95)  # 95th percentile of training error
print("Reconstruction error threshold:", threshold)

# Step 11: Identify Anomalies
# Boolean mask over the test rows: True where the reconstruction error
# exceeds the training-derived threshold.
anomalies = mse > threshold
print("Number of anomalies detected:", np.sum(anomalies))

# Step 12: Visualize Anomalies
# Scatter the test rows on the BMI/BP plane, coloured by the anomaly mask.
# Column positions follow the order of `selected_features`
# (0 = age, 1 = bmi, 2 = bp).
bmi_col, bp_col = 1, 2

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(X_test[:, bmi_col], X_test[:, bp_col], c=anomalies, cmap='coolwarm')
ax.set_xlabel('BMI (standardized)')
ax.set_ylabel('BP (standardized)')
ax.set_title('Anomaly Detection using Autoencoder')
ax.grid(True)
plt.show()