In [3]:
# Install necessary libraries
!pip install numpy pandas scikit-learn tensorflow



In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda, BatchNormalization
from tensorflow.keras.models import Model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
import pickle

from google.colab import files

# Upload the updated notebook
uploaded = files.upload()

Saving medicinedata.csv to medicinedata.csv


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
dataset_path = 'medicinedata.csv'
df = pd.read_csv(dataset_path)

# Select features and labels
features = df[[
    "Potency (%)",
    "Purity (%)",
    "Packaging_Compliance_Score (1-10)",
    "Predicted_Compliance_Score (%)",
    "Supplier_Reliability_Score (1-100)",
    "Regulatory_Benchmark_Score (%)",
    "Historical_Quality_Score (%)"
]]

labels = df["Anomaly_Flag"]

# Normalize the features
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(features)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    normalized_features, labels, test_size=0.2, random_state=42, stratify=labels
)


In [12]:
from tensorflow.keras.layers import Input, Dense, Lambda, BatchNormalization
from tensorflow.keras.models import Model
import tensorflow as tf

# Define the latent space dimension
latent_dim = 2

# Encoder
inputs = Input(shape=(7,), name="encoder_input")  # Updated input shape to match dataset
x = Dense(64, activation='relu')(inputs)
x = BatchNormalization()(x)
x = Dense(32, activation='relu')(x)
x = BatchNormalization()(x)

z_mean = Dense(latent_dim, name="z_mean")(x)
z_log_var = Dense(latent_dim, name="z_log_var")(x)

# Sampling function
def sampling(args):
    z_mean, z_log_var = args
    epsilon = tf.random.normal(shape=(tf.shape(z_mean)[0], latent_dim), mean=0., stddev=1.0)
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

z = Lambda(sampling, output_shape=(latent_dim,), name="z")([z_mean, z_log_var])

# Decoder
decoder_inputs = Input(shape=(latent_dim,), name="decoder_input")
x = Dense(32, activation='relu')(decoder_inputs)
x = BatchNormalization()(x)
x = Dense(64, activation='relu')(x)
x = BatchNormalization()(x)
outputs = Dense(7, activation='sigmoid', name="decoder_output")(x)  # Updated output shape to match dataset

# Encoder and Decoder models
encoder = Model(inputs, [z_mean, z_log_var, z], name="encoder")
decoder = Model(decoder_inputs, outputs, name="decoder")

# Full VAE model
vae_outputs = decoder(encoder(inputs)[2])

# Define custom VAE model with integrated loss
class VAE(Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def call(self, inputs):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Reconstruction loss
        reconstruction_loss = tf.reduce_mean(tf.reduce_sum(tf.square(inputs - reconstructed), axis=1))
        # KL divergence loss
        kl_loss = -0.5 * tf.reduce_mean(tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1))
        self.add_loss(reconstruction_loss + kl_loss)
        return reconstructed

vae = VAE(encoder, decoder)
vae.compile(optimizer=tf.keras.optimizers.Adam())

# Train the model
vae.fit(X_train, X_train, epochs=50, batch_size=32, validation_split=0.1, verbose=1)


Epoch 1/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 136ms/step - loss: 0.7143 - val_loss: 0.7899
Epoch 2/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.6770 - val_loss: 0.7525
Epoch 3/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.6476 - val_loss: 0.6999
Epoch 4/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.6346 - val_loss: 0.7197
Epoch 5/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.6480 - val_loss: 0.7303
Epoch 6/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.6114 - val_loss: 0.7242
Epoch 7/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 0.6319 - val_loss: 0.7205
Epoch 8/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 0.6258 - val_loss: 0.7162
Epoch 9/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x7b9d6d01cfa0>

In [13]:
vae.save("vae_anomaly_detection_model.h5")

# Save scaler for deployment
with open("scaler.pkl", "wb") as f:
    pickle.dump({"scaler": scaler}, f)

print("Model and scaler saved successfully!")




Model and scaler saved successfully!


In [15]:
# Generate a separate test dataset with 7 features (to match the training dataset)
np.random.seed(42)
test_data = np.vstack([
    np.random.normal(loc=0.5, scale=0.1, size=(50, 7)),  # Normal data
    np.random.normal(loc=1.5, scale=0.2, size=(10, 7))   # Data with anomalies
])

# Normalize the test data using the same scaler
test_data = scaler.transform(test_data)

# Test the VAE on the new test dataset
reconstructed = vae.predict(test_data)
reconstruction_errors = np.mean(np.square(test_data - reconstructed), axis=1)

# Determine threshold for anomaly detection
threshold = np.percentile(reconstruction_errors, 95)  # Use 95th percentile

# Classify anomalies based on reconstruction error
predictions = (reconstruction_errors > threshold).astype(int)

# Identify anomalies and highlight tuples
anomalous_tuples = []
anomalous_attributes = []
for i, (original, recon) in enumerate(zip(test_data, reconstructed)):
    if predictions[i] == 1:  # If the instance is an anomaly
        attribute_diff = np.abs(original - recon)
        significant_attributes = np.where(attribute_diff > 0.2)[0]  # Threshold for significant deviation
        anomalous_tuples.append((i, original))
        anomalous_attributes.append((i, significant_attributes))

# Output results
print("Anomaly Detection Results:")
for i, attributes in anomalous_attributes:
    print(f"Instance {i} is an anomaly. Significant deviations in attributes: {attributes}")

# Save the test dataset and results
import pandas as pd
test_df = pd.DataFrame(test_data, columns=[f"Feature_{i+1}" for i in range(test_data.shape[1])])
test_df["Anomaly"] = predictions
test_df.to_csv("test_dataset_results.csv", index=False)

print("\nTest dataset results saved as 'test_dataset_results.csv'.")




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 809ms/step
Anomaly Detection Results:
Instance 2 is an anomaly. Significant deviations in attributes: [0 1 2 3 4 5 6]
Instance 23 is an anomaly. Significant deviations in attributes: [0 1 2 3 4 5 6]
Instance 31 is an anomaly. Significant deviations in attributes: [0 1 2 3 4 5 6]

Test dataset results saved as 'test_dataset_results.csv'.


In [17]:
# Function to evaluate a specific tuple
def evaluate_tuple(input_tuple):
    # Normalize the input tuple using the same scaler
    normalized_tuple = scaler.transform([input_tuple])

    # Reconstruct using the VAE
    reconstructed_tuple = vae.predict(normalized_tuple)
    reconstruction_error = np.mean(np.square(normalized_tuple - reconstructed_tuple))

    # Check if it's an anomaly
    is_anomalous = reconstruction_error > threshold
    significant_attributes = np.where(np.abs(normalized_tuple - reconstructed_tuple) > 0.2)[1]

    print("\nTuple Evaluation Results:")
    print(f"Input Tuple: {input_tuple}")
    print(f"Reconstruction Error: {reconstruction_error:.4f}")
    print(f"Anomaly Status: {'Anomalous' if is_anomalous else 'Normal'}")
    if is_anomalous:
        print(f"Significant Deviations in Attributes: {significant_attributes.tolist()}")

# Example usage for a specific tuple with 7 features
# Replace this with your actual features (ensuring 7 values)
sample_tuple = [0.6, 0.7, 0.5, 0.8, 0.4, 0.9, 0.5]
evaluate_tuple(sample_tuple)

# Example usage for a specific tuple
'''
'Chemical_Composition_Compliance (%): 60% normalized compliance.
Packaging_Compliance_Score (1-10): 70% normalized compliance.
Regulatory_Benchmark_Score (%): 50% normalized compliance.
Predicted_Compliance_Score (%): 80% normalized prediction.
Potency (%): 40% normalized strength.
Purity (%): 90% normalized quality.
Supplier_Reliability_Score (1-100): 50% normalized reliability..
'''


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step

Tuple Evaluation Results:
Input Tuple: [0.6, 0.7, 0.5, 0.8, 0.4, 0.9, 0.5]
Reconstruction Error: 67.8078
Anomaly Status: Normal




"\n'Chemical_Composition_Compliance (%): 60% normalized compliance.\nPackaging_Compliance_Score (1-10): 70% normalized compliance.\nRegulatory_Benchmark_Score (%): 50% normalized compliance.\nPredicted_Compliance_Score (%): 80% normalized prediction.\nPotency (%): 40% normalized strength.\nPurity (%): 90% normalized quality.\nSupplier_Reliability_Score (1-100): 50% normalized reliability..\n"