In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from keras import layers, models
from keras.optimizers import Adam

In [None]:
# -extract data-
# Read the CSV, discard every column containing a missing value, remove the
# `id` column, and keep only the rows whose diagnosis is not 'B'.
data = (
    pd.read_csv('C:\\Users\\Lenovo\\Desktop\\Cancer_Data.csv')
    .dropna(axis=1, how='any')
    .drop(columns='id')
    .loc[lambda frame: frame['diagnosis'] != 'B']
)
print(data)

In [None]:
# -normalize-
# Keep only the numeric feature columns. `data` is filtered on the
# string-valued `diagnosis` column but that column is never dropped, and the
# scaler cannot convert its values to float.
features = data.select_dtypes(include='number')

# -training and testing sets-
# Split BEFORE scaling so the scaler's per-feature min/max are learned from
# the training rows only; fitting on the full table leaks test-set statistics
# into training.
features_train, features_test = train_test_split(features, test_size=0.2, random_state=42)

# MinMaxScaler maps each training feature into [0, 1], which matches the range
# of the decoder's sigmoid output layer (StandardScaler is the alternative).
scaler = MinMaxScaler()
X_train = scaler.fit_transform(features_train)
# Test rows reuse the training min/max, so a value may land slightly outside
# [0, 1]; such rows simply reconstruct with a larger error.
X_test = scaler.transform(features_test)

# Kept for backward compatibility with any later cell that reads the fully
# scaled table (same row order as `data`).
data_scaled = scaler.transform(features)

# -autoencoder architecture-
# Symmetric dense autoencoder: input -> 64 -> 32 -> 32 -> 8 -> 32 -> 32 -> 64 -> input.
input_dim = X_train.shape[1]

# -encoder-
input_layer = layers.Input(shape=(input_dim,))
encoded = input_layer
for units in (64, 32, 32):
    encoded = layers.Dense(units, activation='relu')(encoded)
# 8-unit bottleneck that the decoder reconstructs from.
latent_space = layers.Dense(8, activation='relu')(encoded)

# -decoder-
decoded = latent_space
for units in (32, 32, 64):
    decoded = layers.Dense(units, activation='relu')(decoded)
# Sigmoid keeps every reconstructed feature inside (0, 1).
output_layer = layers.Dense(input_dim, activation='sigmoid')(decoded)

# -model-
# Wire the input and reconstructed output into a trainable model.
autoencoder = models.Model(inputs=input_layer, outputs=output_layer)
learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)
# Pass the configured optimizer instance. The string 'adam' makes Keras build
# a fresh Adam with its default learning rate, silently ignoring
# `learning_rate` above.
autoencoder.compile(optimizer=optimizer, loss='mse')

In [None]:
# -training-
# The autoencoder learns to reproduce its own input, so the features serve as
# both input and target; the held-out split is used for validation.
autoencoder.fit(
    X_train,
    X_train,
    epochs=200,
    batch_size=16,
    validation_data=(X_test, X_test),
    verbose=1,
)

# Per-sample reconstruction error: mean squared residual across the features.
reconstructed = autoencoder.predict(X_test)
residuals = X_test - reconstructed
reconstruction_errors = np.square(residuals).mean(axis=1)

In [None]:
# Coefficient of determination between the held-out rows and their reconstructions.
r2 = r2_score(y_true=X_test, y_pred=reconstructed)
print(f"R² Score: {r2}")

In [None]:
# -threshold for anomalies-
# Flag every sample whose reconstruction error lies above the 95th percentile
# of the errors, i.e. the top 5%.
threshold = np.percentile(reconstruction_errors, 95)
anomalies = reconstruction_errors > threshold
print("Anomalies detected:", anomalies.sum())

# Scaled feature rows of the flagged samples.
anomalous_data = X_test[anomalies]
print("Anomalous data points:")
print(anomalous_data)

# Positions of the flagged samples within X_test.
anomaly_indices = np.flatnonzero(anomalies)
print("Indices of anomalies:", anomaly_indices)

In [None]:
# -plot-
# Histogram of the per-sample reconstruction errors, with the anomaly
# threshold drawn as a dashed red vertical line.
fig, ax = plt.subplots()
ax.hist(reconstruction_errors, bins=30)
ax.set(
    xlabel="Reconstruction error",
    ylabel="Frequency",
    title="Reconstruction Error Distribution",
)
ax.axvline(threshold, color='red', linestyle='--', label='Threshold')
ax.legend()
# Save before showing so the written image contains the rendered figure.
fig.savefig('Breast Cancer Anomaly Detection.png')
plt.show()