In [16]:
!pip install numpy pandas opencv-python tensorflow pycryptodome scikit-learn matplotlib tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [70]:
import numpy as np
import pandas as pd

data = {
    "ID": np.arange(1, 100001),
    "Name": [f"User_{i}" for i in range(1, 100001)],
    "SSN": np.random.randint(100000000, 999999999, 100000),
    "Salary": np.random.randint(40000, 120000, 100000),
    "Credit_Score": np.random.randint(300, 850, 100000),
    "Is_Leaked": np.random.choice([0, 1], size=100000, p=[0.95, 0.05])  # 5% are leaks
}

df = pd.DataFrame(data)
df.to_csv("sensitive_data_large.csv", index=False)
df.head()

Unnamed: 0,ID,Name,SSN,Salary,Credit_Score,Is_Leaked
0,1,User_1,423614846,79245,502,0
1,2,User_2,771612094,65679,518,0
2,3,User_3,381644090,47631,719,0
3,4,User_4,573099529,43940,478,0
4,5,User_5,861316114,107414,722,0


In [72]:
def inject_fake_data(df, num_fake=1000):
    fake_entries = df.sample(num_fake).copy()
    fake_entries["ID"] += 1000000 
    fake_entries["Name"] = ["Fake_User_" + str(i) for i in range(1, num_fake + 1)]
    fake_entries["Is_Leaked"] = 1 
    
    df = pd.concat([df, fake_entries], ignore_index=True)
    df.to_csv("sensitive_data_with_fakes.csv", index=False)
    print(f"Injected {num_fake} fake records.")
    return df

df_fake = inject_fake_data(df)
df_fake.tail(10)

Injected 1000 fake records.


Unnamed: 0,ID,Name,SSN,Salary,Credit_Score,Is_Leaked
100990,1057484,Fake_User_991,363958632,50603,842,1
100991,1037649,Fake_User_992,862727330,80307,837,1
100992,1022680,Fake_User_993,513341387,106496,354,1
100993,1072359,Fake_User_994,229064921,40013,492,1
100994,1008293,Fake_User_995,397979546,66220,305,1
100995,1024604,Fake_User_996,136041075,110700,565,1
100996,1046120,Fake_User_997,642381600,50009,613,1
100997,1027862,Fake_User_998,225821852,81824,453,1
100998,1056233,Fake_User_999,436852506,76508,767,1
100999,1078440,Fake_User_1000,618710913,118312,727,1


In [74]:
from Crypto.Cipher import AES
from Crypto.Random import get_random_bytes
import os

def pad_data(data):
    return data + b" " * (16 - len(data) % 16)  

def generate_key():
    key = get_random_bytes(32) 
    with open("encryption_key.key", "wb") as key_file:
        key_file.write(key)
    return key

def encrypt_data(file_path, output_path, key_path="encryption_key.key"):
    key = open(key_path, "rb").read()
    cipher = AES.new(key, AES.MODE_EAX)

    with open(file_path, "rb") as file:
        file_data = file.read()

    encrypted_data, tag = cipher.encrypt_and_digest(pad_data(file_data))

    with open(output_path, "wb") as encrypted_file:
        encrypted_file.write(cipher.nonce + encrypted_data)
    print(f"Encrypted file saved at {output_path}")

key = generate_key()
encrypt_data("sensitive_data_with_fakes.csv", "encrypted_data.csv")


Encrypted file saved at encrypted_data.csv


In [76]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("sensitive_data_with_fakes.csv")

X = df[["Salary", "Credit_Score"]].values
y = df["Is_Leaked"].values

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = keras.Sequential([
    keras.layers.Dense(64, activation="relu", input_shape=(X_train.shape[1],)),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

history = model.fit(X_train, y_train, epochs=10, batch_size=256, validation_data=(X_test, y_test))

accuracy = model.evaluate(X_test, y_test)[1] * 100
print(f"Leak Detection Accuracy: {accuracy:.2f}%")


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 778us/step - accuracy: 0.9401 - loss: 0.3097 - val_accuracy: 0.9420 - val_loss: 0.2245
Epoch 2/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 609us/step - accuracy: 0.9412 - loss: 0.2258 - val_accuracy: 0.9420 - val_loss: 0.2219
Epoch 3/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 629us/step - accuracy: 0.9413 - loss: 0.2239 - val_accuracy: 0.9420 - val_loss: 0.2219
Epoch 4/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 664us/step - accuracy: 0.9403 - loss: 0.2264 - val_accuracy: 0.9420 - val_loss: 0.2221
Epoch 5/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 616us/step - accuracy: 0.9402 - loss: 0.2269 - val_accuracy: 0.9420 - val_loss: 0.2219
Epoch 6/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 626us/step - accuracy: 0.9415 - loss: 0.2230 - val_accuracy: 0.9420 - val_loss: 0.2226
Epoch 7/10
[1m316/316[0m 

In [82]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211us/step
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     19028
           1       0.00      0.00      0.00      1172

    accuracy                           0.94     20200
   macro avg       0.47      0.50      0.49     20200
weighted avg       0.89      0.94      0.91     20200

[[19028     0]
 [ 1172     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [84]:
import time

start_time = time.time()
_ = model.predict(X_test)
detection_time = time.time() - start_time

print(f"Leak Detection Time: {detection_time:.4f} seconds")

[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 221us/step
Leak Detection Time: 0.2432 seconds
