In [None]:
# Install dependencies if missing (uncomment if needed)
# %pip install tensorflow pandas numpy scikit-learn joblib matplotlib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Input
import joblib

In [None]:
# Suppress TensorFlow warnings.
# NOTE: TF_CPP_MIN_LOG_LEVEL is generally only honoured by the native (C++) logger
# when it is set *before* `import tensorflow`. TensorFlow is already imported by the
# first cell, so the Python-side TensorFlow logger is lowered explicitly as well.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.get_logger().setLevel('ERROR')
print(f"TensorFlow Version: {tf.__version__}")

In [None]:
# Read the two processed transaction datasets (AG and MAL) from disk.
data_dir = '../data/processed/'
ag_df, mal_df = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('data_set_AG_1.csv', 'data_set_MAL_1.csv')
)

# Work on copies so the loaded frames stay untouched:
# the AG frame becomes the training set, the MAL frame the test set.
train, test = ag_df.copy(), mal_df.copy()
train.shape, test.shape

In [None]:
# Feature columns kept for scaling; the non-numeric columns ('type', 'sender', 'to')
# are left out. 'success' is cast to int below.
numeric_cols = [
    'approveAmount', 'transferAmount', 'transferFromAmount',
    'oldApproveState', 'newApproveState',
    'oldBalanceState', 'newBalanceState',
    'success',
]

# Build an independent feature frame with 'success' stored as an integer column.
train_data = ag_df[numeric_cols].astype({'success': int})

# Min-max scale every feature (neural networks train poorly on unscaled inputs).
# NOTE: `scaler` and `X_train` are re-assigned by the later cell that re-fits a
# MinMaxScaler on `train`.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(train_data)
print(f"Data Scaled. Shape: {X_train.shape}")

In [None]:
# Display the training frame (in top-to-bottom execution this is still the plain copy of `ag_df`).
train

In [None]:
# Create an empty dict and print it; it is not referenced again in the visible cells.
from_cumm = dict()
print(from_cumm)


In [None]:
# Numeric codes for the three transaction-type labels.
type_replace = {"transfer": 0.0, "transferFrom": 1.0, "approve": 2.0}
# DataFrame.replace with a dict substitutes these labels wherever they occur in `train`.
train_type_replaced = train.replace(to_replace=type_replace)

In [None]:
# Inspect the column dtypes after the label replacement.
train_type_replaced.dtypes

In [None]:
# Remove the 'to' and 'sender' columns; `train` now refers to the encoded,
# reduced frame instead of the raw copy of `ag_df`.
train = train_type_replaced.drop(columns=['to', 'sender'])

In [None]:
# Display the training frame after the type encoding and the removal of 'to'/'sender'.
train

In [None]:
# Check the dtypes of the remaining columns.
train.dtypes

In [None]:
# Disabled leftover code (it refers to a bearing-sensor dataset file and is not executed):
# transform data file index to datetime and sort in chronological order
# train.index = pd.to_datetime(train.index, format='%Y.%m.%d.%H.%M.%S')
# train = train.sort_index()
# train.to_csv('Averaged_BearingTest_Dataset.csv')
# print("Dataset shape:", train.shape)
# Preview the first rows of the training frame.
train.head()

In [None]:
# Overlay the transaction-type code and the three amount columns of the training frame.
# `animated=True` was removed from the plot calls: Matplotlib excludes animated
# artists from the regular figure draw, so the lines did not show up in the static output.
fig, ax = plt.subplots(figsize=(14, 6), dpi=80)
ax.plot(train['type'], label='type', color='blue', linewidth=1)
ax.plot(train['approveAmount'], label='approveAmount', color='red', linewidth=1)
ax.plot(train['transferAmount'], label='transferAmount', color='green', linewidth=1)
ax.plot(train['transferFromAmount'], label='transferFromAmount', color='black', linewidth=1)
ax.set_title('Training data features', fontsize=16)
ax.set_xlabel('Row index')
ax.set_ylabel('Value')
ax.legend(loc='lower left')
plt.show()

In [None]:
# transforming data from the time domain to the frequency domain using fast Fourier transform
# train_fft = np.fft.fft(train)


In [None]:
# fig, ax = plt.subplots(figsize=(14, 6), dpi=80)
# ax.plot(train_fft[:,0].real, label='Bearing 1', color='blue', animated = True, linewidth=1)
# ax.plot(train_fft[:,1].imag, label='Bearing 2', color='red', animated = True, linewidth=1)
# ax.plot(train_fft[:,2].real, label='Bearing 3', color='green', animated = True, linewidth=1)
# ax.plot(train_fft[:,3].real, label='Bearing 4', color='black', animated = True, linewidth=1)
# plt.legend(loc='lower left')
# ax.set_title('Bearing Sensor Training Frequency Data', fontsize=16)
# plt.show()

In [None]:
# Min-max normalise the prepared training frame.
# This re-binds `scaler` and `X_train`, replacing the objects created by the
# earlier feature-selection cell.

from sklearn.preprocessing import MinMaxScaler
import joblib

scaler = MinMaxScaler().fit(train)
X_train = scaler.transform(train)
# X_test = scaler.transform(test)

# Persist the fitted scaler in the current working directory; joblib.dump returns
# the list of written file names, which is shown as the cell output.
scaler_filename = "scaler_data"
joblib.dump(scaler, scaler_filename)

In [None]:
# Shape of the scaled training matrix.
X_train.shape

In [None]:
def generate_sequence(data, sequence_length):
    """Slice ``data`` into overlapping windows of ``sequence_length`` consecutive rows.

    Window ``i`` holds rows ``i .. i + sequence_length - 1``; consecutive windows
    are shifted by exactly one row.

    Args:
        data: Array-like whose first axis is the row axis (e.g. the scaled
            feature matrix).
        sequence_length: Number of consecutive rows per window; must be >= 1.

    Returns:
        np.ndarray of shape
        ``(len(data) - sequence_length + 1, sequence_length, *data.shape[1:])``.
        When ``data`` has fewer rows than ``sequence_length`` the first dimension
        is 0 but the remaining dimensions are kept, so callers can still read
        the window length and feature count from ``.shape``.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    rows = np.asarray(data)
    n_windows = rows.shape[0] - sequence_length + 1
    if n_windows <= 0:
        # Not enough rows for a single window: return an empty, correctly shaped array.
        return np.empty((0, sequence_length) + rows.shape[1:], dtype=rows.dtype)

    # Broadcasting window start offsets against in-window offsets yields an
    # (n_windows, sequence_length) index grid; indexing with it gathers all
    # windows in one vectorised step and returns an independent copy.
    window_index = np.arange(n_windows)[:, None] + np.arange(sequence_length)[None, :]
    return rows[window_index]

In [None]:
# Number of consecutive rows per window; reused for the test windows and for
# several layer sizes in the model-definition cell.
SEQUENCE_LENGTH = 20

# Sliding windows over the scaled training matrix; these are the autoencoder's inputs and targets.
a = generate_sequence(X_train, SEQUENCE_LENGTH)

In [None]:
# Shape of the scaled matrix, for comparison with the windowed shape printed next.
X_train.shape

In [None]:
# Shape of the windowed training data.
print(a.shape)

In [None]:
# NOTE: Sequential and the layer classes are re-imported here from the `keras`
# package, rebinding the names the first cell imported from `tensorflow.keras`.
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Input, Dense, RepeatVector, TimeDistributed
# `CuDNNLSTM` is just another name for the regular LSTM layer; the
# model-definition cell builds its recurrent layers through this alias.
CuDNNLSTM = LSTM

In [None]:
# LSTM autoencoder: encode each window into a single vector, repeat that vector
# once per time step, then decode back to one output per feature and time step.
model = Sequential([
    # Input windows have the per-sample shape of `a`: (time steps, features).
    Input(shape=a.shape[1:]),
    # Encoder
    CuDNNLSTM(64, return_sequences=True),
    CuDNNLSTM(SEQUENCE_LENGTH, return_sequences=False),
    Dense(SEQUENCE_LENGTH),
    # Bridge: repeat the encoded vector SEQUENCE_LENGTH times
    RepeatVector(SEQUENCE_LENGTH),
    # Decoder
    CuDNNLSTM(SEQUENCE_LENGTH, return_sequences=True),
    CuDNNLSTM(64, return_sequences=True),
    # One Dense projection per time step, back to the input feature count
    TimeDistributed(Dense(a.shape[2])),
])

In [None]:
# Train with the Adam optimiser on mean absolute error, then print the layer summary.
model.compile(loss='mae', optimizer='adam')
model.summary()

In [None]:
# Fit the autoencoder to reproduce its own input windows; 5% of the windows are
# held out for validation. Only the per-epoch metrics dict is kept in `history`.
history = model.fit(
    x=a,
    y=a,
    batch_size=1,
    epochs=3,
    validation_split=0.05,
).history

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots(dpi=80, figsize=(14, 6))
ax.plot(history['loss'], 'b', linewidth=2, label='Train')
ax.plot(history['val_loss'], 'r', linewidth=2, label='Validation')
ax.set_title('Model loss', fontsize=16)
ax.set(xlabel='Epoch', ylabel='Loss (mae)')
ax.legend(loc='upper right')
plt.show()

In [None]:
# use the already-loaded `mal_df` from processed data
# (fresh copy, so the steps below start from the unmodified frame)
test = mal_df.copy()

In [None]:
# Apply the same label-to-code mapping that was used for the training frame.
test_type_replaced = test.replace(type_replace)

In [None]:
# Inspect the column dtypes after the label replacement.
test_type_replaced.dtypes

In [None]:
# Drop the same two columns as for training; `test` now refers to the reduced frame.
test = test_type_replaced.drop(columns=['to', 'sender'], axis=1)

In [None]:
# Check the dtypes of the remaining columns.
test.dtypes

In [None]:
# Scale with the scaler fitted on the training frame (transform only, no re-fitting).
X_test = scaler.transform(test)

In [None]:
# Shape of the scaled test matrix.
X_test.shape

In [None]:
# Sliding windows over the scaled test matrix, same window length as for training.
b = generate_sequence(X_test, SEQUENCE_LENGTH)

In [None]:
# Shape of the windowed test data.
print(b.shape)

In [None]:
# Reconstruct the test windows with the trained autoencoder.
b_pred = model.predict(b)

In [None]:
# Input window shape, for comparison with the reconstruction shape below.
b.shape

In [None]:
# Reconstruction shape.
b_pred.shape

In [None]:
# Mean absolute reconstruction error along axis 1 of the windows
# (the window-length axis produced by generate_sequence).
loss = np.mean(np.abs(b_pred-b), axis = 1)

In [None]:
# Shape of the averaged error array.
loss.shape

In [None]:
# Sum the averaged errors along axis 1 to get a single combined score per window.
combined_loss_b = np.sum(loss, axis=1)

In [None]:
# Combined reconstruction loss of the malicious windows, one point per window.
fig, ax = plt.subplots(dpi=80, figsize=(14, 6))

ax.plot(combined_loss_b, 'r', linewidth=1, label='combined_loss_b')

ax.set_title('malicious data reconstruction loss', fontsize=16)
ax.set(xlabel='TXn', ylabel='Loss (mae)')
ax.legend(loc='upper right')
# Optional fixed y-range:
# plt.ylim(0, 1)
plt.show()

In [None]:
# Reconstruct the training windows with the trained autoencoder.
a_pred = model.predict(a)

In [None]:
# Mean absolute reconstruction error along axis 1 of the windows
# (the window-length axis produced by generate_sequence).
loss_a = np.mean(np.abs(a_pred - a), axis = 1)

In [None]:
# Sum the averaged errors along axis 1 to get a single combined score per window.
combined_loss_a = np.sum(loss_a, axis=1)

In [None]:
# loss_a[:, 1]

In [None]:
# Combined reconstruction loss of the training (good) windows, one point per window.
fig, ax = plt.subplots(dpi=80, figsize=(14, 6))

ax.plot(combined_loss_a, 'g', linewidth=1, label='combined_loss_a')

ax.set_title('good data reconstruction loss', fontsize=16)
ax.set(xlabel='TXn', ylabel='Loss (mae)')
ax.legend(loc='upper right')
# Optional fixed y-range:
# plt.ylim(0, 1)
plt.show()

In [None]:
# Overlay the combined reconstruction losses of the benign and malicious windows.
fig, ax = plt.subplots(figsize=(14, 6), dpi=80)

ax.plot(combined_loss_a, 'g', label='Benign Transactions', linewidth=1)
ax.plot(combined_loss_b, 'r', label='Malicious Transactions', linewidth=1)

# The window length in the title is taken from SEQUENCE_LENGTH so the caption
# cannot go stale when the constant is changed.
ax.set_title(
    f'Malicious vs Benign reconstruction loss - sequence length={SEQUENCE_LENGTH}',
    fontsize=16,
)
ax.set_ylabel('Loss (mae)')
ax.set_xlabel('TXn')
ax.legend(loc='upper right')
# plt.ylim(0, 1)
plt.show()

In [None]:
from tensorflow.keras.models import save_model
import joblib

# Save Model
# Create the target directory first: saving fails when '../models' does not exist yet.
model_path = '../models/fraud_detection_model.h5'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

# Save Scaler (If you re-trained the scaler)
# joblib.dump(scaler, '../models/scaler.pkl')

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns

# --- 1. DETERMINE THRESHOLD ---
# The threshold is mean + 2 standard deviations of the reconstruction error on the
# benign training windows. Any window with a higher error is treated as "Anomalous".
# `a` holds the training windows built with generate_sequence, i.e. exactly the
# data the model was fitted on.
X_train_seq = a
X_train_pred = model.predict(X_train_seq)
# Error averaged over the window-length axis -> one value per (window, feature)
train_mae_loss = np.mean(np.abs(X_train_pred - X_train_seq), axis=1)
# One score per window. The threshold is derived from the same per-window statistic
# that is compared against it below, so both sides of the comparison are consistent.
train_scores = np.mean(train_mae_loss, axis=1)
threshold = np.mean(train_scores) + (2 * np.std(train_scores))
print(f"Calculated Threshold: {threshold:.4f}")

# --- 2. EVALUATE ON "TEST" DATA (Malicious + Benign Mixed) ---
# Let's create a mixed test set to see how well it catches fraud
# (In a real scenario, you would keep a separate benign test set, but for this demo
# the benign part is the training data itself, so benign results are optimistic.)

# Reuse the malicious matrix and windows prepared earlier (`X_test`, `b`): they went
# through the same encoding, column drop and scaler as the training data.
# Transforming mal_df[numeric_cols] with `scaler` would fail, because the scaler was
# re-fitted on the full training frame, which also contains the 'type' column.
X_mal = X_test
X_mal_seq = b

# Ground Truth: 0 = Benign, 1 = Fraud
# All benign (training) windows plus all malicious windows
n_benign = len(X_train_seq)
n_malicious = len(X_mal_seq)

# Combine Real Data
X_test_seq = np.concatenate([X_train_seq, X_mal_seq])
y_true = np.concatenate([np.zeros(n_benign), np.ones(n_malicious)])

# --- 3. RUN PREDICTION ---
X_test_pred = model.predict(X_test_seq)
test_mae_loss = np.mean(np.abs(X_test_pred - X_test_seq), axis=1)
test_scores = np.mean(test_mae_loss, axis=1)

# Logic: If Loss > Threshold -> Prediction is 1 (Fraud)
y_pred = (test_scores > threshold).astype(int)

# --- 4. PRINT METRICS ---
print("\n--- FINAL MODEL PERFORMANCE ---")
print(f"Accuracy:  {accuracy_score(y_true, y_pred):.4f}")
print(f"Precision: {precision_score(y_true, y_pred):.4f}")
print(f"Recall:    {recall_score(y_true, y_pred):.4f}")
print(f"F1-Score:  {f1_score(y_true, y_pred):.4f}")

# --- 5. PLOT CONFUSION MATRIX ---
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Benign', 'Fraud'], yticklabels=['Benign', 'Fraud'])
plt.title('Confusion Matrix')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.show()