# Importing Libraries

In [None]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ML Model Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import IsolationForest
from sklearn.utils.class_weight import compute_class_weight
import torch.nn.functional as F
from sklearn.tree import DecisionTreeClassifier

# Warnings
import warnings
warnings.filterwarnings('ignore')

# For GAN Implementation 
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# For AWS 
import boto3
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role

# Intro to Data

In [None]:
import os
import urllib.request

# URL to the dataset
url = "https://www.kaggle.com/api/v1/datasets/download/rupakroy/online-payments-fraud-detection-dataset"

# Destination path to save the file
destination = 'data_set.zip'

# Download the archive only when it is not already on disk, so that re-running
# the notebook does not fetch the whole dataset again.
if os.path.exists(destination):
    print(f"File already present at {destination}; skipping download")
else:
    # Download under a temporary name first: an interrupted transfer then never
    # leaves a truncated file behind under the final name.
    partial_path = destination + '.part'
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, destination)
    print(f"File downloaded successfully and saved as {destination}")

In [None]:
!mkdir data
!unzip data_set.zip -d data/

In [None]:
# Read the extracted CSV into the working DataFrame `df` and show its first rows.
df = pd.read_csv('data/Data_Set.csv')
df.head()

In [None]:
# df.shape is a (rows, columns) tuple.
print("Number of rows and columns:", df.shape)


In [None]:
# Summary statistics from describe(), transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Data type of every column in df.
df.dtypes

# Handling the NULL Values

In [None]:
# Report whether any cell is missing; if so, discard the incomplete rows in place.
has_missing = df.isnull().values.any()
if not has_missing:
    print('GREAT, There is no missing values in this dataset')
else:
    print('There are some missing values in this dataset\n')
    df.dropna(inplace=True)
    print('Shape : ', df.shape)

# EDA of categories of the "type" feature


The 'type' feature contains the transaction categories, which are essential for the model to classify the data into the right class. Label encoding is therefore applied to the 'type' feature.

In [None]:
# Distinct values found in the 'type' column.
print("Labels in feature type :- ",df['type'].unique())


In [None]:
# Number of rows per transaction type, printed and drawn as a bar chart.
type_counts = df['type'].value_counts()
print("The counts of each category :- ", type_counts)

fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(data=df, x='type', color='skyblue', ax=ax)
ax.set_title('type vs counts')
ax.set_xlabel('Type')
ax.set_ylabel('Counts')
ax.grid(axis='y', alpha=1)
plt.show()

<h2>Label Encoding</h2>

In [None]:
# Integer code assigned to each transaction type.
type_codes = {'CASH_OUT': 0, 'PAYMENT': 1, 'CASH_IN': 2, 'TRANSFER': 3, 'DEBIT': 4}

# Assign the encoded column back to the frame. Calling replace(..., inplace=True)
# on df['type'] acts on an intermediate Series (chained assignment) and is not
# guaranteed to update df itself.
# Values without an entry in type_codes (including codes produced by an earlier
# run of this cell) are kept unchanged, exactly as replace() would do.
df['type'] = df['type'].map(lambda label: type_codes.get(label, label))
df['type'].value_counts()

In [None]:
# First five rows, to inspect the 'type' column after encoding.
df.head(5)

# Drop unnecessary columns

The columns dropped below do not add any information during the training phase of the model.

In [None]:
# Remove the two name columns from df in place, then list what is left.
identifier_columns = ['nameOrig', 'nameDest']
df.drop(columns=identifier_columns, inplace=True)
df.columns

# EDA of categories of the "isFraud" feature

In [None]:
# Distinct values found in the 'isFraud' target column.
print("The class in 'isFraud' feature :- ",df['isFraud'].unique())

The class 0 represents normal transactions whereas 1 represents fraudulent transactions.

In [None]:
# Number of rows per 'isFraud' class.
df['isFraud'].value_counts()

From the above summary we can see that the data is highly imbalanced, where 0 represents a normal transaction and 1 represents a fraudulent one (an anomaly).
The dataset has more than 6.3 million records, most of them normal. To keep processing manageable, we keep every fraud case and only a random sample of the normal transactions.

In [None]:
# Keep every fraudulent transaction (class 1)
is_fraud = df['isFraud'] == 1
fraud_df = df[is_fraud]

# Draw a reproducible random subset of 10000 non-fraud transactions (class 0)
is_normal = df['isFraud'] == 0
non_fraud_df = df[is_normal].sample(n=10000, random_state=42)

# Stack both subsets, then shuffle the rows and renumber the index
combined = pd.concat([fraud_df, non_fraud_df])
df = combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Number of rows per 'isFraud' class after the sampling step.
df['isFraud'].value_counts()

# Standardization

In [None]:
# Columns that are standardized; all other columns are left as they are.
scaling_columns = ['amount', 'oldbalanceOrg', 'newbalanceDest']

# Fit the scaler on these columns, then write the scaled values back into df.
scaler = StandardScaler()
scaler.fit(df[scaling_columns])
df[scaling_columns] = scaler.transform(df[scaling_columns])

# Preview the transformed data
print(df.head())


# GAN implementation for Class Imbalance

What is a GAN?
A Generative Adversarial Network (GAN) consists of two neural networks:

    Generator: Creates fake (synthetic) data that resembles the real data.
    Discriminator: Tries to distinguish between real data (from the dataset) and fake data (from the generator).

They compete in a "game" where:

    The Generator improves at creating realistic data.
    The Discriminator gets better at identifying fake data.

Over time, the generator learns to produce data indistinguishable from the real dataset.

Leaky ReLU is a powerful activation function that improves upon the traditional ReLU by allowing small negative values to pass through. 

Optimizer is like a smart coach that helps neural networks learn better by adjusting their parameters (weights and biases) to minimize errors during training.

Adam optimizer is a popular and efficient type of optimizer that:

    Adapts the learning rate automatically for each parameter
    Combines the best aspects of two other optimization methods (RMSprop and Momentum)
    Works well for most deep learning tasks without requiring much tuning

In [None]:
# Define the Generator class
class Generator(nn.Module):
    """Fully connected generator that maps a noise vector to a synthetic sample.

    Args:
        input_dim: Width of the noise vector fed to the network.
        output_dim: Number of values in each generated sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_sizes = (128, 256)
        layers = []
        in_features = input_dim
        # Hidden stack: Linear followed by LeakyReLU(0.2) for each hidden width.
        for out_features in hidden_sizes:
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.LeakyReLU(0.2))
            in_features = out_features
        # Output layer; Tanh keeps every generated value inside [-1, 1].
        layers.append(nn.Linear(in_features, output_dim))
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)

    def forward(self, z):
        """Return the generated samples for the noise batch ``z``."""
        return self.model(z)

# Define the Discriminator class
class Discriminator(nn.Module):
    """Fully connected binary classifier that scores a sample between 0 and 1.

    Args:
        input_dim: Number of values in each sample passed to the network.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Hidden stack; dropout after the first activation acts as regularization.
        hidden_stack = [
            nn.Linear(input_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
        ]
        # Single output unit squashed by a sigmoid into the range (0, 1).
        scoring_head = [nn.Linear(128, 1), nn.Sigmoid()]
        self.model = nn.Sequential(*hidden_stack, *scoring_head)

    def forward(self, x):
        """Return one score per row of ``x``."""
        return self.model(x)

In [None]:
# Fraud-only view of the data
fraud_data = df.loc[df['isFraud'] == 1]

# Hyperparameters
latent_dim = 100              # width of the random noise vector fed to the generator
input_dim = len(df.columns)   # every column of df is part of a sample

# Networks
generator = Generator(latent_dim, input_dim)
discriminator = Discriminator(input_dim)

# Binary cross-entropy loss, and one Adam optimizer per network with identical settings
adversarial_loss = nn.BCELoss()
adam_settings = dict(lr=0.0002, betas=(0.5, 0.999))
optimizer_G = optim.Adam(generator.parameters(), **adam_settings)
optimizer_D = optim.Adam(discriminator.parameters(), **adam_settings)


<details>
<summary>GAN Training Process Overview</summary>

# GAN Training Process Overview

## Introduction
This code implements a Generative Adversarial Network (GAN) training process.

## Model Setup
- Uses Binary Cross-Entropy loss (BCELoss) for measuring how well the model is performing
- Creates two optimizers using Adam algorithm - one for the Generator and one for the Discriminator
- Learning rate is set to 0.0002 with specific beta parameters for stability

## Training Process
### The train_gan function:
- Takes number of epochs and batch size as inputs
- Runs in a loop for the specified number of epochs

### In each epoch:
#### Discriminator Training:
- Gets real data samples from dataset
- Creates fake data using the Generator
- Trains to correctly identify real data as 1 and fake data as 0
- Updates Discriminator weights based on its performance

#### Generator Training:
- Creates fake data
- Tries to fool the Discriminator by making fake data look real
- Updates Generator weights based on how well it fooled the Discriminator

## Monitoring
### Progress Tracking:
- Every 10 epochs, prints out the losses for both Generator and Discriminator

## Analogy
Think of it like a game where:
- Generator is an art forger trying to create fake paintings
- Discriminator is an art expert trying to spot the fakes
- They keep getting better at their jobs by competing with each other

## Outcome
This process helps create a model that can generate realistic-looking synthetic data similar to your original dataset.
</details>

In [None]:
def train_gan(epochs, batch_size):
    """Alternately update the notebook-level discriminator and generator.

    Each iteration draws ``batch_size`` fraud rows (with replacement) from the
    notebook-level ``df``, generates the same number of samples from random
    noise, performs one discriminator update and then one generator update.
    Losses are printed every 10 iterations.

    Args:
        epochs: Number of update iterations to run.
        batch_size: Number of real rows and of generated rows per iteration.

    Raises:
        ValueError: If ``df`` holds no rows with ``isFraud == 1``.
    """
    # The fraud subset does not change while training, so filter it and convert
    # it to a tensor once instead of on every iteration.
    fraud_df = df[df['isFraud'] == 1]
    if fraud_df.empty:
        raise ValueError("No rows with isFraud == 1 are available to train the GAN")
    real_pool = torch.tensor(fraud_df.values, dtype=torch.float)

    # Target labels are the same for every batch: 1 for real rows, 0 for generated rows.
    real_labels = torch.ones(batch_size, 1)
    fake_labels = torch.zeros(batch_size, 1)

    for epoch in range(epochs):
        # Train the Discriminator
        optimizer_D.zero_grad()

        # Real data - random batch of fraud rows
        fraud_indices = np.random.randint(0, real_pool.shape[0], batch_size)
        real_data = real_pool[torch.from_numpy(fraud_indices).long()]

        # Fake data produced by the generator from random noise
        noise = torch.randn(batch_size, latent_dim)
        fake_data = generator(noise)

        # Discriminator loss on real and fake data; detach() keeps this step
        # from propagating gradients into the generator.
        real_loss = adversarial_loss(discriminator(real_data), real_labels)
        fake_loss = adversarial_loss(discriminator(fake_data.detach()), fake_labels)
        d_loss = (real_loss + fake_loss) / 2

        # Backpropagation for discriminator
        d_loss.backward()
        optimizer_D.step()

        # Train the Generator
        optimizer_G.zero_grad()

        # Score the same generated batch again (not detached this time) against
        # the "real" labels: the generator is rewarded for fooling the discriminator.
        g_loss = adversarial_loss(discriminator(fake_data), real_labels)

        # Backpropagation for generator
        g_loss.backward()
        optimizer_G.step()

        # Print progress every 10 epochs
        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Discriminator Loss: {d_loss.item()}, Generator Loss: {g_loss.item()}')


In [None]:
# Run 1000 GAN update iterations with batches of 64 rows
# (adjust the number of epochs based on your computational resources)
gan_epochs, gan_batch_size = 1000, 64
train_gan(epochs=gan_epochs, batch_size=gan_batch_size)

def generate_synthetic_data(generator, num_samples, noise_dim=None):
    """Sample rows from a generator network and return them as a NumPy array.

    The values are returned exactly as the generator emits them; no inverse
    scaling is applied.

    Args:
        generator: Network (or any callable) mapping a
            ``(num_samples, noise_dim)`` noise tensor to generated samples.
        num_samples: Number of rows to generate.
        noise_dim: Width of the noise vector. When omitted, the notebook-level
            ``latent_dim`` is used.

    Returns:
        numpy.ndarray with one generated sample per row.
    """
    if noise_dim is None:
        noise_dim = latent_dim
    noise = torch.randn(num_samples, noise_dim)
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        synthetic_data = generator(noise).detach().cpu().numpy()
    return synthetic_data

# Generate exactly as many synthetic fraud rows as are needed to level the two
# classes (non-fraud count minus fraud count) rather than a fixed number, so the
# cell stays correct if the sampling sizes above change.
class_counts = df['isFraud'].value_counts()
num_synthetic = max(int(class_counts.get(0, 0) - class_counts.get(1, 0)), 0)
synthetic_data = generate_synthetic_data(generator, num_samples=num_synthetic)
print(synthetic_data[:5])  # Display the first 5 synthetic samples

# Check the shape
print("Shape of synthetic data:", synthetic_data.shape)

In [None]:
# Wrap the generated rows in a frame with the same columns as df and mark every
# generated row as fraudulent (isFraud = 1)
synthetic_df = pd.DataFrame(data=synthetic_data, columns=df.columns).assign(isFraud=1)

# Append the generated rows below the real ones
df = pd.concat([df, synthetic_df], ignore_index=True)

# Class counts after augmentation, to see whether the dataset is now balanced
fraud_counts = df['isFraud'].value_counts()
print(fraud_counts)

# Feature Extraction


In [None]:
# Pairwise correlations of all columns, drawn as an annotated heatmap on a fixed [-1, 1] colour scale.
print("Co-relation Matrix :- ")
correlations = df.corr()
heatmap_options = dict(annot=True, linewidths=0.9, fmt=".1f", vmin=-1, vmax=1, cmap='coolwarm')
plt.figure(figsize=(8, 8))
sns.heatmap(correlations, **heatmap_options)
plt.show()

# Dropping unnecessary columns based on correlation

In [None]:
# Number of rows per 'isFraud' class at this point of the notebook.
df['isFraud'].value_counts()

In [None]:
# Remove the two balance columns from df in place, then list what is left.
redundant_columns = ['newbalanceOrig', 'oldbalanceDest']
df.drop(columns=redundant_columns, inplace=True)
df.columns

In [None]:
# Column names, non-null counts and dtypes of the remaining columns.
df.info()

## Isolation Forest

Isolation Forest is a machine learning algorithm used mainly for anomaly detection (finding unusual or "outlier" data points that don't fit well with the rest of the data). It's especially helpful when you want to identify rare, unexpected events—like fraud detection or equipment failure prediction.

### How It Works:
1. **Random Splits**: The algorithm creates multiple decision trees by randomly splitting the data. Each split separates the data into smaller groups.
2. **Isolation of Anomalies**: Outliers (anomalies) are more isolated and separated from the majority of data points, so they need fewer splits to become "isolated" in a branch of a tree.
3. **Scoring**: The algorithm calculates a score based on how quickly a data point is isolated in a tree. Points that are isolated with fewer splits are considered more likely to be anomalies.

### What Is the Anomaly Score?
The anomaly score is a value between 0 and 1 that tells us how "isolated" a point is compared to the rest of the data. A higher anomaly score means the data point is more likely to be an anomaly (unusual or rare), while a lower score means it's similar to other points in the dataset.

### Score Calculation Formula:
![Anomaly Score Formula](Images/anomaly_score.png)

$$s(x, n) = 2^{-\frac{E(h(x))}{c(n)}}$$

#### Where:
- `E(h(x))`: The average path length of x across the trees
- `c(n)`: A scaling factor that makes scores easier to interpret regardless of dataset size. It adjusts the path length to account for the number of points in your dataset, so the anomaly score stays consistent even if you have a very large or very small dataset. Without c(n), scores could vary too much with different dataset sizes, making it hard to interpret them reliably.

  <details>
  <summary>Examples of Anomaly Score Calculation</summary>

  Example 1: Small Dataset (10 Points)
  Suppose you have a small dataset of 10 points. You want to calculate the anomaly score for a specific point, Point A.

  Path Length: Let's say it takes 3 splits on average to isolate Point A in this small dataset.

  Calculate c(n):
  For a dataset of 10 points, c(10) will be a smaller number—around 4.5 (since there are fewer points to split).

  Anomaly Score:
  s(x,n) = 2^(-3/4.5) ≈ 2^(-0.67) ≈ 0.63

  Example 2: Large Dataset (1000 Points)
  Now, let's say you have a much larger dataset with 1000 points and you want to check the same point, Point A.

  Path Length: In this large dataset, Point A might take 8 splits on average to be isolated (since there are more points to go through).

  Calculate c(n):
  For a dataset of 1000 points, c(1000) will be around 14.5 (larger because there are more points).

  Anomaly Score:
  s(x,n) = 2^(-8/14.5) ≈ 2^(-0.55) ≈ 0.68

  Why c(n) Matters:
  Without c(n), scores would vary a lot based on the dataset size.
  Here, Point A gets similar scores (around 0.63 and 0.68) in both datasets, thanks to c(n).
  </details>

### Interpreting the Anomaly Score:
- Score close to 1: Likely an anomaly
- Score close to 0: Likely a normal point

In [None]:
# Fit an Isolation Forest on two columns and store its verdict for every row
# in df['anomaly'] (1 or -1).
iso_features = ['amount', 'oldbalanceOrg']
model = IsolationForest(contamination=0.05, random_state=42)
df['anomaly'] = model.fit(df[iso_features]).predict(df[iso_features])

# Split the rows by verdict: 1 -> normal_data, -1 -> fraud_data
is_inlier = df['anomaly'] == 1
is_outlier = df['anomaly'] == -1
normal_data = df[is_inlier]
fraud_data = df[is_outlier]


<h3>Scatter Plot</h3>

In [None]:
# Amount against old origin balance, one colour per Isolation Forest verdict.
fig, ax = plt.subplots(figsize=(10, 6))
groups = ((normal_data, 'blue', 'Normal'), (fraud_data, 'red', 'Fraud'))
for subset, colour, name in groups:
    ax.scatter(subset['amount'], subset['oldbalanceOrg'],
               c=colour, label=name, alpha=0.5)
ax.set_title("Isolation Forest: Normal vs Fraud Transactions")
ax.set_xlabel("Amount")
ax.set_ylabel("Old Balance Org")
ax.legend()
plt.show()

<h3>3D Scatter Plot</h3>

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Same two groups as the 2D plot, with the new destination balance on the third axis.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

axis_columns = ('amount', 'oldbalanceOrg', 'newbalanceDest')
groups = ((normal_data, 'blue', 'Normal'), (fraud_data, 'red', 'Fraud'))
for subset, colour, name in groups:
    xs, ys, zs = (subset[column] for column in axis_columns)
    ax.scatter(xs, ys, zs, c=colour, label=name, alpha=0.5)

ax.set_xlabel('Amount')
ax.set_ylabel('Old Balance Org')
ax.set_zlabel('New Balance Dest')
ax.set_title("3D Scatter Plot: Normal vs Fraud Transactions")
ax.legend()
plt.show()


# Splitting the data into features and target, and into training and validation sets

In [None]:
# Feature matrix (four columns) and target vector as NumPy arrays
feature_columns = ["type", "amount", "oldbalanceOrg", "newbalanceDest"]
X = np.array(df.loc[:, feature_columns])
y = np.array(df.loc[:, "isFraud"])

# Shuffled 80/20 split with a fixed seed
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
x_train, x_val, y_train, y_val = train_test_split(X, y, **split_options)

In [None]:
# Print the shape of each of the four split arrays.
shape_report = (
    ('X_train shape is ', x_train),
    ('X_test shape is ', x_val),
    ('y_train shape is ', y_train),
    ('y_test shape is ', y_val),
)
for caption, array in shape_report:
    print(caption, array.shape)

# Moving Data into s3 Bucket 

In [None]:
# S3 client and target bucket used by the upload cell
s3_client = boto3.client('s3')
bucket_name = 'anomaly-detection-bucket'

# Rebuild labelled frames from the split arrays; the target goes in as the last column.
feature_names = ["type", "amount", "oldbalanceOrg", "newbalanceDest"]
train_df = pd.DataFrame(x_train, columns=feature_names).assign(isFraud=y_train)
val_df = pd.DataFrame(x_val, columns=feature_names).assign(isFraud=y_val)

In [None]:
# (frame, local file name, S3 object key) for each split
uploads = (
    (train_df, 'train_data.csv', 'data/train/train_data.csv'),
    (val_df, 'val_data.csv', 'data/val/val_data.csv'),
)

# Write both CSV files locally first (without the index column) ...
for frame, local_name, _ in uploads:
    frame.to_csv(local_name, index=False)

# ... then upload to S3
for _, local_name, object_key in uploads:
    s3_client.upload_file(local_name, bucket_name, object_key)

In [None]:
# S3 prefixes that hold the uploaded training and validation CSV files
train_data = 's3://{}/{}'.format(bucket_name, 'data/train')
val_data = 's3://{}/{}'.format(bucket_name, 'data/val')

# TrainingInput is the current SageMaker SDK class for describing an input
# channel; `sagemaker.session.s3_input` is its legacy name.
train_channel = sagemaker.inputs.TrainingInput(train_data, content_type='text/csv')
val_channel = sagemaker.inputs.TrainingInput(val_data, content_type='text/csv')

In [None]:
# Channel name -> input definition, as passed to the estimator's fit()
data_channels = dict(train=train_channel, validation=val_channel)

# Training the Classification Model

In [None]:
key = 'model/decision_tree_model'
s3_output_location = f's3://{bucket_name}/{key}'

# Create an SKLearn Estimator for Decision Tree.
# The spot / time-limit arguments use the same un-prefixed naming as
# `instance_count` and `instance_type`; the `train_`-prefixed spellings are the
# legacy names of these parameters.
# NOTE(review): confirm that 'ml.t2.medium' is accepted as a *training* instance
# type in your account/region.
dt_model = SKLearn(
    entry_point='train.py',  # Your training script
    role=get_execution_role(),  # IAM role for accessing resources
    instance_count=1,  # Number of training instances
    instance_type='ml.t2.medium',  # Type of instance
    framework_version='0.23-1',  # Version of Scikit-learn
    output_path=s3_output_location,  # Location to save model artifacts
    sagemaker_session=sagemaker.Session(),  # SageMaker session
    use_spot_instances=True,  # Run the job on spot capacity
    max_run=300,  # Maximum training time in seconds
    max_wait=600,  # Maximum total time in seconds, including waiting for spot capacity
)

# Set Hyperparameters
dt_model.set_hyperparameters(
    criterion='gini',  # Splitting criterion: "gini" or "entropy"
    max_depth=5,  # Maximum depth of the tree
    min_samples_split=2,  # Minimum samples required to split a node
    min_samples_leaf=1,  # Minimum samples required to be a leaf node
    random_state=42  # Seed for reproducibility
)

# Fit the model with the dataset
dt_model.fit(inputs=data_channels)

# Deploy the Model


In [None]:
predictor = dt_model.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium'
)

# Test the endpoint with one real validation row. The row has the same four
# columns, in the same order, as the training data (type, amount,
# oldbalanceOrg, newbalanceDest); a dict of made-up feature names does not
# match what the model was trained on.
response = predictor.predict(x_val[:1])
print(response)


# Model Analysis

In [None]:
# Name of the endpoint created by the deploy step above
endpoint_name = predictor.endpoint_name

# Load the validation split that was written to 'val_data.csv' before the upload;
# its target column is 'isFraud'.
val_data = pd.read_csv('val_data.csv')
X_val = val_data.drop('isFraud', axis=1)
y_val = val_data['isFraud']

# Initialize the SageMaker runtime client
runtime_client = boto3.client('sagemaker-runtime')

In [None]:
def get_predictions_from_endpoint(endpoint_name, data, client=None):
    """Send every row of ``data`` to an endpoint and collect the predictions.

    Each row is serialized as one comma-separated line and sent in its own
    request. The response is requested as CSV so that its body can be parsed
    as a single number.

    Args:
        endpoint_name: Name of the endpoint to invoke.
        data: pandas DataFrame with one feature row per request.
        client: Object exposing ``invoke_endpoint``. When omitted, the
            notebook-level ``runtime_client`` is used.

    Returns:
        numpy.ndarray of float predictions, one per row of ``data``.
    """
    if client is None:
        client = runtime_client
    predictions = []
    for _, row in data.iterrows():
        payload = row.values.tolist()
        response = client.invoke_endpoint(
            EndpointName=endpoint_name,
            ContentType="text/csv",  # Ensure this matches your endpoint's expected input
            Accept="text/csv",  # Ask for a CSV body; float() below cannot parse other formats
            Body=",".join(map(str, payload))
        )
        predictions.append(float(response['Body'].read().decode('utf-8')))
    return np.array(predictions)

# Get predictions: one endpoint request per row of X_val
y_pred = get_predictions_from_endpoint(endpoint_name, X_val)

In [None]:
# Text report comparing the true labels (y_val) with the endpoint predictions (y_pred).
print("\nClassification Report:")
print(classification_report(y_val, y_pred))


In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap.
cm = confusion_matrix(y_val, y_pred)
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=np.unique(y_val),
    yticklabels=np.unique(y_val),
)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, **heatmap_options)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

# Predicting some new values 

In [None]:
# Lets predict the transactions
# Column order: type, amount, oldbalanceOrg, newbalanceDest
features = np.array([[4, 9000.60, 9000.60, 0.00]])

# The three monetary columns were standardized before training, so apply the
# same fitted scaler to the new row.
features[:, 1:] = scaler.transform(pd.DataFrame(features[:, 1:], columns=scaling_columns))

# `model` is the IsolationForest fitted on two columns only, so it cannot score
# a four-feature row; the trained classifier is reached through the endpoint.
print(predictor.predict(features))

In [None]:
# Column order: type, amount, oldbalanceOrg, newbalanceDest
features = np.array([[2, 9839.64, 170136.00, 160296.36]])

# The three monetary columns were standardized before training, so apply the
# same fitted scaler to the new row.
features[:, 1:] = scaler.transform(pd.DataFrame(features[:, 1:], columns=scaling_columns))

# `model` is the IsolationForest fitted on two columns only, so it cannot score
# a four-feature row; the trained classifier is reached through the endpoint.
print(predictor.predict(features))