In [3]:
# needed libraries
import os
import pandas as pd
import numpy as np
import boto3
import sagemaker
import awswrangler as wr
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import warnings

warnings.filterwarnings("ignore")

In [4]:
sm_boto3 = boto3.client("sagemaker")
sm_session = sagemaker.Session()

In [5]:
region = sm_session.boto_session.region_name

In [6]:
s3_client = boto3.client("s3", region_name=region)

In [7]:
s3_resource = boto3.resource("s3")

In [8]:
s3_bucket = sm_session.default_bucket()

In [9]:
"""
Note: if you are not running this notebook from SageMaker Studio or SageMaker Classic Notebooks you will need to instanatiate 
the sagemaker_execution_role_name with an AWS role that has SageMakerFullAccess and SageMakerFeatureStoreFullAccess
"""
sagemaker_execution_role_name = "new_role"
try:
    sagemaker_role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    sagemaker_role = iam.get_role(RoleName=sagemaker_execution_role_name)["Role"]["Arn"]
    print(f"\n instantiating sagemaker_role with supplied role name : {sagemaker_role}")

account_id = boto3.client("sts").get_caller_identity()["Account"]


 instantiating sagemaker_role with supplied role name : arn:aws:iam::593793065896:role/new_role


In [10]:
print(f"Region name {region}")
print(f"Using bucket {s3_bucket}")

Region name us-east-1
Using bucket sagemaker-us-east-1-593793065896


In [11]:
df = pd.read_csv(r"../data/paysim.csv")
print(df.shape)
df.head()

(6362620, 11)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [13]:
df.count()

step              6362620
type              6362620
amount            6362620
nameOrig          6362620
oldbalanceOrg     6362620
newbalanceOrig    6362620
nameDest          6362620
oldbalanceDest    6362620
newbalanceDest    6362620
isFraud           6362620
isFlaggedFraud    6362620
dtype: int64

In [14]:
df.duplicated().sum()

0

In [15]:
df.nunique()

step                  743
type                    5
amount            5316900
nameOrig          6353307
oldbalanceOrg     1845844
newbalanceOrig    2682586
nameDest          2722362
oldbalanceDest    3614697
newbalanceDest    3555499
isFraud                 2
isFlaggedFraud          2
dtype: int64

In [16]:
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


In [17]:
clean_df = df.drop(columns=["nameOrig", "nameDest"])
clean_df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,0
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,0
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,0
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,0


In [65]:
#for feature in clean_df.columns:
#    fig = px.histogram(data_frame=clean_df, x=feature, color="isFraud",
#                       barmode='group',
#                       title=f'Distribution of {feature}', nbins=5,
#                       text_auto=True)
#    fig.update_yaxes(type='log', title='Logarithmic Count')
#    fig.show()

In [19]:
def outlier_removal_iqr(data, column):
    Q1 = data[column].quantile(0.30)
    Q3 = data[column].quantile(0.70)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
def outlier_removal_zscore(data, column):
    mean = data[column].mean()
    std_dev = data[column].std()
    z_score_threshold = 3
    lower_bound = mean - z_score_threshold * std_dev
    upper_bound = mean + z_score_threshold * std_dev
    return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
def outlier_removal(data, column):
    data = outlier_removal_iqr(data, column)
    #data = outlier_removal_zscore(data, column)
    return data


In [20]:

cleaned_df = pd.DataFrame()

for feature in clean_df.columns.drop(["isFraud", "isFlaggedFraud", "type"]):

    cleaned_feature_data = outlier_removal(clean_df, feature)

    cleaned_data = pd.concat([cleaned_df, cleaned_feature_data], axis=0)

cleaned_df = cleaned_data.drop_duplicates()


cleaned_df = cleaned_data.reset_index(drop=True)

print(cleaned_df.shape)
cleaned_df.head()

(5377272, 9)


Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,0
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,0
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,0
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,0


In [21]:
cleaned_df.isnull().sum()

step              0
type              0
amount            0
oldbalanceOrg     0
newbalanceOrig    0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [22]:
cleaned_df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,5377272.0,5377272.0,5377272.0,5377272.0,5377272.0,5377272.0,5377272.0,5377272.0
mean,243.256,119513.5,761429.5,781837.7,299785.8,354444.6,0.001268301,2.975486e-06
std,142.3943,182826.3,2757611.0,2790687.0,471512.5,510417.5,0.03559063,0.001724957
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,10885.76,0.0,0.0,0.0,0.0,0.0,0.0
50%,238.0,49921.26,15819.0,0.0,0.0,72911.77,0.0,0.0
75%,334.0,176887.4,105151.0,140746.9,442378.8,551223.3,0.0,0.0
max,743.0,10000000.0,59585040.0,49585040.0,13060830.0,2059199.0,1.0,1.0


In [23]:
correlation_matrix = cleaned_df.corr(numeric_only=True)

In [66]:
#plt.figure(figsize=(12, 8))
#sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.3f')
#plt.title('Correlation Matrix')
#plt.show()

In [25]:
cleaned_df = cleaned_df.drop(columns=["newbalanceOrig", "oldbalanceDest", "isFlaggedFraud"])
cleaned_df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,170136.0,0.0,0
1,1,PAYMENT,1864.28,21249.0,0.0,0
2,1,TRANSFER,181.0,181.0,0.0,1
3,1,CASH_OUT,181.0,181.0,0.0,1
4,1,PAYMENT,11668.14,41554.0,0.0,0


In [26]:
cleaned_df = cleaned_df[~((cleaned_df["oldbalanceOrg"] > 1) & (cleaned_df["newbalanceDest"] < 1))]
cleaned_df.reset_index(drop=True, inplace=True)

In [27]:
cleaned_df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceDest,isFraud
0,1,DEBIT,5337.77,41720.0,40348.79,0
1,1,DEBIT,9644.94,4465.0,157982.12,0
2,1,CASH_OUT,229133.94,15325.0,51513.44,0
3,1,DEBIT,9302.79,11299.0,16896.7,0
4,1,PAYMENT,9920.52,0.0,0.0,0


In [28]:
cleaned_df.shape

(3716587, 6)

In [29]:
# Probability Ratio Encoding
default_prob = cleaned_df[cleaned_df["isFraud"] == 1]["type"].value_counts() / cleaned_df["type"].value_counts()
cleaned_df["type_prob_ratio_encoded"] = cleaned_df["type"].map(default_prob)

In [30]:
cleaned_df = cleaned_df.drop(columns=["type"])
cleaned_df = cleaned_df.dropna()
cleaned_df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceDest,isFraud,type_prob_ratio_encoded
2,1,229133.94,15325.0,51513.44,0,0.001546
12,1,110414.71,26845.41,2415.16,0,0.001546
14,1,56953.9,1942.02,64106.18,0,0.001546
15,1,62610.8,79114.0,8383.29,0,9.2e-05
16,1,82940.31,3017.87,49864.36,0,0.001546


In [31]:
train_set, test_set = train_test_split(cleaned_df, test_size=0.3, random_state=42, stratify=cleaned_df["isFraud"])

In [32]:
print(train_set.shape)
print(test_set.shape)

(1431629, 6)
(613556, 6)


In [52]:
train_set.to_csv("../data/train_set.csv", index=False)
test_set.to_csv("../data/test_set.csv", index=False)

In [None]:
import os

train_file_exists = os.path.exists("train_set.csv")
test_file_exists = os.path.exists("test_set.csv")

print(f"Train file saved: {train_file_exists}")
print(f"Test file saved: {test_file_exists}")

Train file saved: True
Test file saved: True


In [56]:
# send data to S3. SageMaker will take training data from s3
sk_prefix = "sagemaker/fraud_det/sklearncontainer"
trainpath = sm_session.upload_data(
    path="train_set.csv", bucket=s3_bucket, key_prefix=sk_prefix
)

testpath = sm_session.upload_data(
    path="test_set.csv", bucket=s3_bucket, key_prefix=sk_prefix
)
print(trainpath)
print(testpath)

s3://sagemaker-us-east-1-593793065896/sagemaker/fraud_det/sklearncontainer/train_set.csv
s3://sagemaker-us-east-1-593793065896/sagemaker/fraud_det/sklearncontainer/test_set.csv


In [57]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "1.0-1"

sklearn_estimator = SKLearn(
    entry_point="fraud_detection.py",
    role="arn:aws:iam::593793065896:role/new_role",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="FD-custom-sklearn",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 42,
    },
    source_dir=".",
    dependencies=["../requirements.txt"],
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600,
    output_path="s3://sagemaker-us-east-1-593793065896/sagemaker/fraud_det/sklearncontainer/sagemaker_model_output"
)


In [58]:
# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

2025-05-06 14:22:46 Starting - Starting the training job...
2025-05-06 14:23:23 Downloading - Downloading input data...
2025-05-06 14:23:54 Downloading - Downloading the training image......
2025-05-06 14:24:49 Training - Training image download completed. Training in progress..2025-05-06 14:24:54,067 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2025-05-06 14:24:54,070 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-05-06 14:24:54,073 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2025-05-06 14:24:54,090 sagemaker_sklearn_container.training INFO     Invoking user training script.
2025-05-06 14:24:58,257 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
/miniconda3/bin/python -m pip install -r requirements.txt
Collecting sagemaker==2.185.0 (from -r requirements.txt (line 2))
  Downloading sagemaker-2.185.0.tar.gz (884 kB)
     ━━━━━━

In [59]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2025-05-06 14:32:44 Starting - Preparing the instances for training
2025-05-06 14:32:44 Downloading - Downloading the training image
2025-05-06 14:32:44 Training - Training image download completed. Training in progress.
2025-05-06 14:32:44 Uploading - Uploading generated training model
2025-05-06 14:32:44 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-593793065896/sagemaker/fraud_det/sklearncontainer/sagemaker_model_output/FD-custom-sklearn-2025-05-06-14-21-53-424/output/model.tar.gz


In [60]:
artifact

's3://sagemaker-us-east-1-593793065896/sagemaker/fraud_det/sklearncontainer/sagemaker_model_output/FD-custom-sklearn-2025-05-06-14-21-53-424/output/model.tar.gz'

In [61]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    name =  model_name,
    model_data=artifact,
    role="arn:aws:iam::593793065896:role/new_role",
    entry_point="fraud_detection.py",
    framework_version=FRAMEWORK_VERSION,
)

In [62]:
model_name

'Custom-sklearn-model-2025-05-06-14-33-36'

In [64]:
endpoint_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name=endpoint_name,
)

EndpointName=Custom-sklearn-model-2025-05-06-15-05-43


-

In [None]:
endpoint_name

In [None]:
features = test_set.drop(columns=["isFraud"]).columns

In [None]:
test_set[features][0:2].values.tolist()

In [None]:
print(predictor.predict(test_set[features][0:2].values.tolist()))