In [84]:
import boto3
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import NearMiss
import sagemaker
import os
from sagemaker.debugger import Rule, ProfilerRule, rule_configs
from sagemaker.session import TrainingInput
from sagemaker.serializers import CSVSerializer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [64]:
# Location of the raw training data in S3
bucket_name = "creditcardfraud-amod5410h"
file_name = "fraudNewTrain.csv"

# Low-level S3 client used to fetch the object
s3 = boto3.client("s3")

# Fetch the object and parse its streaming body directly as CSV
obj = s3.get_object(Key=file_name, Bucket=bucket_name)
df = pd.read_csv(obj["Body"])

## Preprocessing the data

In [65]:
# Columns removed from the frame before modelling
dropped_columns = [
    "merchant", "first", "last", "gender", "street", "city", "state", "job",
    "dob", "category", "trans_num", "transaction_id", "trans_date_trans_time",
]

# Discard rows with missing values, then the unused columns
df = df.dropna().drop(columns=dropped_columns)

# Store the card number as a float
df["cc_num"] = df["cc_num"].astype(float)

In [66]:
# Preview the first rows of the frame after the column clean-up
df.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,6540980000000000.0,61.07,48221,42.426,-83.15,673342,1325596418,43.323709,-82.568814,0
1,630469000000.0,1433.54,97014,45.671,-121.8686,1288,1325413769,45.873314,-121.589038,0
2,639030000000.0,83.17,62970,37.1935,-89.0933,639,1325402816,37.502558,-89.538733,0
3,3501510000000000.0,109.43,34112,26.1184,-81.7361,276002,1325506321,26.002474,-81.985257,0
4,3540210000000000.0,1028.7,79085,34.7437,-102.5064,53,1329695787,35.054154,-101.666822,1


## Separating the dataset into features and target

In [67]:
# Name of the label column; every other column is used as a feature
label_column = "is_fraud"

features = df.drop(columns=[label_column])
target = df[label_column]

## Splitting the dataset into training and validation sets

In [86]:
# Hold out 20% of the rows for validation, stratified on the label
X_train, X_val, Y_train, Y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=123,
)

# Under-sample the training partition only (NearMiss, version 1);
# the validation partition is left exactly as it was split
nm = NearMiss(version=1)
X_train, Y_train = nm.fit_resample(X_train, Y_train)

# Reassemble each partition with the label as the first column
train = pd.concat((Y_train, X_train), axis="columns")
val = pd.concat((Y_val, X_val), axis="columns")

In [87]:
# Display the resampled training set (label column first)
train

Unnamed: 0,is_fraud,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long
0,0,4.613310e+12,32.66,28611,35.9946,-81.7266,885,1325650099,36.435558,-81.375070
1,0,4.613310e+12,71.03,28611,35.9946,-81.7266,885,1325422300,35.773439,-80.895816
2,0,3.415460e+14,1.59,99783,64.7556,-165.6723,145,1325527398,64.561457,-165.832427
3,0,3.415460e+14,41.00,99783,64.7556,-165.6723,145,1325523579,64.842781,-166.288628
4,0,4.922710e+15,1.91,15665,40.3359,-79.6607,1472,1325391544,41.144024,-80.187862
...,...,...,...,...,...,...,...,...,...,...
2269,1,2.269770e+15,758.79,14510,42.6835,-77.8664,4895,1328653021,43.353285,-77.283250
2270,1,3.547340e+15,873.41,12508,41.5097,-73.9634,19880,1331595122,41.432231,-74.686801
2271,1,4.209700e+18,919.58,62935,37.8274,-88.6235,1943,1327630966,38.569555,-89.441146
2272,1,6.011150e+15,9.00,94015,37.6787,-122.4780,107941,1331780519,36.889644,-121.648145


In [88]:
# Display the validation set (label column first)
val

Unnamed: 0,is_fraud,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long
5002,0,3.506040e+15,72.76,59710,45.1939,-112.0568,286,1325595872,45.737668,-111.953723
3274,0,4.457730e+18,89.46,95629,38.5234,-120.6763,832,1325614041,38.001725,-121.617641
3483,0,5.816860e+11,41.81,16314,41.5205,-80.0573,5507,1325621993,42.478400,-80.887472
4375,0,3.764450e+14,2.49,69165,41.1558,-101.1360,1789,1325616027,41.196425,-101.218980
2158,0,6.762480e+11,80.46,33909,26.6939,-81.9452,156391,1325554682,26.768214,-82.805457
...,...,...,...,...,...,...,...,...,...,...
2085,0,3.042820e+13,85.51,17088,40.3087,-76.2963,823,1325405781,39.350022,-76.183697
245,0,3.583640e+15,167.47,19149,40.0369,-75.0664,1526206,1325527909,39.495175,-74.083593
2807,0,2.131270e+14,7.95,78040,27.5155,-99.4986,248858,1325419530,26.523461,-98.522729
1071,1,3.007470e+13,19.64,67851,37.9931,-100.9893,2691,1330469540,38.460429,-100.779957


## Storing the dataset in a bucket

In [89]:
# Write both partitions locally as CSV files without header or index
train.to_csv('train.csv', index=False, header=False)
val.to_csv('validation.csv', index=False, header=False)

prefix = "sagemaker-xgboost-creditcard-fraud-prediction"
bucket = sagemaker.Session().default_bucket()

# S3 object keys always use "/" as the separator, so build them explicitly
# instead of with os.path.join, whose separator depends on the host OS.
# A single Bucket handle is reused for both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for local_file in ('train.csv', 'validation.csv'):
    s3_bucket.Object(f"{prefix}/data/{local_file}").upload_file(local_file)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


## Initializing the XGBoost model

In [90]:
# Region of the current SageMaker session
region = sagemaker.Session().boto_region_name
print(f"AWS Region: {region}")

# Execution role passed to the estimator
role = sagemaker.get_execution_role()
print(f"RoleArn: {role}")

# S3 location passed to the estimator as its output path
s3_output_location = f"s3://{bucket}/{prefix}/xgboost_model"

# Image URI of the XGBoost container (version 1.2-1) for this region
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.2-1",
)
print(container)

# Rules attached to the training job: the XGBoost report and the profiler report
training_rules = [
    Rule.sagemaker(rule_configs.create_xgboost_report()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type="ml.m4.xlarge",
    instance_count=1,
    volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session(),
    rules=training_rules,
)

AWS Region: us-east-2
RoleArn: arn:aws:iam::339713131120:role/service-role/AmazonSageMaker-ExecutionRole-20240408T172301


INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.2-1


## Setting the hyperparameters

In [91]:
# Hyperparameters for the XGBoost training job, collected in one place
xgb_hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.7,
    "objective": "binary:logistic",
    "num_round": 1000,
}
xgb_model.set_hyperparameters(**xgb_hyperparameters)

## Fitting the model

In [92]:
# S3 prefix that holds the uploaded CSV partitions
data_location = f"s3://{bucket}/{prefix}/data"

train_input = TrainingInput(f"{data_location}/train.csv", content_type="csv")
validation_input = TrainingInput(f"{data_location}/validation.csv", content_type="csv")

# Start the training job on both channels and wait for it to finish
data_channels = {"train": train_input, "validation": validation_input}
xgb_model.fit(data_channels, wait=True)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-04-11-20-31-31-666


2024-04-11 20:31:31 Starting - Starting the training job...
2024-04-11 20:31:58 Starting - Preparing the instances for trainingCreateXgboostReport: InProgress
ProfilerReport: InProgress
......
2024-04-11 20:32:58 Downloading - Downloading input data...
2024-04-11 20:33:21 Downloading - Downloading the training image......
2024-04-11 20:34:18 Training - Training image download completed. Training in progress..[34m[2024-04-11 20:34:23.409 ip-10-0-226-45.us-east-2.compute.internal:8 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV in

[34m[986]#011train-error:0.00440#011validation-error:0.00314[0m
[34m[987]#011train-error:0.00440#011validation-error:0.00314[0m
[34m[988]#011train-error:0.00440#011validation-error:0.00314[0m
[34m[989]#011train-error:0.00440#011validation-error:0.00314[0m
[34m[990]#011train-error:0.00440#011validation-error:0.00314[0m
[34m[991]#011train-error:0.00440#011validation-error:0.00314[0m
[34m[992]#011train-error:0.00440#011validation-error:0.00314[0m
[34m[993]#011train-error:0.00440#011validation-error:0.00314[0m
[34m[994]#011train-error:0.00440#011validation-error:0.00314[0m
[34m[995]#011train-error:0.00440#011validation-error:0.00314[0m
[34m[996]#011train-error:0.00440#011validation-error:0.00314[0m
[34m[997]#011train-error:0.00440#011validation-error:0.00314[0m
[34m[998]#011train-error:0.00440#011validation-error:0.00314[0m
[34m[999]#011train-error:0.00440#011validation-error:0.00314[0m

2024-04-11 20:34:58 Uploading - Uploading generated training model
2024-04-

## Generating a report of the model

In [94]:
rule_output_path = xgb_model.output_path + "/" + xgb_model.latest_training_job.job_name + "/rule-output"

! aws s3 ls {rule_output_path} --recursive

! aws s3 cp {rule_output_path} ./ --recursive

from IPython.display import FileLink, FileLinks
display("Click link below to view the XGBoost Training report", FileLink("CreateXgboostReport/xgboost_report.html"))

2024-04-11 20:35:51     329692 sagemaker-xgboost-creditcard-fraud-prediction/xgboost_model/sagemaker-xgboost-2024-04-11-20-31-31-666/rule-output/ProfilerReport/profiler-output/profiler-report.html
2024-04-11 20:35:51     171067 sagemaker-xgboost-creditcard-fraud-prediction/xgboost_model/sagemaker-xgboost-2024-04-11-20-31-31-666/rule-output/ProfilerReport/profiler-output/profiler-report.ipynb
2024-04-11 20:35:46        191 sagemaker-xgboost-creditcard-fraud-prediction/xgboost_model/sagemaker-xgboost-2024-04-11-20-31-31-666/rule-output/ProfilerReport/profiler-output/profiler-reports/BatchSize.json
2024-04-11 20:35:46        199 sagemaker-xgboost-creditcard-fraud-prediction/xgboost_model/sagemaker-xgboost-2024-04-11-20-31-31-666/rule-output/ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json
2024-04-11 20:35:46        126 sagemaker-xgboost-creditcard-fraud-prediction/xgboost_model/sagemaker-xgboost-2024-04-11-20-31-31-666/rule-output/ProfilerReport/profiler-output/profiler-

'Click link below to view the XGBoost Training report'

In [95]:
# S3 URI of the model artifact written by the training job
xgb_model.model_data

's3://sagemaker-us-east-2-339713131120/sagemaker-xgboost-creditcard-fraud-prediction/xgboost_model/sagemaker-xgboost-2024-04-11-20-31-31-666/output/model.tar.gz'

## Deploy the model

In [96]:
# Deploy the trained model to a single-instance endpoint with a CSV serializer
endpoint_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.t2.medium",
    "serializer": CSVSerializer(),
}
xgb_predictor = xgb_model.deploy(**endpoint_settings)

# Name of the endpoint that was just created
xgb_predictor.endpoint_name

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-04-11-20-36-50-306
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-04-11-20-36-50-306
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-04-11-20-36-50-306


--------!

'sagemaker-xgboost-2024-04-11-20-36-50-306'