In [1]:
# from params import *

In [2]:
# bucket_name = "s3-assetcare-bucket"

# folder_prefix_parquet = "raw_data/"
# folder_prefix_csv = "raw_data_csv/"

# folder_prefix_test = "test_training_data/"


In [1]:
import boto3
import pandas as pd
import numpy as np
import random
import io

### gen data

In [2]:
# Thông tin S3
# s3_bucket = "s3-assetcare-bucket"
# s3_prefix = "test_training_data/"  # Thư mục trong bucket
s3_bucket = "swo-ngoctran-public"
s3_prefix = "mlops/raw_data/2024-01/"  # Thư mục trong bucket

# s3://swo-ngoctran-public/mlops/raw_data/2024-01/sample_.parquet

In [3]:
# Khởi tạo client S3
s3_client = boto3.client("s3")

In [4]:
# # Hàm tạo dữ liệu ngẫu nhiên và upload lên S3
# def generate_and_upload_csv(file_name, num_rows):
#     # Tạo dữ liệu giả lập
#     data = {
#         "feature_1": np.random.rand(num_rows) * 100,
#         "feature_2": np.random.rand(num_rows) * 50,
#         "feature_3": np.random.randint(1, 100, num_rows),
#         "feature_4": np.random.normal(50, 10, num_rows),
#         "feature_5": np.random.choice([0, 1], num_rows)  # Nhị phân
#     }
    
#     df = pd.DataFrame(data)
    
#     # Chuyển DataFrame thành buffer CSV (⚠️ Không ghi header)
#     csv_buffer = io.StringIO()
#     df.to_csv(csv_buffer, index=False, header=False)  # header=False để tránh lỗi

#     # Upload lên S3
#     s3_client.put_object(
#         Bucket=s3_bucket,
#         Key=f"{s3_prefix}{file_name}",
#         Body=csv_buffer.getvalue()
#     )
    
#     print(f"✅ Đã upload {file_name} lên S3: s3://{s3_bucket}/{s3_prefix}{file_name}")

#------------------------------------------------------------------
# Hàm tạo dữ liệu ngẫu nhiên và upload lên S3
def generate_and_upload_csv(file_name, num_rows):
    # Tạo dữ liệu giả lập
    data = {
        "feature_1": np.random.rand(num_rows) * 100,
        "feature_2": np.random.rand(num_rows) * 50,
        "feature_3": np.random.randint(1, 100, num_rows),
        "feature_4": np.random.normal(50, 10, num_rows),
        "feature_5": np.random.choice([0.01, 1.0, 2.0, 3.0], num_rows),  # Nhị phân
        "feature_6": np.random.normal(50, 10, num_rows),
    }
    
    df = pd.DataFrame(data)

    # **Xóa bất kỳ dòng nào bị thiếu cột**
    df.dropna(inplace=True)
    df = df.astype(np.float32).round(10)
    

    # Kiểm tra số cột trước khi lưu
    assert df.shape[1] == 6, f"⚠ Lỗi: Số cột không khớp (Expected: 5, Found: {df.shape[1]})"

    # Chuyển DataFrame thành buffer CSV (⚠️ Không ghi header, tránh lỗi)
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index=False, header=False, sep=",")  # Dùng dấu phẩy để phân tách

    # Upload lên S3
    s3_client.put_object(
        Bucket=s3_bucket,
        Key=f"{s3_prefix}{file_name}",
        Body=csv_buffer.getvalue()
    )
    
    print(f"✅ Đã upload {file_name} lên S3: s3://{s3_bucket}/{s3_prefix}{file_name}")

In [5]:
# Tạo và upload 2 file CSV
generate_and_upload_csv("test_data_1.csv", random.randint(100, 120))
generate_and_upload_csv("test_data_2.csv", random.randint(100, 120))

✅ Đã upload test_data_1.csv lên S3: s3://swo-ngoctran-public/mlops/raw_data/2024-01/test_data_1.csv
✅ Đã upload test_data_2.csv lên S3: s3://swo-ngoctran-public/mlops/raw_data/2024-01/test_data_2.csv


### read data from s3 and train model

In [6]:
import sagemaker
import boto3
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [7]:
# Khởi tạo session
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [8]:
# Thông tin S3
# s3_bucket = "s3-assetcare-bucket"
# prefix = "test_training_data"
s3_input_train = f"s3://{s3_bucket}/{s3_prefix}"
s3_input_train

's3://swo-ngoctran-public/mlops/raw_data/2024-01/'

In [9]:
# Lấy container mới nhất của Random Cut Forest trên SageMaker
rcf_container = sagemaker.image_uris.retrieve("randomcutforest", region)

In [10]:
# Cấu hình SageMaker Estimator
rcf_estimator = Estimator(
    image_uri=rcf_container,
    role=role,
    instance_count=1,  # Nếu dữ liệu lớn, có thể tăng lên 2-3 instance
    instance_type="ml.m5.xlarge",
    output_path=f"s3://{s3_bucket}/model_output",
    sagemaker_session=sagemaker_session,
    enable_network_isolation=True,
)

In [11]:
# **Thêm thông số training: feature_dim = 5**
rcf_estimator.set_hyperparameters(
    feature_dim=5,  # Số lượng cột đặc trưng
    num_samples_per_tree=512,  # Mặc định
    num_trees=50  # Mặc định
)

In [12]:
# Định nghĩa dữ liệu đầu vào với "ShardedByS3Key"
train_input = TrainingInput(
    s3_input_train,
    distribution="ShardedByS3Key",  # Chia dữ liệu theo file S3
    content_type="text/csv",
    input_mode="Pipe",  # Dùng Pipe Mode để stream data từ S3
)

In [13]:
# Train model
rcf_estimator.fit({"train": train_input}, wait=True)

INFO:sagemaker:Creating training-job with name: randomcutforest-2025-06-16-04-08-58-591


2025-06-16 04:09:00 Starting - Starting the training job...
2025-06-16 04:09:16 Starting - Preparing the instances for training...
2025-06-16 04:09:54 Downloading - Downloading the training image.........
2025-06-16 04:11:34 Training - Training image download completed. Training in progress.
2025-06-16 04:11:34 Uploading - Uploading generated training model
2025-06-16 04:11:34 Completed - Training job completed
[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if cons['type'] is 'ineq':[0m
  if len(self.X_min) is not 0:[0m
[34m[06/16/2025 04:11:18 INFO 140353704351552] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'num_samples_per_tree': 256, 'num_trees': 100, 'force_dense': 'true', 'eval_metrics': ['accuracy', 'precision_recall_fscore'], 'epochs': 1, 'mini_batch_size': 1000, '_log_level': 'info',

### deployment

In [14]:
# Deploy model thành endpoint
rcf_predictor = rcf_estimator.deploy(
    initial_instance_count=1,  # instance runs endpoint
    instance_type="ml.m5.large"  # instance type
)

INFO:sagemaker:Creating model with name: randomcutforest-2025-06-16-04-12-03-443
INFO:sagemaker:Creating endpoint-config with name randomcutforest-2025-06-16-04-12-03-443
INFO:sagemaker:Creating endpoint with name randomcutforest-2025-06-16-04-12-03-443


-------!

In [18]:
endpoint_name = rcf_predictor.endpoint_name
endpoint_name

'randomcutforest-2025-06-16-04-12-03-443'

In [19]:
import boto3

sm_client =  boto3.client('sagemaker')

response = sm_client.describe_endpoint(EndpointName=endpoint_name)
print(response)

{'EndpointName': 'randomcutforest-2025-06-16-04-12-03-443', 'EndpointArn': 'arn:aws:sagemaker:us-east-1:975049948583:endpoint/randomcutforest-2025-06-16-04-12-03-443', 'EndpointConfigName': 'randomcutforest-2025-06-16-04-12-03-443', 'ProductionVariants': [{'VariantName': 'AllTraffic', 'DeployedImages': [{'SpecifiedImage': '382416733822.dkr.ecr.us-east-1.amazonaws.com/randomcutforest:1', 'ResolvedImage': '382416733822.dkr.ecr.us-east-1.amazonaws.com/randomcutforest@sha256:3cc8486cf89fd3fec8235136477e8454a27252abf1c541ec0a74e352d4175641', 'ResolutionTime': datetime.datetime(2025, 6, 16, 4, 12, 5, 690000, tzinfo=tzlocal())}], 'CurrentWeight': 1.0, 'DesiredWeight': 1.0, 'CurrentInstanceCount': 1, 'DesiredInstanceCount': 1}], 'EndpointStatus': 'InService', 'CreationTime': datetime.datetime(2025, 6, 16, 4, 12, 4, 991000, tzinfo=tzlocal()), 'LastModifiedTime': datetime.datetime(2025, 6, 16, 4, 15, 52, 38000, tzinfo=tzlocal()), 'ResponseMetadata': {'RequestId': 'a1bd2315-7921-413a-baba-19127da

### test endpoint

In [37]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

In [38]:
# set the serializer and deserializer of the endpoint and then call an inference.
rcf_predictor.serializer = CSVSerializer()
rcf_predictor.deserializer = JSONDeserializer()

In [42]:
import numpy as np

# Tạo sample data (x feature, vì feature_dim=..)
test_data = np.array([[23.5, 10.2, 45, 52.1, 45]])

# Convert thành chuỗi CSV
test_payload = ",".join(map(str, test_data[0]))

# Gửi request đến endpoint
response = rcf_predictor.predict(test_payload)

# Hiển thị kết quả
print("✅ Anomaly Score:", response)


✅ Anomaly Score: {'scores': [{'score': 1.1975720641}]}


In [43]:
import numpy as np

# Ví dụ test_data có 3 rows, 5 feature mỗi row
test_data = np.array([
    [23.5, 10.2, 45, 52.1, 45],
    [12.0, 5.0, 10, 25.0, 1],
    [56.2, 22.5, 12, 60.5, 3]
])


In [44]:
# Chuyển toàn bộ thành list các dòng CSV
csv_lines = ["{}".format(",".join(map(str, row))) for row in test_data]

# Ghép thành full payload
csv_payload = "\n".join(csv_lines)

# Gọi endpoint
response = rcf_predictor.predict(csv_payload)

# Parse kết quả
print(response['scores'])


[{'score': 1.1975720641}, {'score': 1.3898552455}, {'score': 1.4748013065}]


In [47]:
df1 = pd.read_csv("s3://swo-ngoctran-public/mlops/raw_data/2024-01/test_data_1.csv", header=None)

In [48]:
df1.head()

Unnamed: 0,0,1,2,3,4,5
0,24.4629,11.687962,69.0,38.820152,2.0,36.18758
1,59.569523,22.902044,47.0,65.25207,1.0,61.17549
2,92.67962,37.568134,3.0,58.867897,3.0,57.127277
3,35.46338,30.888903,99.0,50.016003,2.0,50.375595
4,35.338905,12.4709,36.0,55.900116,3.0,43.998856


### test source file

In [38]:
filepath = s3_input_train + "test_data_1.csv"
filepath

's3://s3-assetcare-bucket/test_training_data/test_data_1.csv'

In [39]:
df_test = pd.read_csv(filepath, header=None)
df_test.head()

Unnamed: 0,0,1,2,3,4,5
0,6.617475,44.402067,24,51.403564,2.0,38.251934
1,66.30458,36.007544,1,39.703215,2.0,37.998071
2,67.257537,8.84218,34,47.028529,1.0,58.79768
3,82.491462,26.073707,98,60.475596,3.0,64.952026
4,96.255405,30.784736,34,51.049524,2.0,52.215663


In [40]:
df_test[4].value_counts()

4
0.01    28
3.00    26
2.00    24
1.00    23
Name: count, dtype: int64

In [41]:
df_test[2].value_counts()

2
93    5
34    5
31    3
78    3
49    3
     ..
94    1
19    1
85    1
26    1
45    1
Name: count, Length: 65, dtype: int64

### appendix

In [None]:
import boto3
import pandas as pd
import numpy as np
import random
import io

# Thông tin S3
s3_bucket = "s3-assetcare-bucket"
s3_prefix = "raw_data_csv/"  # Thư mục trong bucket

# Khởi tạo client S3
s3_client = boto3.client("s3")

# Hàm tạo dữ liệu ngẫu nhiên và upload lên S3
def generate_and_upload_csv(file_name, num_rows):
    # Tạo dữ liệu giả lập
    data = {
        "feature_1": np.random.rand(num_rows) * 100,
        "feature_2": np.random.rand(num_rows) * 50,
        "feature_3": np.random.randint(1, 100, num_rows),
        "feature_4": np.random.normal(50, 10, num_rows),
        "feature_5": np.random.choice([0, 1], num_rows)  # Nhị phân
    }
    
    df = pd.DataFrame(data)
    
    # Chuyển DataFrame thành buffer CSV
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index=False)
    
    # Upload lên S3
    s3_client.put_object(
        Bucket=s3_bucket,
        Key=f"{s3_prefix}{file_name}",
        Body=csv_buffer.getvalue()
    )
    
    print(f"Đã upload {file_name} lên S3: s3://{s3_bucket}/{s3_prefix}{file_name}")

# Tạo và upload 2 file CSV
generate_and_upload_csv("test_data_1.csv", random.randint(100, 120))
generate_and_upload_csv("test_data_2.csv", random.randint(100, 120))


In [None]:
import sagemaker
import boto3
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

# Khởi tạo session
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Thông tin S3
s3_bucket = "s3-assetcare-bucket"
prefix = "raw_data_csv"
s3_input_train = f"s3://{s3_bucket}/{prefix}/"

# Lấy container mới nhất của Random Cut Forest trên SageMaker
rcf_container = sagemaker.image_uris.retrieve("randomcutforest", region)

# Cấu hình SageMaker Estimator
rcf_estimator = Estimator(
    image_uri=rcf_container,
    role=role,
    instance_count=1,  # Nếu dữ liệu lớn, có thể tăng lên 2-3 instance
    instance_type="ml.m5.xlarge",
    output_path=f"s3://{s3_bucket}/model_output",
    sagemaker_session=sagemaker_session,
    enable_network_isolation=True,
)

# **Thêm thông số training: feature_dim = 5**
rcf_estimator.set_hyperparameters(
    feature_dim=4,  # Số lượng cột đặc trưng - actual is 5
    num_samples_per_tree=512,  # Mặc định
    num_trees=50  # Mặc định
)

# Định nghĩa dữ liệu đầu vào với "ShardedByS3Key"
train_input = TrainingInput(
    s3_input_train,
    distribution="ShardedByS3Key",  # Chia dữ liệu theo file S3
    content_type="text/csv",
    input_mode="Pipe",  # Dùng Pipe Mode để stream data từ S3
)

# Train model
rcf_estimator.fit({"train": train_input}, wait=True)


In [None]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Deploy model thành endpoint
rcf_predictor = rcf_estimator.deploy(
    initial_instance_count=1,  # Số lượng instance chạy endpoint
    instance_type="ml.m5.large"  # Chọn instance phù hợp
)

# Cấu hình predictor
rcf_predictor.serializer = CSVSerializer()
rcf_predictor.deserializer = JSONDeserializer()

print(f"✅ Model đã deploy thành công tại endpoint: {rcf_predictor.endpoint_name}")
