In [1]:
import pandas as pd
import json
import numpy as np
from datetime import datetime
import boto3
import time
from time import sleep
from lxml import html
import os

### 定数

In [2]:
# AWS region passed to every boto3 client created in this notebook.
REGION_NAME = 'ap-northeast-1'

# Credentials are read from the environment; os.environ.get returns None when a variable is unset.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")

# Dataset group name, local interactions CSV (also used as the S3 object key), and IAM role name.
DATASET_GROUP_NAME = "word_suggest_recommendation"
INTERACTIONS_FILENAME = "search_log.csv" # TODO: needs updating
ROLE_NAME = "PersonalizeRoleAmazonPantryAwsSimilarItems"

# Recipe ARN used when creating the recommender (alternatives kept commented out)
# SIMS_RECIPE_ARN = "arn:aws:personalize:::recipe/aws-sims"
# SIMS_RECIPE_ARN = "arn:aws:personalize:::recipe/aws-ecomm-frequently-bought-together"
SIMS_RECIPE_ARN = "arn:aws:personalize:::recipe/aws-ecomm-customers-who-viewed-x-also-viewed"
# SIMS_RECIPE_ARN = "arn:aws:personalize:::recipe/aws-similar-items"

# Name under which the interactions schema is looked up and created.
SCHEMA_NAME = "suggest-word-1"

### データ読み込み

In [3]:
# Read the interactions (search log) CSV and preview its first rows.
search_log_df = pd.read_csv(filepath_or_buffer=INTERACTIONS_FILENAME)

search_log_df.head(n=5)

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,EVENT_TYPE
0,user_001,coffee,1704067200,view
1,user_001,tea,1704067200,view
2,user_001,milk,1704067200,view
3,user_002,bread,1704153600,view
4,user_002,cheese,1704153600,view


In [4]:
# Summary statistics for every column, non-numeric ones included (include='all').
search_log_df.describe(include='all')

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,EVENT_TYPE
count,4913,4913,4913.0,4913
unique,30,85,,1
top,user_013,cereal,,view
freq,213,80,,4913
mean,,,1719666000.0,
std,,,9062495.0,
min,,,1704046000.0,
25%,,,1711964000.0,
50%,,,1719433000.0,
75%,,,1727557000.0,


### データセットグループの作成

In [5]:
# Runtime client: used later in the notebook to request recommendations.
personalize_runtime = boto3.client(
    service_name='personalize-runtime',
    region_name=REGION_NAME,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

# Management client: creates and describes dataset groups, schemas, datasets, jobs and recommenders.
personalize = boto3.client(
    service_name='personalize',
    region_name=REGION_NAME,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

# Create the ECOMMERCE domain dataset group and keep its ARN for the following cells.
create_dataset_group_response = personalize.create_dataset_group(
    domain='ECOMMERCE',
    name=DATASET_GROUP_NAME,
)
dataset_group = create_dataset_group_response['datasetGroupArn']

print(json.dumps(create_dataset_group_response, indent=2))

{
  "datasetGroupArn": "arn:aws:personalize:ap-northeast-1:627614495110:dataset-group/word_suggest_recommendation",
  "domain": "ECOMMERCE",
  "ResponseMetadata": {
    "RequestId": "a31bf796-63cc-49e9-bb43-b5aa451c42b6",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Mon, 02 Jun 2025 11:57:01 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "132",
      "connection": "keep-alive",
      "x-amzn-requestid": "a31bf796-63cc-49e9-bb43-b5aa451c42b6",
      "strict-transport-security": "max-age=47304000; includeSubDomains",
      "x-frame-options": "DENY",
      "cache-control": "no-cache",
      "x-content-type-options": "nosniff"
    },
    "RetryAttempts": 0
  }
}


### max_time を上限として、データセットグループの作成が完了したことを確認し続ける

In [6]:
# Poll the dataset group every 10 seconds until it is ACTIVE.
# A failed creation or an expired deadline raises, so that a "Run All" stops here
# instead of letting later cells operate on an unusable dataset group.
max_time = time.time() + 3*60*60 # 3 hours

while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(
        datasetGroupArn = dataset_group
    )
    status = describe_dataset_group_response["datasetGroup"]["status"]
    if status == "ACTIVE":
        print(f"Dataset group create succeeded for {dataset_group}")
        break
    if status == "CREATE FAILED":
        # failureReason may be absent, hence .get()
        raise RuntimeError(
            f"Create failed for {dataset_group}: "
            f"{describe_dataset_group_response['datasetGroup'].get('failureReason')}"
        )

    time.sleep(10)
else:
    # Loop condition became false without a break: the deadline passed.
    raise TimeoutError(f"Timed out waiting for {dataset_group} (last status: {status})")

Dataset group create succeeded for arn:aws:personalize:ap-northeast-1:627614495110:dataset-group/word_suggest_recommendation


### インタラクションデータセットのスキーマとデータセットを作成する

In [7]:
# Schema for the interactions dataset: a record with four fields
# (USER_ID, ITEM_ID, TIMESTAMP, EVENT_TYPE). Both names refer to the same dict object.
interactions_schema = schema = dict(
    type="record",
    name="Interactions",
    namespace="com.amazonaws.personalize.schema",
    fields=[
        {"name": field_name, "type": field_type}
        for field_name, field_type in (
            ("USER_ID", "string"),
            ("ITEM_ID", "string"),
            ("TIMESTAMP", "long"),
            ("EVENT_TYPE", "string"),
        )
    ],
    version="1.0",
)

### すでに Schema を作っていたら削除、なければ作成

In [8]:
# スキーマの一覧を取得
# Find an existing schema named SCHEMA_NAME. list_schemas returns results in pages,
# so walk every page rather than only the first response.
schema_arn = next(
    (
        schema_summary["schemaArn"]
        for page in personalize.get_paginator("list_schemas").paginate()
        for schema_summary in page["schemas"]
        if schema_summary["name"] == SCHEMA_NAME
    ),
    None,
)

# Delete a stale schema first so the name can be reused, then pause briefly
# before re-creating it.
if schema_arn:
    personalize.delete_schema(schemaArn=schema_arn)
    sleep(5)

# Create the schema once, whether or not an old one had to be removed.
create_schema_response = personalize.create_schema(
    name = SCHEMA_NAME,
    domain = "ECOMMERCE",
    schema = json.dumps(interactions_schema),
)

In [9]:
# スキーマの中身を確認
# Show the create_schema response, then keep the schema ARN for dataset creation.
print(json.dumps(create_schema_response, indent=2))
interaction_schema_arn = create_schema_response['schemaArn']

{
  "schemaArn": "arn:aws:personalize:ap-northeast-1:627614495110:schema/suggest-word-1",
  "ResponseMetadata": {
    "RequestId": "86ef8e9b-6aff-456b-ae36-16232e44b356",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Mon, 02 Jun 2025 11:57:37 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "85",
      "connection": "keep-alive",
      "x-amzn-requestid": "86ef8e9b-6aff-456b-ae36-16232e44b356",
      "strict-transport-security": "max-age=47304000; includeSubDomains",
      "x-frame-options": "DENY",
      "cache-control": "no-cache",
      "x-content-type-options": "nosniff"
    },
    "RetryAttempts": 0
  }
}


### 先ほど作成したスキーマを指定して、データセットグループにInteractionsデータセットを作成する

In [10]:
# Create the INTERACTIONS dataset in the dataset group, bound to the interactions schema.
dataset_type = "INTERACTIONS"
create_dataset_response = personalize.create_dataset(
    schemaArn=interaction_schema_arn,
    datasetGroupArn=dataset_group,
    datasetType=dataset_type,
    name="amazon-pantry-ints",
)

# Display the raw response and remember the dataset ARN for polling and import.
print(json.dumps(create_dataset_response, indent=2))
interactions_dataset = create_dataset_response['datasetArn']

{
  "datasetArn": "arn:aws:personalize:ap-northeast-1:627614495110:dataset/word_suggest_recommendation/INTERACTIONS",
  "ResponseMetadata": {
    "RequestId": "7525e92a-af30-4ce2-9244-eecb080f286d",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Mon, 02 Jun 2025 11:57:37 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "113",
      "connection": "keep-alive",
      "x-amzn-requestid": "7525e92a-af30-4ce2-9244-eecb080f286d",
      "strict-transport-security": "max-age=47304000; includeSubDomains",
      "x-frame-options": "DENY",
      "cache-control": "no-cache",
      "x-content-type-options": "nosniff"
    },
    "RetryAttempts": 0
  }
}


In [11]:
%%time

# Poll the interactions dataset once a minute until it is ACTIVE.
# A failed creation or an expired deadline raises so the notebook stops here
# instead of continuing with an unusable dataset.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_response = personalize.describe_dataset(
        datasetArn = interactions_dataset
    )
    status = describe_dataset_response["dataset"]['status']
    if status == "ACTIVE":
        # This cell waits for dataset creation; the data import is a later step.
        print(f"Dataset create succeeded for {interactions_dataset}")
        break
    if status == "CREATE FAILED":
        raise RuntimeError(f"Create failed for {interactions_dataset}")

    time.sleep(60)
else:
    # Loop condition became false without a break: the deadline passed.
    raise TimeoutError(f"Timed out waiting for {interactions_dataset} (last status: {status})")

Dataset import succeeded for arn:aws:personalize:ap-northeast-1:627614495110:dataset/word_suggest_recommendation/INTERACTIONS
CPU times: user 38 ms, sys: 2.22 ms, total: 40.3 ms
Wall time: 1min


### S3でインタラクションCSVをステージングする
### 先ほど作成したInteractions CSVを先ほど作成したPersonalizeデータセットにアップロードする前に、S3バケットにCSVをステージングする必要がある
### S3バケットを作成し、Interactions CSVファイルをバケットにコピーしよう。

In [12]:
# S3 client used to create the staging bucket and set its policy.
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=REGION_NAME,
)

# The caller's account id makes the bucket name unique per account.
ACCOUNT_ID = boto3.client(
    'sts',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=REGION_NAME,
).get_caller_identity().get('Account')

BUCKET_NAME = f"amazon-pantry-personalize-example-{ACCOUNT_ID}"

print(BUCKET_NAME)

try:
    if REGION_NAME == "us-east-1":
        # us-east-1 is the default location and must not be passed as a LocationConstraint.
        s3.create_bucket(Bucket=BUCKET_NAME)
    else:
        s3.create_bucket(
            Bucket=BUCKET_NAME,
            CreateBucketConfiguration={'LocationConstraint': REGION_NAME}
        )
except s3.exceptions.BucketAlreadyOwnedByYou:
    # Re-running the notebook: the bucket from an earlier run is reused.
    # A bucket owned by another account (BucketAlreadyExists) still raises.
    print(f"Bucket {BUCKET_NAME} already exists and is owned by this account; reusing it")

amazon-pantry-personalize-example-627614495110


### S3 へのアップロード

In [13]:
# Upload the local interactions CSV to the staging bucket, using the file name as the object key.
s3_resource = boto3.Session().resource(
    's3',
    region_name=REGION_NAME,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
staging_object = s3_resource.Bucket(BUCKET_NAME).Object(INTERACTIONS_FILENAME)
staging_object.upload_file(INTERACTIONS_FILENAME)

### S3バケットポリシーとIAMロールの作成
### データセットのインポートジョブをPersonalizeに送信する前に、Personalizeにバケットへのアクセス権を与えるバケットポリシーとIAMロールを作成する必要があります。

In [14]:
# Bucket policy granting the Amazon Personalize service principal access to the
# staging bucket (bucket-level ListBucket) and to the objects inside it.
policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [
        {
            "Sid": "PersonalizeS3BucketAccessPolicy",
            "Effect": "Allow",
            "Principal": {
                "Service": "personalize.amazonaws.com"
            },
            "Action": [
                # NOTE(review): the "*Object" wildcard matches every S3 action whose name
                # ends in "Object" (Put/Delete as well as Get) — confirm this breadth is
                # intended; this notebook itself only has Personalize read the CSV.
                "s3:*Object",
                "s3:ListBucket"
            ],
            "Resource": [
                # Bucket ARN (for ListBucket) and object ARNs (for the object actions).
                f"arn:aws:s3:::{BUCKET_NAME}",
                f"arn:aws:s3:::{BUCKET_NAME}/*",
            ]
        }
    ]
}

# Attach the policy; as the cell's last expression, the response is displayed.
s3.put_bucket_policy(Bucket=BUCKET_NAME, Policy=json.dumps(policy))

{'ResponseMetadata': {'RequestId': 'NCTCC5KYPW7S9YX1',
  'HostId': 'm7wCNIr3+YYKCgejVBVfqjITRhGvu4nGDtdeshXEg9+JtG+pzP4J5fEFK2X1ZAdSyqSu3fQdepc=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': 'm7wCNIr3+YYKCgejVBVfqjITRhGvu4nGDtdeshXEg9+JtG+pzP4J5fEFK2X1ZAdSyqSu3fQdepc=',
   'x-amz-request-id': 'NCTCC5KYPW7S9YX1',
   'date': 'Mon, 02 Jun 2025 11:58:42 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

In [15]:
# IAM client used to create (or look up) the role that Personalize assumes.
iam = boto3.client(
    "iam",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=REGION_NAME,
)

# Trust policy: allows the Personalize service to assume the role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
          "Effect": "Allow",
          "Principal": {
            "Service": "personalize.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }
    ]
}

# NOTE(review): no permissions policy is attached to the role anywhere in this
# notebook — confirm the role already has the S3 access the import job needs.
try:
    create_role_response = iam.create_role(
        RoleName = ROLE_NAME,
        AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    # Only "role already exists" falls back to a lookup; any other error
    # (credentials, throttling, malformed policy) is raised instead of being hidden.
    create_role_response = iam.get_role(RoleName=ROLE_NAME)

In [16]:
# ARN of the role returned by create_role / get_role; passed to the dataset import job.
role_arn = create_role_response["Role"]["Arn"]
print(role_arn)

arn:aws:iam::627614495110:role/PersonalizeRoleAmazonPantryAwsSimilarItems


### インタラクションデータセットのインポート
### S3バケットにあるステージングされたInteractions CSVを、作成したPersonalizeデータセットにインポートする準備を行う

In [17]:
# Start importing the staged CSV from S3 into the interactions dataset,
# with Personalize acting through the IAM role.
create_dataset_import_job_response = personalize.create_dataset_import_job(
    roleArn=role_arn,
    dataSource={"dataLocation": f"s3://{BUCKET_NAME}/{INTERACTIONS_FILENAME}"},
    datasetArn=interactions_dataset,
    jobName="amazon-pantry-interactions-import",
)

# Display the raw response and keep the job ARN for the polling cell.
print(json.dumps(create_dataset_import_job_response, indent=2))
dataset_import_job = create_dataset_import_job_response['datasetImportJobArn']

{
  "datasetImportJobArn": "arn:aws:personalize:ap-northeast-1:627614495110:dataset-import-job/amazon-pantry-interactions-import",
  "ResponseMetadata": {
    "RequestId": "381bc66d-1837-4845-afee-46d8acbdd34e",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Mon, 02 Jun 2025 11:58:43 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "126",
      "connection": "keep-alive",
      "x-amzn-requestid": "381bc66d-1837-4845-afee-46d8acbdd34e",
      "strict-transport-security": "max-age=47304000; includeSubDomains",
      "x-frame-options": "DENY",
      "cache-control": "no-cache",
      "x-content-type-options": "nosniff"
    },
    "RetryAttempts": 0
  }
}


### 相互作用データセットのインポートジョブの完了を待つ
### 次のセルは、インポートジョブが完了するまで待機します。

In [18]:
%%time

# Poll the dataset import job once a minute until it is ACTIVE.
# A failed job or an expired deadline raises so that the recommender is never
# built on top of a dataset that did not import.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = dataset_import_job
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    if status == "ACTIVE":
        print(f"Dataset import succeeded for {dataset_import_job}")
        break
    if status == "CREATE FAILED":
        # failureReason may be absent, hence .get()
        raise RuntimeError(
            f"Create failed for {dataset_import_job}: "
            f"{describe_dataset_import_job_response['datasetImportJob'].get('failureReason')}"
        )

    time.sleep(60)
else:
    # Loop condition became false without a break: the deadline passed.
    raise TimeoutError(f"Timed out waiting for {dataset_import_job} (last status: {status})")

Dataset import succeeded for arn:aws:personalize:ap-northeast-1:627614495110:dataset-import-job/amazon-pantry-interactions-import
CPU times: user 186 ms, sys: 29.9 ms, total: 216 ms
Wall time: 4min 2s


### ソリューションとソリューションバージョンの作成
### 各データセットグループのデータのユーザパーソナライズのレシピを使って、ソリューションとソリューションのバージョンを作成します。
### まず、利用可能なパーソナライズレシピを列挙します。

In [19]:
# Print the name and ARN of every recipe available for the ECOMMERCE domain.
response = personalize.list_recipes(domain='ECOMMERCE')

for recipe_summary in response['recipes']:
    recipe_name = recipe_summary['name']
    recipe_arn = recipe_summary['recipeArn']
    print(f"Name: {recipe_name}, ARN: {recipe_arn}")

Name: aws-ecomm-customers-who-viewed-x-also-viewed, ARN: arn:aws:personalize:::recipe/aws-ecomm-customers-who-viewed-x-also-viewed
Name: aws-ecomm-frequently-bought-together, ARN: arn:aws:personalize:::recipe/aws-ecomm-frequently-bought-together
Name: aws-ecomm-popular-items-by-purchases, ARN: arn:aws:personalize:::recipe/aws-ecomm-popular-items-by-purchases
Name: aws-ecomm-popular-items-by-views, ARN: arn:aws:personalize:::recipe/aws-ecomm-popular-items-by-views
Name: aws-ecomm-recommended-for-you, ARN: arn:aws:personalize:::recipe/aws-ecomm-recommended-for-you


### レシピの選択

### まず、各レシピのソリューションとソリューション・バージョンを作成する。

In [20]:
# sims_create_solution_response = personalize.create_solution(
#     name = "amazon-pantry-sims-solution-example",
#     datasetGroupArn = dataset_group,
#     recipeArn = SIMS_RECIPE_ARN
# )

# sims_solution_arn = sims_create_solution_response['solutionArn']

In [21]:
# sims_solution_version_response = personalize.create_solution_version(
#     solutionArn = sims_solution_arn
# )

In [22]:
# sims_solution_version_arn = sims_solution_version_response['solutionVersionArn']
# print(json.dumps(sims_solution_version_response, indent=2))

### まず、各レシピのrecommenderを作成する。

In [23]:
# Create a recommender for the chosen recipe in the dataset group.
sims_create_recommender_response = personalize.create_recommender(
    recipeArn=SIMS_RECIPE_ARN,
    datasetGroupArn=dataset_group,
    name="amazon-pantry-sims-solution-example",
)

# Despite its name, this variable holds the recommender ARN (not a solution ARN).
sims_solution_arn = sims_create_recommender_response["recommenderArn"]

### レコメンダーがアクティブになるのを待つ
### 最後に、レコメンダーの作成が完了するのを待ちます。このステップでPersonalizeはデータセットと選択されたレシピに基づいて機械学習モデルをトレーニングします。
### また、Personalizeはインタラクションのデータセットをトレーニング部分と評価部分に分割し、トレーニング済みのモデルに対するレコメンデーションの品質を評価できるようにします。
### 説明データを含むデータセットグループのソリューションバージョンは、説明データを含まないものよりもトレーニングに時間がかかることがわかります

In [26]:
%%time

# Poll the recommender once a minute until it is ACTIVE.
# A failed build or an expired deadline raises so that the recommendation cells
# are not run against a recommender that is not ready.
max_time = time.time() + 10*60*60 # 10 hours
while time.time() < max_time:
    version_response = personalize.describe_recommender(
        recommenderArn = sims_solution_arn
    )
    status = version_response["recommender"]["status"]

    if status == "ACTIVE":
        print(f"Build succeeded for {sims_solution_arn}")
        break
    if status == "CREATE FAILED":
        # failureReason may be absent, hence .get()
        raise RuntimeError(
            f"Build failed for {sims_solution_arn}: "
            f"{version_response['recommender'].get('failureReason')}"
        )

    time.sleep(60)
else:
    # Loop condition became false without a break: the deadline passed.
    raise TimeoutError(f"Timed out waiting for {sims_solution_arn} (last status: {status})")

Build succeeded for arn:aws:personalize:ap-northeast-1:627614495110:recommender/amazon-pantry-sims-solution-example
CPU times: user 14.5 ms, sys: 4.2 ms, total: 18.7 ms
Wall time: 190 ms


In [31]:
# Draw one random row from the search log to use as the query example.
sample_df = search_log_df.sample(n=1)
sample_df.head(n=5)

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,EVENT_TYPE
2327,user_016,egg,1709410836,view


In [46]:
# Ask the recommender for three items related to the sampled row's item and user.
get_recommendations_response = personalize_runtime.get_recommendations(
    numResults=3,
    userId=sample_df["USER_ID"].item(),
    itemId=sample_df["ITEM_ID"].item(),
    recommenderArn=sims_solution_arn,
)

In [40]:
# The sampled row's ITEM_ID is the search word examined in the following cells.
search_query = sample_df["ITEM_ID"].item()
print(search_query)

egg


In [41]:
# search_log_count_df was never defined in this notebook, so the cell failed on a
# fresh kernel. Build the per-word counts here (columns ITEM_ID and COUNT, as in the
# stored output).
# NOTE(review): the original definition is not visible — confirm that COUNT is meant
# to be the number of log rows per ITEM_ID.
search_log_count_df = (
    search_log_df
    .groupby("ITEM_ID")
    .size()
    .reset_index(name="COUNT")
)

# "@search_query" lets pandas bind the value, so words containing quotes cannot break the expression.
search_log_count_df.query("ITEM_ID == @search_query")

Unnamed: 0,ITEM_ID,COUNT
11,egg,64


In [57]:
# Distinct TIMESTAMP values of the rows whose ITEM_ID is the search_query word
cleaner_timestamps = search_log_df['TIMESTAMP'][search_log_df['ITEM_ID'].eq(search_query)].unique()

In [58]:
# Every log row that shares a TIMESTAMP with a search for search_query (i.e. co-searched words).
co_searched_df = search_log_df.loc[search_log_df['TIMESTAMP'].isin(cleaner_timestamps)]
co_searched_df.head(n=5)

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,EVENT_TYPE
21,user_009,egg,1704758400,view
22,user_009,flour,1704758400,view
23,user_009,water,1704758400,view
105,user_012,butter,1707609600,view
106,user_012,egg,1707609600,view


In [59]:
# For each recommended item, show the log rows where it was searched at the same
# TIMESTAMP as the query word, followed by the matching query-word rows.
for dic in get_recommendations_response['itemList']:
    recommended_item_id = dic["itemId"]
    print(recommended_item_id)
    target_df = co_searched_df.query("ITEM_ID == @recommended_item_id")

    # Timestamps at which the recommended item appears among the co-searched rows
    timestamps = target_df["TIMESTAMP"].tolist()

    # Rows for the query word at those same timestamps. The word comes from
    # search_query rather than a hard-coded "egg", because it is taken from a
    # random sample and changes between runs.
    egg_df = co_searched_df.query('ITEM_ID == @search_query and TIMESTAMP in @timestamps')

    # Show the recommended-item rows and the query-word rows together
    display(pd.concat([target_df, egg_df]))

strawberry


Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,EVENT_TYPE
1774,user_013,strawberry,1725486631,view
4183,user_021,strawberry,1730511830,view
4806,user_028,strawberry,1721580554,view
1773,user_013,egg,1725486631,view
4182,user_021,egg,1730511830,view
4807,user_028,egg,1721580554,view


bread


Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,EVENT_TYPE
1310,user_003,bread,1729694377,view
4053,user_015,bread,1735035886,view
1309,user_003,egg,1729694377,view
4054,user_015,egg,1735035886,view


bottle


Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,EVENT_TYPE
1645,user_016,bottle,1730379110,view
1646,user_016,egg,1730379110,view


### リソースの削除

In [None]:
# personalize.delete_campaign(campaignArn = campaign_arn)