In [1]:
# Imports
import boto3
import json
import numpy as np
import pandas as pd
import time
import datetime

In [3]:

# Configure the SDK to Personalize:
personalize = boto3.client('personalize')
personalize_runtime = boto3.client('personalize-runtime')

In [4]:
!wget -N https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip -o ml-latest-small.zip

--2023-08-13 15:58:56--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2023-08-13 15:58:56 (4.47 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [6]:
!rm ml-latest-small.zip

rm: cannot remove ‘ml-latest-small.zip’: No such file or directory


In [7]:
!ls ml-latest-small

links.csv  movies.csv  ratings.csv  README.txt	tags.csv


In [10]:
interactions_data = pd.read_csv('./ml-latest-small/ratings.csv')
pd.set_option('display.max_rows', 5)
interactions_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
...,...,...,...,...
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [12]:
interactions_data.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [31]:
interactions_data = interactions_data[interactions_data['rating'] > 3]                # Keep only movies rated higher than 3 out of 5.
interactions_data = interactions_data[['userId', 'movieId', 'timestamp']]
interactions_data.rename(columns = {'userId':'USER_ID', 'movieId':'ITEM_ID', 
                              'timestamp':'TIMESTAMP'}, inplace = True)
interactions_data['EVENT_TYPE']='watch' #Adds an EVENT_TYPE column and an event type of "watch" for each interaction.
interactions_data.head()

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,EVENT_TYPE
0,1,1,964982703,watch
1,1,3,964981247,watch
2,1,6,964982224,watch
3,1,47,964983815,watch
4,1,50,964982931,watch


In [32]:
items_data = pd.read_csv('./ml-latest-small/movies.csv')
items_data.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [69]:
items_data['year'] = items_data['title'].str.extract('.*\((.*)\).*',expand = False)
items_data.head(5)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [70]:
ts= datetime.datetime(2022, 1, 1, 0, 0).strftime('%s')
print(ts)

1640995200


In [72]:
items_data["CREATION_TIMESTAMP"] = ts
items_data
# removing the title
items_data.drop(columns="title", inplace = True)

# renaming the columns to match schema
items_data.rename(columns = { 'movieId':'ITEM_ID', 'genres':'GENRES',
                              'year':'YEAR'}, inplace = True)
items_data

Unnamed: 0,ITEM_ID,GENRES,YEAR,CREATION_TIMESTAMP
0,1,Adventure|Animation|Children|Comedy|Fantasy,1995,1640995200
1,2,Adventure|Children|Fantasy,1995,1640995200
...,...,...,...,...
9740,193587,Action|Animation,2018,1640995200
9741,193609,Comedy,1991,1640995200


In [73]:
# get user ids from the interaction dataset

user_ids = interactions_data['USER_ID'].unique()
user_data = pd.DataFrame()
user_data["USER_ID"]=user_ids
user_data

Unnamed: 0,USER_ID
0,1
1,2
...,...
607,609
608,610


In [75]:
possible_genders = ['female', 'male']
random = np.random.choice(possible_genders, len(user_data), p=[0.5, 0.5])
user_data["GENDER"] = random
user_data

Unnamed: 0,USER_ID,GENDER
0,1,female
1,2,female
...,...,...
607,609,male
608,610,female


In [76]:
# Sets the same region as current Amazon SageMaker Notebook
with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:
    data = json.load(notebook_info)
    resource_arn = data['ResourceArn']
    region = resource_arn.split(':')[3]
print('region:', region)

region: us-east-1


In [82]:
s3 = boto3.client('s3')
account_id = boto3.client('sts').get_caller_identity().get('Account')
bucket_name = account_id + "-" + region + "-" + "personalizemanagedvod"
print('bucket_name:', bucket_name)

try:
    if region == "us-east-1":
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': region}
            )
except s3.exceptions.BucketAlreadyOwnedByYou:
    print("Bucket already exists. Using bucket", bucket_name)

bucket_name: 182863709418-us-east-1-personalizemanagedvod


In [83]:
interactions_filename = "interactions.csv"
interactions_data.to_csv(interactions_filename, index=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(interactions_filename).upload_file(interactions_filename)

items_filename = "items.csv"
items_data.to_csv(items_filename, index=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(items_filename).upload_file(items_filename)

user_filename = "users.csv"
user_data.to_csv(user_filename, index=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(user_filename).upload_file(user_filename)

In [87]:
s3 = boto3.client("s3")
policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [
        {
            "Sid": "PersonalizeS3BucketAccessPolicy",
            "Effect": "Allow",
            "Principal": {
                "Service": "personalize.amazonaws.com"
            },
            "Action": [
                "s3:GetObject",
                "s3:ListBucket"
            ],
            "Resource": [
                "arn:aws:s3:::{}".format(bucket_name),
                "arn:aws:s3:::{}/*".format(bucket_name)
            ]
        }
    ]
}

s3.put_bucket_policy(Bucket=bucket_name, Policy=json.dumps(policy))

{'ResponseMetadata': {'RequestId': 'Z0VYNSCARJ57E3EF',
  'HostId': 'iwIbhhv267YQ4ZRvwYzV6EaxHr/OW7lZG+OSNXOKo/IYFH3RWx615EQSF+ij62+8cGwR9Ts1SkY=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': 'iwIbhhv267YQ4ZRvwYzV6EaxHr/OW7lZG+OSNXOKo/IYFH3RWx615EQSF+ij62+8cGwR9Ts1SkY=',
   'x-amz-request-id': 'Z0VYNSCARJ57E3EF',
   'date': 'Sun, 13 Aug 2023 16:50:25 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

In [96]:
response = personalize.create_dataset_group(
    name='personalize-video-on-demand-ds-groups',
    domain='VIDEO_ON_DEMAND'
)

dataset_group_arn = response['datasetGroupArn']
print(json.dumps(response, indent=2))

{
  "datasetGroupArn": "arn:aws:personalize:us-east-1:182863709418:dataset-group/personalize-video-on-demand-ds-groups",
  "domain": "VIDEO_ON_DEMAND",
  "ResponseMetadata": {
    "RequestId": "95c0c5f0-6a2e-4199-905b-5a9de0672ddd",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Sun, 13 Aug 2023 16:59:28 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "143",
      "connection": "keep-alive",
      "x-amzn-requestid": "95c0c5f0-6a2e-4199-905b-5a9de0672ddd"
    },
    "RetryAttempts": 0
  }
}


In [98]:
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(
        datasetGroupArn = dataset_group_arn
    )
    status = describe_dataset_group_response["datasetGroup"]["status"]
    print("DatasetGroup: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

DatasetGroup: ACTIVE


In [99]:
schema = {
  "type": "record",
  "name": "Interactions",
  "namespace": "com.amazonaws.personalize.schema",
  "fields": [
      {
          "name": "USER_ID",
          "type": "string"
      },
      {
          "name": "ITEM_ID",
          "type": "string"
      },
      {
          "name": "EVENT_TYPE",
          "type": "string"
      },
      {
          "name": "TIMESTAMP",
          "type": "long"
      }
  ],
  "version": "1.0"
}

create_interactions_schema_response = personalize.create_schema(
    name='personalize-demo-interactions-schema',
    schema=json.dumps(schema),
    domain='VIDEO_ON_DEMAND'
)

interactions_schema_arn = create_interactions_schema_response['schemaArn']
print(json.dumps(create_interactions_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-east-1:182863709418:schema/personalize-demo-interactions-schema",
  "ResponseMetadata": {
    "RequestId": "6b65d7c9-830e-4619-8f80-f86bf79ab1a0",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Sun, 13 Aug 2023 17:05:56 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "102",
      "connection": "keep-alive",
      "x-amzn-requestid": "6b65d7c9-830e-4619-8f80-f86bf79ab1a0"
    },
    "RetryAttempts": 0
  }
}


In [100]:

schema = {
  "type": "record",
  "name": "Items",
  "namespace": "com.amazonaws.personalize.schema",
  "fields": [
    {
      "name": "ITEM_ID",
      "type": "string"
    },
    {
      "name": "GENRES",
      "type": [
        "string"
      ],
      "categorical": True
    },
    {
      "name": "YEAR",
      "type": [
        "string"
      ],
      "categorical": True
    }, 
    {
      "name": "CREATION_TIMESTAMP",
      "type": "long"
    }
  ],
  "version": "1.0"
}
create_items_schema_response = personalize.create_schema(
    name='personalize-demo-items-schema',
    schema=json.dumps(schema),
    domain='VIDEO_ON_DEMAND'
)

items_schema_arn = create_items_schema_response['schemaArn']
print(json.dumps(create_items_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-east-1:182863709418:schema/personalize-demo-items-schema",
  "ResponseMetadata": {
    "RequestId": "027ad6fc-237f-4417-b9d3-f549db4b0dac",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Sun, 13 Aug 2023 17:53:29 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "95",
      "connection": "keep-alive",
      "x-amzn-requestid": "027ad6fc-237f-4417-b9d3-f549db4b0dac"
    },
    "RetryAttempts": 0
  }
}


In [101]:
schema = {
    "type": "record",
    "name": "Users",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
      {
          "name": "USER_ID",
          "type": "string"
      },
      {
          "name": "GENDER",
          "type": "string",
          "categorical": True
      }
    ],
    "version": "1.0"
}
create_users_schema_response = personalize.create_schema(
    name='personalize-demo-users-schema',
    schema=json.dumps(schema),
    domain='VIDEO_ON_DEMAND'
)

users_schema_arn = create_users_schema_response['schemaArn']
print(json.dumps(create_users_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-east-1:182863709418:schema/personalize-demo-users-schema",
  "ResponseMetadata": {
    "RequestId": "d2c810ce-ef56-4cd0-b8d0-8e4eeab617c4",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Sun, 13 Aug 2023 17:53:40 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "95",
      "connection": "keep-alive",
      "x-amzn-requestid": "d2c810ce-ef56-4cd0-b8d0-8e4eeab617c4"
    },
    "RetryAttempts": 0
  }
}


In [102]:
dataset_type = "INTERACTIONS"

create_dataset_response = personalize.create_dataset(
    name = "personalize-demo-interactions",
    datasetType = dataset_type,
    datasetGroupArn = dataset_group_arn,
    schemaArn = interactions_schema_arn
)

interactions_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:182863709418:dataset/personalize-video-on-demand-ds-groups/INTERACTIONS",
  "ResponseMetadata": {
    "RequestId": "7ec954a2-d2cd-4d36-83c0-f9b949f22c3c",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Sun, 13 Aug 2023 17:55:05 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "118",
      "connection": "keep-alive",
      "x-amzn-requestid": "7ec954a2-d2cd-4d36-83c0-f9b949f22c3c"
    },
    "RetryAttempts": 0
  }
}


In [103]:
dataset_type = "ITEMS"
create_dataset_response = personalize.create_dataset(
    name = "personalize-demo-items",
    datasetType = dataset_type,
    datasetGroupArn = dataset_group_arn,
    schemaArn = items_schema_arn
)

items_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:182863709418:dataset/personalize-video-on-demand-ds-groups/ITEMS",
  "ResponseMetadata": {
    "RequestId": "ea68ae56-4cbb-4f52-a97f-3a49f23cfe48",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Sun, 13 Aug 2023 17:56:56 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "111",
      "connection": "keep-alive",
      "x-amzn-requestid": "ea68ae56-4cbb-4f52-a97f-3a49f23cfe48"
    },
    "RetryAttempts": 0
  }
}


In [104]:
dataset_type = "USERS"
create_dataset_response = personalize.create_dataset(
    name = "personalize-demo-users",
    datasetType = dataset_type,
    datasetGroupArn = dataset_group_arn,
    schemaArn = users_schema_arn
)

users_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:182863709418:dataset/personalize-video-on-demand-ds-groups/USERS",
  "ResponseMetadata": {
    "RequestId": "d3e2d5fd-c2cc-44e8-801e-ffc04b996193",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Sun, 13 Aug 2023 17:57:04 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "111",
      "connection": "keep-alive",
      "x-amzn-requestid": "d3e2d5fd-c2cc-44e8-801e-ffc04b996193"
    },
    "RetryAttempts": 0
  }
}


In [106]:

iam = boto3.client("iam")

role_name = "PersonalizeRoleVODDemoRecommender"
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
          "Effect": "Allow",
          "Principal": {
            "Service": "personalize.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }
    ]
}

create_role_response = iam.create_role(
    RoleName = role_name,
    AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
)

# AmazonPersonalizeFullAccess provides access to any S3 bucket with a name that includes "personalize" or "Personalize" 
# if you would like to use a bucket with a different name, please consider creating and attaching a new policy
# that provides read access to your bucket or attaching the AmazonS3ReadOnlyAccess policy to the role
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonPersonalizeFullAccess"
iam.attach_role_policy(
    RoleName = role_name,
    PolicyArn = policy_arn
)

# Now add S3 support
iam.attach_role_policy(
    PolicyArn='arn:aws:iam::aws:policy/AmazonS3FullAccess',
    RoleName=role_name
)
time.sleep(60) # wait for a minute to allow IAM role policy attachment to propagate

role_arn = create_role_response["Role"]["Arn"]
print(role_arn)

arn:aws:iam::182863709418:role/PersonalizeRoleVODDemoRecommender


In [107]:
create_interactions_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName = "personalize-demo-import-interactions",
    datasetArn = interactions_dataset_arn,
    dataSource = {
        "dataLocation": "s3://{}/{}".format(bucket_name, interactions_filename)
    },
    roleArn = role_arn
)

dataset_interactions_import_job_arn = create_interactions_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_interactions_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:182863709418:dataset-import-job/personalize-demo-import-interactions",
  "ResponseMetadata": {
    "RequestId": "7df33104-7f19-4aa6-9c98-e6e4e2d91c79",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Sun, 13 Aug 2023 18:03:11 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "124",
      "connection": "keep-alive",
      "x-amzn-requestid": "7df33104-7f19-4aa6-9c98-e6e4e2d91c79"
    },
    "RetryAttempts": 0
  }
}


In [108]:
create_items_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName = "personalize-demo-import-items",
    datasetArn = items_dataset_arn,
    dataSource = {
        "dataLocation": "s3://{}/{}".format(bucket_name, items_filename)
    },
    roleArn = role_arn
)

dataset_items_import_job_arn = create_items_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_items_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:182863709418:dataset-import-job/personalize-demo-import-items",
  "ResponseMetadata": {
    "RequestId": "ff6850e4-9658-4971-acfb-6e16fda92cf6",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Sun, 13 Aug 2023 18:05:11 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "117",
      "connection": "keep-alive",
      "x-amzn-requestid": "ff6850e4-9658-4971-acfb-6e16fda92cf6"
    },
    "RetryAttempts": 0
  }
}


In [109]:
create_users_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName = "personalize-demo-import-users",
    datasetArn = users_dataset_arn,
    dataSource = {
        "dataLocation": "s3://{}/{}".format(bucket_name, user_filename)
    },
    roleArn = role_arn
)

dataset_users_import_job_arn = create_users_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_users_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:182863709418:dataset-import-job/personalize-demo-import-users",
  "ResponseMetadata": {
    "RequestId": "1c1c9dfe-77da-49af-89d4-f467b9dd4390",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Sun, 13 Aug 2023 18:05:19 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "117",
      "connection": "keep-alive",
      "x-amzn-requestid": "1c1c9dfe-77da-49af-89d4-f467b9dd4390"
    },
    "RetryAttempts": 0
  }
}


In [110]:
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = dataset_interactions_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("Interactions DatasetImportJob: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)
    
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = dataset_items_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("Items DatasetImportJob: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)
    
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = dataset_users_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("Users DatasetImportJob: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

Interactions DatasetImportJob: CREATE IN_PROGRESS
Interactions DatasetImportJob: ACTIVE
Items DatasetImportJob: CREATE IN_PROGRESS
Items DatasetImportJob: CREATE IN_PROGRESS
Items DatasetImportJob: CREATE IN_PROGRESS
Items DatasetImportJob: CREATE IN_PROGRESS
Items DatasetImportJob: ACTIVE
Users DatasetImportJob: ACTIVE


In [113]:
available_recipes = personalize.list_recipes(domain='VIDEO_ON_DEMAND') # See a list of recommenders for the domain. 
if (len(available_recipes["recipes"])==0):
    # This is a workaround to get the recipes in case 'available_recipes["recipes"]'does not retrieve them
    available_recipes = personalize.list_recipes(domain='VIDEO_ON_DEMAND', nextToken=available_recipes["nextToken"])
display(available_recipes["recipes"])
    

[{'name': 'aws-vod-because-you-watched-x',
  'recipeArn': 'arn:aws:personalize:::recipe/aws-vod-because-you-watched-x',
  'status': 'ACTIVE',
  'creationDateTime': datetime.datetime(2019, 6, 10, 0, 0, tzinfo=tzlocal()),
  'lastUpdatedDateTime': datetime.datetime(2023, 8, 2, 19, 18, 50, 149000, tzinfo=tzlocal()),
  'domain': 'VIDEO_ON_DEMAND'},
 {'name': 'aws-vod-more-like-x',
  'recipeArn': 'arn:aws:personalize:::recipe/aws-vod-more-like-x',
  'status': 'ACTIVE',
  'creationDateTime': datetime.datetime(2019, 6, 10, 0, 0, tzinfo=tzlocal()),
  'lastUpdatedDateTime': datetime.datetime(2023, 8, 2, 19, 18, 50, 148000, tzinfo=tzlocal()),
  'domain': 'VIDEO_ON_DEMAND'},
 {'name': 'aws-vod-most-popular',
  'recipeArn': 'arn:aws:personalize:::recipe/aws-vod-most-popular',
  'status': 'ACTIVE',
  'creationDateTime': datetime.datetime(2019, 6, 10, 0, 0, tzinfo=tzlocal()),
  'lastUpdatedDateTime': datetime.datetime(2023, 8, 2, 19, 18, 50, 148000, tzinfo=tzlocal()),
  'domain': 'VIDEO_ON_DEMAND'},


In [114]:
create_recommender_response = personalize.create_recommender(
  name = 'more_like_x_demo',
  recipeArn = 'arn:aws:personalize:::recipe/aws-vod-more-like-x',
  datasetGroupArn = dataset_group_arn
)
recommender_more_like_x_arn = create_recommender_response["recommenderArn"]
print (json.dumps(create_recommender_response))

{"recommenderArn": "arn:aws:personalize:us-east-1:182863709418:recommender/more_like_x_demo", "ResponseMetadata": {"RequestId": "2424db17-8156-41f6-af5e-d1303a832bd9", "HTTPStatusCode": 200, "HTTPHeaders": {"date": "Sun, 13 Aug 2023 18:19:27 GMT", "content-type": "application/x-amz-json-1.1", "content-length": "92", "connection": "keep-alive", "x-amzn-requestid": "2424db17-8156-41f6-af5e-d1303a832bd9"}, "RetryAttempts": 0}}


In [116]:
create_recommender_response = personalize.create_recommender(
  name = 'top_picks_for_you_demo',
  recipeArn = 'arn:aws:personalize:::recipe/aws-vod-top-picks',
  datasetGroupArn = dataset_group_arn, 
)
recommender_top_picks_arn = create_recommender_response["recommenderArn"]
print (json.dumps(create_recommender_response))

ResourceAlreadyExistsException: An error occurred (ResourceAlreadyExistsException) when calling the CreateRecommender operation: Another resource with Arn arn:aws:personalize:us-east-1:182863709418:recommender/top_picks_for_you_demo already exists.

In [117]:

%%time

max_time = time.time() + 10*60*60 # 10 hours
while time.time() < max_time:

    version_response = personalize.describe_recommender(
        recommenderArn = recommender_more_like_x_arn
    )
    status = version_response["recommender"]["status"]

    if status == "ACTIVE":
        print("Build succeeded for {}".format(recommender_more_like_x_arn))
        
    elif status == "CREATE FAILED":
        print("Build failed for {}".format(recommender_more_like_x_arn))

    if status == "ACTIVE":
        break
    else:
        print("The More Like X Recommender build is still in progress")
        
    time.sleep(60)
    
while time.time() < max_time:

    version_response = personalize.describe_recommender(
        recommenderArn = recommender_top_picks_arn
    )
    status = version_response["recommender"]["status"]

    if status == "ACTIVE":
        print("Build succeeded for {}".format(recommender_top_picks_arn))
        
    elif status == "CREATE FAILED":
        print("Build failed for {}".format(recommender_top_picks_arn))

    if status == "ACTIVE":
        break
    else:
        print("The Top Pics for You Recommender build is still in progress")
        
    time.sleep(60)

The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More Like X Recommender build is still in progress
The More L

In [118]:
# reading the original data in order to have a dataframe that has both movie_ids 
# and the corresponding titles to make out recommendations easier to read.
items_df = pd.read_csv('./ml-latest-small/movies.csv')
items_df.sample(10)

Unnamed: 0,movieId,title,genres
8017,97858,Mental (2012),Comedy|Drama
1542,2077,"Journey of Natty Gann, The (1985)",Adventure|Children
...,...,...,...
6475,52831,Maniac Cop (1988),Action|Crime|Horror|Thriller
8546,115617,Big Hero 6 (2014),Action|Animation|Comedy


In [119]:
def get_movie_by_id(movie_id, movie_df):
    """
    This takes in an movie_id from a recommendation in string format,
    converts it to an int, and then does a lookup in a specified
    dataframe.
    
    A really broad try/except clause was added in case anything goes wrong.
    
    Feel free to add more debugging or filtering here to improve results if
    you hit an error.
    """
    try:
        return movie_df.loc[movie_df["movieId"]==int(movie_id)]['title'].values[0]
    except:
        print (movie_id)
        return "Error obtaining title"

In [121]:
# First pick a user
test_user_id = "1"

# Select a random item
test_item_id = "8187" #Iron Man 59315, Tangled: 81847

# Get recommendations for the user for this item
get_recommendations_response = personalize_runtime.get_recommendations(
    recommenderArn = recommender_more_like_x_arn,
    userId = test_user_id,
    itemId = test_item_id,
    numResults = 20
)

# Build a new dataframe for the recommendations
item_list = get_recommendations_response['itemList']
recommendation_list = []
for item in item_list:
    movie = get_movie_by_id(item['itemId'], items_df)
    recommendation_list.append(movie)

user_recommendations_df = pd.DataFrame(recommendation_list, columns = [get_movie_by_id(test_item_id, items_df)])

pd.options.display.max_rows = 20
display(user_recommendations_df)

8187


Unnamed: 0,Error obtaining title
0,Forrest Gump (1994)
1,"Shawshank Redemption, The (1994)"
2,Pulp Fiction (1994)
3,"Matrix, The (1999)"
4,Star Wars: Episode IV - A New Hope (1977)
5,Braveheart (1995)
6,Raiders of the Lost Ark (Indiana Jones and the...
7,American Beauty (1999)
8,Jurassic Park (1993)
9,Star Wars: Episode V - The Empire Strikes Back...


In [122]:
users_data_df = pd.read_csv('./users.csv')

def get_gender_by_id(user_id, user_df):
    """
    This takes in a user_id and then does a lookup in a specified
    dataframe.
    
    A really broad try/except clause was added in case anything goes wrong.
    
    Feel free to add more debugging or filtering here to improve results if
    you hit an error.
    """
    return user_df.loc[user_df["USER_ID"]==int(user_id)]['GENDER'].values[0]
    try:
        return user_df.loc[user_df["USER_ID"]==int(user_id)]['GENDER'].values[0]
    except:
        print (user_id)
        return "Error obtaining title"

In [123]:
# First pick a user
test_user_id = "111" # samples users: 55, 75, 76, 111

# Get recommendations for the user
get_recommendations_response = personalize_runtime.get_recommendations(
    recommenderArn = recommender_top_picks_arn,
    userId = test_user_id,
    numResults = 20
)

# Build a new dataframe for the recommendations
item_list = get_recommendations_response['itemList']
recommendation_list = []
for item in item_list:
    movie = get_movie_by_id(item['itemId'], items_df)
    recommendation_list.append(movie)

column_name = test_user_id+" ("+get_gender_by_id(test_user_id, users_data_df)+")"

user_recommendations_df = pd.DataFrame(recommendation_list, columns = [column_name])

pd.options.display.max_rows =20
display(user_recommendations_df)

Unnamed: 0,111 (male)
0,"South Park: Bigger, Longer and Uncut (1999)"
1,Coyote Ugly (2000)
2,Billy Elliot (2000)
3,Bruce Almighty (2003)
4,"Terminal, The (2004)"
5,How to Lose a Guy in 10 Days (2003)
6,Elf (2003)
7,My Big Fat Greek Wedding (2002)
8,Miss Congeniality (2000)
9,Kangaroo Jack (2003)
