In [None]:
# Import necessary libraries
import json
import os
import random
import time

import numpy as np
import openai

# Define the class to hold argument values
class args:
    """Static run configuration, read as class attributes (never instantiated)."""

    # Number of watched movies included in each prompt.
    length_limit = 8
    # Number of candidate movies produced by collaborative filtering.
    num_cand = 19
    # Seed passed to random.seed() for reproducible candidate shuffling.
    random_seed = 2023
    # SECURITY: the API key must never be committed to source. It is read from
    # the OPENAI_API_KEY environment variable instead; any key that was
    # previously hard-coded here must be considered leaked and revoked.
    # None when the variable is unset, in which case the OpenAI client reports
    # the missing credential itself.
    api_key = os.environ.get("OPENAI_API_KEY")

# Seed Python's RNG so the candidate shuffling is reproducible
random.seed(args.random_seed)
rseed = args.random_seed  # kept at module level; reused in the results filename

# Define utility functions for reading and writing JSON files
def read_json(file):
    """Load and return the JSON document stored at ``file``.

    The file is decoded as UTF-8, matching what ``write_json`` produces
    (it writes UTF-8 with ``ensure_ascii=False``). Relying on the platform
    default encoding would corrupt or reject non-ASCII titles on systems
    whose locale is not UTF-8.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded Python object.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file, encoding='utf-8') as f:
        return json.load(f)

def write_json(data, file):
    """Serialize ``data`` as pretty-printed JSON and store it at ``file``.

    The output is UTF-8 text with a two-space indent; non-ASCII characters
    are written verbatim rather than as ``\\uXXXX`` escapes.
    """
    with open(file, mode='w', encoding='utf-8') as handle:
        handle.write(json.dumps(data, ensure_ascii=False, indent=2))

# Load the MovieLens records. Downstream code reads elem[0] of each record as
# a ' | '-separated string of movie titles and elem[-1] as the movie that is
# looked up in the candidate list.
data_ml_100k = read_json("ml_100k.json")

# Set up OpenAI credentials: an explicit client object plus the module-level
# key, both taken from the same configured value
client = openai.OpenAI(api_key=args.api_key)
openai.api_key = args.api_key

# --- User-Item Mapping ---
# Give every distinct movie title a column index, in order of first appearance
u_item_dict = {}
for elem in data_ml_100k:
    for movie in elem[0].split(' | '):
        # setdefault leaves known titles alone; for a new title the next free
        # index is simply the current size of the mapping
        u_item_dict.setdefault(movie, len(u_item_dict))
u_item_len = len(u_item_dict)
u_item_p = u_item_len  # next unused index, i.e. the number of distinct movies
print(u_item_len)

# --- User Interaction Matrix ---
# One row per record, one column per movie; a cell is 1 when the movie occurs
# in that record's watched string and 0 otherwise
user_list = []
for elem in data_ml_100k:
    watched_cols = {u_item_dict[movie] for movie in elem[0].split(' | ')}
    user_list.append([1 if col in watched_cols else 0 for col in range(u_item_len)])
user_matrix = np.array(user_list)

# User-User similarity: entry (a, b) is the number of movies rows a and b share
user_matrix_sim = user_matrix @ user_matrix.T

# --- Movie Popularity Dictionary ---
# Tally the number of occurrences of each movie over all watched strings
pop_dict = {}
for elem in data_ml_100k:
    for movie in elem[0].split(' | '):
        pop_dict[movie] = pop_dict.get(movie, 0) + 1

# --- Item-Item Mapping ---
# For every movie keep a float row with one slot per record, counting how many
# times the movie occurs in that record's watched string. Movies are indexed
# in order of first appearance.
i_item_dict = {}
i_item_id_list = []
i_item_user_dict = {}
n_records = len(data_ml_100k)
for i, elem in enumerate(data_ml_100k):
    for movie in elem[0].split(' | '):
        row = i_item_user_dict.get(movie)
        if row is None:
            # First sighting: allocate the row and register the movie's index
            row = [0.] * n_records
            i_item_user_dict[movie] = row
            i_item_dict[movie] = len(i_item_id_list)
            i_item_id_list.append(movie)
        row[i] += 1
i_item_p = len(i_item_id_list)  # next unused item index

# Stack the per-movie rows, in index order, into the Item-Item Interaction Matrix
i_item_s_list = [i_item_user_dict[item] for item in i_item_id_list]
item_matrix = np.array(i_item_s_list)

# Item-Item similarity: dot product between every pair of movie rows
item_matrix_sim = item_matrix @ item_matrix.T

# Indices of all records
id_list = list(range(n_records))

# --- User Filtering Function ---
# Generate candidate movies using User-User Collaborative Filtering (UUCF)
def sort_uf_items(target_seq, us, num_u, num_i, data=None):
    """Rank unseen movies by similarity-weighted votes of the nearest users.

    The ``num_u`` users with the highest similarity are selected; each one
    votes for every movie in their watched string that is not in
    ``target_seq``, with a weight equal to their share of the selected
    users' total similarity.

    Args:
        target_seq: List of movie titles already watched by the target user;
            these are never returned.
        us: Sequence of similarity scores, one per record in ``data``
            (position ``k`` scores ``data[k]``).
        num_u: Number of most-similar users to consult.
        num_i: Maximum number of candidate titles to return.
        data: Records whose first field is a ``' | '``-separated string of
            titles. Defaults to the module-level ``data_ml_100k``.

    Returns:
        Up to ``num_i`` titles, highest accumulated weight first. Ties keep
        the order in which the titles were first encountered.
    """
    if data is None:
        data = data_ml_100k

    # Set membership keeps the inner loop O(1) per title
    seen = set(target_seq)
    neighbours = sorted(enumerate(us), key=lambda pair: pair[-1], reverse=True)[:num_u]
    total_sim = sum(sim for _, sim in neighbours)

    scores = {}
    for user_idx, sim in neighbours:
        # When every selected similarity is zero there is nothing to
        # normalise by; weight everyone 0 so the candidates are still
        # returned (in encounter order) instead of dividing by zero.
        weight = sim / total_sim if total_sim else 0.0
        for movie in data[user_idx][0].split(' | '):
            if movie not in seen:
                scores[movie] = scores.get(movie, 0.) + weight

    ranked = sorted(scores.items(), key=lambda pair: pair[-1], reverse=True)
    return [movie for movie, _ in ranked[:num_i]]

# --- Item Filtering Function ---
# Generate candidate movies using Item-Item Collaborative Filtering (IICF)
def soft_if_items(target_seq, num_i, total_i, item_matrix_sim, item_dict, item_id_list=None):
    """Rank unseen movies by their summed similarity to the watched movies.

    For every watched movie the ``num_i`` most similar items are looked up;
    each of those that is not itself in ``target_seq`` accumulates the
    corresponding similarity value.

    Args:
        target_seq: List of movie titles already watched; never returned.
        num_i: Number of nearest items consulted per watched movie.
        total_i: Maximum number of candidate titles to return.
        item_matrix_sim: Item-item similarity, indexable as
            ``item_matrix_sim[row][col]``.
        item_dict: Mapping from movie title to its row index in
            ``item_matrix_sim``.
        item_id_list: Inverse of ``item_dict`` (index -> title). Defaults to
            the module-level ``i_item_id_list``.

    Returns:
        Up to ``total_i`` titles, highest accumulated similarity first. Ties
        keep the order in which the titles were first encountered.

    Raises:
        KeyError: If a title in ``target_seq`` is missing from ``item_dict``.
    """
    if item_id_list is None:
        item_id_list = i_item_id_list

    # Set membership keeps the inner loop O(1) per title
    seen = set(target_seq)
    scores = {}
    for movie in target_seq:
        similarities = item_matrix_sim[item_dict[movie]]
        nearest = sorted(enumerate(similarities), key=lambda pair: pair[-1], reverse=True)[:num_i]
        for item_idx, sim in nearest:
            neighbour = item_id_list[item_idx]
            if neighbour not in seen:
                scores[neighbour] = scores.get(neighbour, 0.) + sim

    ranked = sorted(scores.items(), key=lambda pair: pair[-1], reverse=True)
    return [movie for movie, _ in ranked[:total_i]]

# --- Evaluation: Hit@N Metric ---
# For the first 1000 records, check whether the record's last field (elem[-1],
# presumably the held-out next movie) is among the UUCF candidates, and
# remember the indices of the records where it is
results_data_15 = []
length_limit = args.length_limit
total_i = args.num_cand
num_u = 12
cand_ids = []
count = 0
total = 0
for i in id_list[:1000]:
    elem = data_ml_100k[i]
    seq_list = elem[0].split(' | ')
    candidate_items = sort_uf_items(seq_list, user_matrix_sim[i], num_u=num_u, num_i=total_i)
    total += 1
    if elem[-1] not in candidate_items:
        continue
    # Hit: keep this record for the prompting stage
    count += 1
    cand_ids.append(i)
print(f'count/total:{count}/{total}={count * 1.0 / total}')
print('-----------------\n')

# --- LLM Integration ---
# Templates for multi-step prompting. Each template repeats the previous
# steps and their answers, then asks the next question; the `{}` placeholders
# are filled positionally with str.format.

# Step 1 prompt. Placeholders: (1) comma-joined candidate movies,
# (2) comma-joined watched movies.
temp_1 = """
Candidate Set (candidate movies): {}.
The movies I have watched (watched movies): {}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)? 
Answer: 
"""

# Step 2 prompt. Placeholders: (1) candidates, (2) watched movies,
# (3) the model's answer to step 1.
temp_2 = """
Candidate Set (candidate movies): {}.
The movies I have watched (watched movies): {}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)? 
Answer: {}.
Step 2: Selecting the most featured movies from the watched movies according to my preferences (Format: [no. a watched movie.]). 
Answer: 
"""

# Step 3 prompt. Four placeholders; presumably (1) candidates, (2) watched
# movies, (3) step-1 answer, (4) step-2 answer — it is not formatted anywhere
# in the visible code, so confirm against the caller.
temp_3 = """
Candidate Set (candidate movies): {}.
The movies I have watched (watched movies): {}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)? 
Answer: {}.
Step 2: Selecting the most featured movies (at most 5 movies) from the watched movies according to my preferences in descending order (Format: [no. a watched movie.]). 
Answer: {}.
Step 3: Can you recommend 10 movies from the Candidate Set similar to the selected movies I've watched (Format: [no. a watched movie - a candidate movie])?.
Answer: 
"""

# Iterate through promising user sequences and generate LLM predictions.
#
# All requests go through the v1 `client` object. The legacy
# `openai.Completion.create(engine=...)` call no longer exists in the SDK
# version that provides `openai.OpenAI`, `text-davinci-003` has been retired,
# and `gpt-4` is a chat model that the completions endpoint answers with a
# 404. `gpt-3.5-turbo-instruct` is the completions-endpoint model that
# accepts the same plain-text prompts.
COMPLETION_MODEL = "gpt-3.5-turbo-instruct"


def _complete(prompt, max_tokens=512, retries=5):
    """Return the completion text for ``prompt``, retrying transient failures.

    Only rate-limit, connection and server-side errors are retried (with a
    one-second pause); anything else, such as an unknown model or a bad key,
    propagates immediately because retrying cannot help.

    Args:
        prompt: Full prompt text sent to the completions endpoint.
        max_tokens: Upper bound on generated tokens.
        retries: Total number of attempts before giving up.

    Returns:
        The text of the first returned choice.

    Raises:
        openai.OpenAIError: The last transient error once the attempts are
            exhausted, or any non-transient API error.
        ValueError: If ``retries`` is less than 1.
    """
    transient = (
        openai.RateLimitError,
        openai.APIConnectionError,
        openai.InternalServerError,
    )
    for attempt in range(retries):
        try:
            response = client.completions.create(
                model=COMPLETION_MODEL,
                prompt=prompt,
                max_tokens=max_tokens,
                temperature=0,  # deterministic decoding
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                n=1,
            )
        except transient:
            if attempt == retries - 1:
                raise
            time.sleep(1)
        else:
            return response.choices[0].text
    raise ValueError("retries must be at least 1")


count = 0
total = 0
results_data = []
for i in cand_ids:
    elem = data_ml_100k[i]
    # Reverse the watched sequence before slicing, as the prompts expect
    seq_list = elem[0].split(' | ')[::-1]
    candidate_items = sort_uf_items(seq_list, user_matrix_sim[i], num_u=num_u, num_i=total_i)
    # Shuffle so the prompt does not reveal the collaborative-filtering rank
    random.shuffle(candidate_items)

    candidates_text = ', '.join(candidate_items)
    watched_text = ', '.join(seq_list[-length_limit:])

    # Step 1: ask the model to summarise the user's preferences.
    # predictions_1 is assigned on every iteration, so step 2 never sees a
    # missing or stale answer.
    input_1 = temp_1.format(candidates_text, watched_text)
    predictions_1 = _complete(input_1)

    # Step 2: feed the step-1 answer back in together with the same context
    input_2 = temp_2.format(candidates_text, watched_text, predictions_1)
    # (Continue with other prompts as in original code...)

# Save results to a JSON file
file_dir = f"./results_multi_prompting_len{length_limit}_numcand_{total_i}_seed{rseed}.json"
write_json(results_data, file_dir)


1493
count/total:170/943=0.18027571580063625
-----------------



NotFoundError: Error code: 404 - {'error': {'message': 'The model `gpt-4` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}