In [2]:
import pandas as pd
import numpy as np
from scipy import sparse
from tqdm import tqdm
import json

# Notebook-wide defaults read by CollaborativeFiltering.
n_candidates = 19        # length of the candidate list returned by recommend()
n_recommendations = 12   # number of top-similarity entries kept per similarity-row lookup
method = np.dot          # similarity function, applied as method(M, M.T) in fit()
mode = 'user'            # 'user' -> User-User CF; any other value takes the Item-Item branch


In [3]:
def read_json(file):
    """
    Load and return the JSON document stored at ``file``.

    Parameters:
    - file: path of the JSON file to read

    Returns:
    - the decoded Python object (whatever ``json.load`` produces for the file)
    """
    # JSON text is UTF-8; pass the encoding explicitly so non-ASCII content
    # does not depend on the platform's locale encoding.
    with open(file, encoding="utf-8") as f:
        return json.load(f)

# Load the interaction records; field 0 of each record is a ' | '-separated
# string of movie titles.
dataset = read_json("ml_100k.json")

# Give every title a dense integer id in order of first appearance.
# distinct_movies is the inverse lookup (id -> title).
movies_map = {}
distinct_movies = []
for record in dataset:
    for title in record[0].split(' | '):
        if title in movies_map:
            continue
        movies_map[title] = len(distinct_movies)
        distinct_movies.append(title)
n_movies = len(movies_map)
print("Number of different movies: ", n_movies)

Number of different movies:  1493


In [4]:
class CollaborativeFiltering:
    """
    Collaborative Filtering class for User-User or Item-Item predictions.

    The model reads the notebook-level ``dataset``, ``movies_map``,
    ``distinct_movies``, ``n_movies``, ``n_recommendations`` and
    ``n_candidates``; field 0 of every dataset record is a ' | '-separated
    string of movie titles.
    """
    def __init__(self, method=method, mode=mode) -> None:
        """
        Initialize the collaborative filtering model.

        Parameters:
        - method: similarity function, called as method(M, M.T)
        - mode: 'user' for User-User CF; any other value selects Item-Item CF
        """
        self.method = method
        self.mode = mode
        self.S = None  # Similarity matrix, populated by fit()

    def fit(self) -> None:
        """
        Compute the similarity matrix and store it in ``self.S``.

        In 'user' mode the 0/1 interaction matrix has one row per dataset
        record; otherwise it has one row per distinct movie.
        """
        if self.mode == 'user':
            print("Computing User-User CF Similarity Matrix...")
            matrix = np.zeros((len(dataset), n_movies), dtype=int)
            for user_idx, elem in enumerate(dataset):
                for movie in elem[0].split(' | '):
                    matrix[user_idx, movies_map[movie]] = 1
        else:  # mode == 'item'
            print("Computing Item-Item CF Similarity Matrix...")
            matrix = np.zeros((n_movies, len(dataset)), dtype=float)
            for user_idx, elem in enumerate(dataset):
                for movie in elem[0].split(' | '):
                    matrix[movies_map[movie], user_idx] = 1
        self.S = self.method(matrix, matrix.T)
        # NOTE: the diagonal is deliberately left untouched. With np.dot on
        # 0/1 vectors a row's self-similarity is at least as large as every
        # other entry in that row, so the row's own index can take one of the
        # top-similarity slots in recommend(). Its titles are all already
        # watched, so it adds no candidates, but it does count towards the
        # normalising sum in 'user' mode.

    @staticmethod
    def _top_similar(similarity_row):
        """Return the n_recommendations (index, similarity) pairs with the highest similarity."""
        # sorted() is stable, so ties keep ascending index order.
        ranked = sorted(enumerate(similarity_row), key=lambda x: x[-1], reverse=True)
        return ranked[:n_recommendations]

    def recommend(self, userid: int):
        """
        Return up to ``n_candidates`` unwatched movie titles for one user.

        Parameters:
        - userid: index of the user's record in ``dataset``

        Returns:
        - list of titles ordered by descending accumulated score; empty when
          the similarities of the selected neighbours sum to zero ('user' mode)

        Raises:
        - RuntimeError: if fit() has not been called yet
        """
        if self.S is None:
            raise RuntimeError("fit() must be called before recommend()")

        watched = dataset[userid][0].split(' | ')
        watched_set = set(watched)  # O(1) membership checks in the loops below
        movies_score = {}
        if self.mode == 'user':
            similariests = self._top_similar(self.S[userid])
            dvd = sum(similarity for _, similarity in similariests)
            if dvd == 0:
                # No neighbour carries any similarity; normalising would only yield NaN scores.
                return []
            for other_userid, similarity in similariests:
                weight = similarity / dvd
                for movie in dataset[other_userid][0].split(' | '):
                    if movie not in watched_set:
                        movies_score[movie] = movies_score.get(movie, 0.) + weight
        else:  # mode == 'item'
            for movie in watched:
                for movieid, weight in self._top_similar(self.S[movies_map[movie]]):
                    other_movie = distinct_movies[movieid]
                    if other_movie not in watched_set:
                        movies_score[other_movie] = movies_score.get(other_movie, 0.) + weight

        # Stable sort: equally scored titles keep the order in which they were first scored.
        candidate_pairs = sorted(movies_score.items(), key=lambda x: x[-1], reverse=True)
        return [title for title, _ in candidate_pairs[:n_candidates]]

    def evaluate_hit_rate(self, status: bool) -> float:
        """
        Evaluate Hit Rate@n_candidates for the CF model.

        A record counts as a hit when its last field appears in the list
        returned by recommend() for that record's index.

        Parameters:
        - status: when true, print which CF variant is being evaluated

        Returns:
        - hits divided by the number of dataset records (0.0 for an empty dataset)
        """
        if status:
            label = "User-User" if self.mode == 'user' else "Item-Item"
            # The cut-off is n_candidates, so report that value rather than a fixed 10.
            print(f"Computing HR@{n_candidates} using {label} CF...")
        total = len(dataset)
        if total == 0:
            return 0.0
        hits = 0
        for user_id, elem in enumerate(tqdm(dataset)):
            if elem[-1] in self.recommend(user_id):
                hits += 1
        return hits / total

In [5]:
# Create one model per neighbourhood type, then compute each similarity
# matrix (User-User first, Item-Item second).
cf_user = CollaborativeFiltering(mode='user')
cf_item = CollaborativeFiltering(mode='item')
for cf_model in (cf_user, cf_item):
    cf_model.fit()

# Optional hit-rate evaluation:
# print(cf_user.evaluate_hit_rate(True))
# print(cf_item.evaluate_hit_rate(True))

Computing User-User CF Similarity Matrix...
Computing Item-Item CF Similarity Matrix...


In [None]:
# Prompt for step 1 of the three-step conversation.
# str.format placeholders, in order: candidate list, watched list.
temp_1 = """
Candidate Set (candidate movies): {}.
The movies I have watched (watched movies): {}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)? 
Answer: 
"""

# Prompt for step 2: repeats step 1 together with its answer.
# Placeholders, in order: candidate list, watched list, step-1 answer.
temp_2 = """
Candidate Set (candidate movies): {}.
The movies I have watched (watched movies): {}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)? 
Answer: {}.
Step 2: Selecting the most featured movies from the watched movies according to my preferences (Format: [no. a watched movie.]). 
Answer: 
"""

# Prompt for step 3: repeats steps 1 and 2 with their answers, then asks for
# the final recommendations.
# Placeholders, in order: candidate list, watched list, step-1 answer, step-2 answer.
# Note: the step-2 wording here ("at most 5 movies", "in descending order")
# differs from the step-2 wording in temp_2.
temp_3 = """
Candidate Set (candidate movies): {}.
The movies I have watched (watched movies): {}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)? 
Answer: {}.
Step 2: Selecting the most featured movies (at most 5 movies) from the watched movies according to my preferences in descending order (Format: [no. a watched movie.]). 
Answer: {}.
Step 3: Can you recommend 10 movies from the Candidate Set similar to the selected movies I've watched (Format: [no. a watched movie - a candidate movie])?.
Answer: 
"""

In [None]:
# NOTE(review): cand_ids, data_ml_100k, sort_uf_items, user_matrix_sim, num_u,
# total_i, length_limit, open_ai_keys, open_ai_keys_index and the modules
# random, time and openai are not defined by any cell visible here - confirm
# they are created before this cell runs on a fresh kernel.
# NOTE(review): openai.Completion with engine="text-davinci-003" looks like the
# pre-1.0 openai client interface - confirm the installed client version.


def _complete_with_retry(prompt, attempts=5):
    """
    Request a completion for ``prompt``, retrying on any exception.

    Up to ``attempts`` requests are made with max_tokens=512, pausing one
    second after each failure. If all of them fail, the function waits five
    seconds and makes one final request with max_tokens=256; an exception
    raised by that final request propagates to the caller.

    Parameters:
    - prompt: the full prompt text
    - attempts: number of guarded attempts before the final unguarded one

    Returns:
    - the response object returned by openai.Completion.create
    """
    for _ in range(attempts):
        try:
            return openai.Completion.create(
                engine="text-davinci-003",
                prompt=prompt,
                max_tokens=512,
                temperature=0,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                n=1,
            )
        except Exception as e:
            if 'exceeded your current quota' in str(e):
                # NOTE(review): the index increment is commented out, so this
                # re-assigns the key at the current index - confirm whether key
                # rotation was intended.
                # open_ai_keys_index +=1
                openai.api_key = open_ai_keys[open_ai_keys_index]
            time.sleep(1)

    # Every guarded attempt failed: back off, then try once more with a smaller budget.
    time.sleep(5)
    return openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=256,
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        n=1,
    )


count = 0
total = 0
results_data = []
for i in cand_ids[:]:
    elem = data_ml_100k[i]
    seq_list = elem[0].split(' | ')[::-1]

    candidate_items = sort_uf_items(seq_list, user_matrix_sim[i], num_u=num_u, num_i=total_i)
    # Candidate order is randomised before it is shown to the model.
    random.shuffle(candidate_items)

    # Both strings are identical across the three prompts of one user.
    candidates_text = ', '.join(candidate_items)
    history_text = ', '.join(seq_list[-length_limit:])

    # Step 1: summarise the user's preferences.
    input_1 = temp_1.format(candidates_text, history_text)
    response = _complete_with_retry(input_1)
    predictions_1 = response["choices"][0]['text']

    # Step 2: select representative watched movies, given the step-1 answer.
    input_2 = temp_2.format(candidates_text, history_text, predictions_1)
    response = _complete_with_retry(input_2)
    predictions_2 = response["choices"][0]['text']

    # Step 3: final recommendations, given both earlier answers.
    input_3 = temp_3.format(candidates_text, history_text, predictions_1, predictions_2)
    response = _complete_with_retry(input_3)
    predictions = response["choices"][0]['text']

    # A hit is a plain substring match of elem[1] in the step-3 text.
    hit_ = 0
    if elem[1] in predictions:
        count += 1
        hit_ = 1
    total += 1

    print (f"GT:{elem[1]}")
    print (f"predictions:{predictions}")