In [11]:
import csv
import io  # noqa

from surprise import Dataset, get_dataset_dir, KNNBaseline


def read_item_names():
    """Read the u.item file from MovieLens 100-k dataset and return two
    mappings to convert raw ids into movie names and movie names into raw ids.

    As a side effect, every parsed row is also written to
    ``ml100k_items.csv`` in the current working directory. Fields are
    quoted where necessary, so titles containing commas stay in one column.

    Returns:
        tuple: ``(rid_to_name, name_to_rid)``, two dicts mapping raw item id
        to movie name and movie name to raw item id.
    """

    file_name = get_dataset_dir() + "/ml-100k/ml-100k/u.item"
    csv_file = "ml100k_items.csv"
    # One name per "|"-separated field written below (24 in total).
    header = (
        "id",
        "product_id",
        "release_date",
        "video_release_date",
        "IMDb_URL",
        "unknown",
        "Action",
        "Adventure",
        "Animation",
        "Children",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film-Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci-Fi",
        "Thriller",
        "War",
        "Western",
    )

    rid_to_name = {}
    name_to_rid = {}
    with open(file_name, encoding="ISO-8859-1") as f, open(
        csv_file, "w", encoding="ISO-8859-1"
    ) as out:
        # Keep "\n" as the row terminator, as the hand-written export did;
        # the text layer still applies the platform's newline translation.
        writer = csv.writer(out, lineterminator="\n")
        writer.writerow(header)
        for line in f:
            fields = line.rstrip("\n").split("|")
            if len(fields) < 2:
                # Blank or malformed line: no id/title pair to record.
                continue
            raw_id, name = fields[0], fields[1]
            rid_to_name[raw_id] = name
            name_to_rid[name] = raw_id
            writer.writerow(fields)

    return rid_to_name, name_to_rid


def export_transactions_csv(trainset, output_file):
    """Export transactions from the trainset to a CSV file.

    Writes a ``user_id,product_id,rate`` header followed by one row per
    rating, with inner ids converted back to raw ids. Fields are quoted
    when they contain the delimiter or a quote character.

    Args:
        trainset: Object exposing ``n_users``, ``ur`` (inner user id ->
            iterable of ``(inner item id, rating)`` pairs), ``to_raw_uid``
            and ``to_raw_iid``.
        output_file: Path of the CSV file to create or overwrite.
    """
    with open(output_file, "w", encoding="ISO-8859-1") as f:
        # Keep "\n" as the row terminator, as the hand-written export did;
        # the text layer still applies the platform's newline translation.
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(("user_id", "product_id", "rate"))
        for user in range(trainset.n_users):
            # The raw user id is the same for all of this user's ratings.
            raw_user_id = trainset.to_raw_uid(user)
            writer.writerows(
                (raw_user_id, trainset.to_raw_iid(item), rating)
                for item, rating in trainset.ur[user]
            )
                
# Load the built-in MovieLens 100k data, build the full trainset from it
# and dump its ratings to CSV.
data = Dataset.load_builtin("ml-100k")
trainset = data.build_full_trainset()
export_transactions_csv(trainset, "ml100k_transactions.csv")

# Train an item-based (user_based=False) KNN model with the
# pearson_baseline similarity so that item neighbors can be queried.
sim_options = dict(name="pearson_baseline", user_based=False)
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Raw id <-> movie name lookups. Only ids and titles are used from here on;
# the genre columns of u.item play no part in the neighbor search.
rid_to_name, name_to_rid = read_item_names()

# The model works on inner ids, so translate: title -> raw id -> inner id.
toy_story_raw_id = name_to_rid["Toy Story (1995)"]
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Inner ids of the 10 nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Lazily map each neighbor back: inner id -> raw id -> movie name.
toy_story_neighbors = (
    rid_to_name[algo.trainset.to_raw_iid(inner_id)]
    for inner_id in toy_story_neighbors
)

print()
print("The 10 nearest neighbors of Toy Story are:")
for movie in toy_story_neighbors:
    print(movie)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

The 10 nearest neighbors of Toy Story are:
Beauty and the Beast (1991)
Raiders of the Lost Ark (1981)
That Thing You Do! (1996)
Lion King, The (1994)
Craft, The (1996)
Liar Liar (1997)
Aladdin (1992)
Cool Hand Luke (1967)
Winnie the Pooh and the Blustery Day (1968)
Indiana Jones and the Last Crusade (1989)
