# Import Data


## Import and Install

In [1]:
%pip install --upgrade pip
%pip install pandas
%pip install scikit-surprise
import pandas as pd
import numpy as np
import os
import urllib.request
import zipfile
from surprise import Dataset, Reader, KNNBasic, accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity



Collecting pip
  Downloading pip-24.1.2-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.1.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.1
    Uninstalling pip-24.1.1:
      Successfully uninstalled pip-24.1.1
Successfully installed pip-24.1.2
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Get Data

In [2]:
DATA_FILE = "ml-latest-small"
DATA_URL = f"https://files.grouplens.org/datasets/movielens/{DATA_FILE}.zip"
DATA_DIR = "../data"

ratings_path = os.path.join(DATA_DIR, DATA_FILE, "ratings.csv")

## Load Data

In [3]:
def _load_ratings_df():
    """
    This function loads the ratings from disk.

    Returns:
    - ratings_df: The ratings dataframe
    """
    # Check if the directory exists, if not, create it
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)

    data_path = os.path.join(DATA_DIR, f"{DATA_FILE}.zip")

    if not os.path.exists(data_path):
        print("Downloading data...")
        urllib.request.urlretrieve(DATA_URL, data_path)
        with zipfile.ZipFile(data_path, "r") as zip_ref:
            zip_ref.extractall(DATA_DIR)

    ratings_df = pd.read_csv(ratings_path)
    return ratings_df


def _load_ratings():
    """
    This function loads the ratings from disk.

    Returns:
    - train_set: The training set
    - test_set: The test set
    """

    ratings_df = _load_ratings_df()
    reader = Reader(line_format="user item rating timestamp", sep=",")
    data = Dataset.load_from_df(ratings_df[["userId", "movieId", "rating"]], reader)
    train_set, test_set = train_test_split(data, test_size=0.3, random_state=42)
    return data, train_set, test_set


full_data, train_set, test_set = _load_ratings()

# Data Exploration
What data exploration methods do we need?

# Train Models

## Collaborative Filtering - Neighborhood


In [4]:
# Wir müssen bei jedem neuen Nutzer neu trainieren --> nicht effizient
# Das Dataset ist zu groß für die Speicherung in einer Variable -> Speicherprobleme bei großen Datensätzen -> nicht möglich
user = [
    {"movieId": 1, "rating": 5, "externalId": "tt0114709", "title": "Toy Story", "year": 1995},
    {"movieId": 2, "rating": 3, "externalId": "tt0113497", "title": "Jumanji", "year": 1995},
    {"movieId": 3, "rating": 4, "externalId": "tt0113228", "title": "Grumpier", "year": 1995},
]

movie_list = [{"movieId": 4, "externalId": "tt0114709", "title": "Toy Story", "year": 1995},
                {"movieId": 5, "externalId": "tt0113497", "title": "Jumanji", "year": 1995},
                {"movieId": 6, "externalId": "tt0113228", "title": "Grumpier", "year": 1995}]


def train_model(trainset):
    model = KNNBasic(k=50, sim_options={"name": "msd", "user_based": False})
    model.fit(trainset)

    return model


def make_neighborhood_based_recommendations(user_ratings, cinema_movies, model):
    """
    This function makes recommendations using neighborhood-based collaborative filtering.

    Args:
    - user_ratings: The user ratings
    - cinema_movies: The cinema movies
    - model: The neighborhood-based model

    Returns:
    - scores: The scores for the cinema movies
    """

    def _find_nearest_neighbors(user_rated_movies, n_similar=1):
        """
        Find a similar user based on the given user's ratings using a pre-trained KNNBasic model.

        Parameters:
        user_rated_movies (list of dicts): List of ratings by the user in the form [{'movieId': int, 'rating': float}].
        model (KNNBasic): The pre-trained Surprise KNNBasic model.
        n_similar (int): The number of similar users to find. Default is 1.

        Returns:
        list: List of similar user IDs.
        """
        # Step 1: Load the Data
        data = pd.read_csv("../data/ml-latest-small/ratings.csv")

        # Step 2: Create a User-Item Matrix
        ratings_matrix = data.pivot_table(
            index="userId", columns="movieId", values="rating", fill_value=0
        )

        # Prepare the new user's ratings
        new_user_ratings = pd.Series(index=ratings_matrix.columns)

        for movie in user_rated_movies:
            movie_id = movie["movieId"]  # Use movieId to match the column
            new_user_ratings[movie_id] = movie["rating"]

        # Convert the Series to a DataFrame to append it
        new_user_df = pd.DataFrame([new_user_ratings.fillna(0)])

        # Append the new user's ratings to the ratings_matrix using pd.concat
        ratings_matrix = pd.concat([ratings_matrix, new_user_df], ignore_index=True)

        # Convert the updated DataFrame to a numpy array for similarity computation
        ratings_matrix_np = ratings_matrix.to_numpy()

        # Compute cosine similarities with the updated matrix
        user_similarities = cosine_similarity(ratings_matrix_np)

        # The new user is the last row in the matrix
        input_user_index = len(ratings_matrix_np) - 1
        input_user_similarity = user_similarities[input_user_index]

        # Ignore the similarity of the user to themselves by setting it to -1
        input_user_similarity[input_user_index] = -1

        # Find the nearest user
        nearest_user_index = np.argmax(input_user_similarity)
        return nearest_user_index

    nearest_user_id = _find_nearest_neighbors(user_ratings, n_similar=1)
    results = []

    for movie in cinema_movies:
        res = model.predict(nearest_user_id, movie["movieId"])
        results.append(
            {
                "movieId": movie["movieId"],
                "score": round(res.est * 20),
                "externalId": movie["externalId"],
                "title": movie["title"],
                "year": movie["year"],
            }
        )

    return results


model = train_model(train_set)
results = make_neighborhood_based_recommendations(user, movie_list, model)
print(results)

Computing the msd similarity matrix...
Done computing similarity matrix.
[{'movieId': 4, 'score': 58, 'externalId': 'tt0114709', 'title': 'Toy Story', 'year': 1995}, {'movieId': 5, 'score': 56, 'externalId': 'tt0113497', 'title': 'Jumanji', 'year': 1995}, {'movieId': 6, 'score': 67, 'externalId': 'tt0113228', 'title': 'Grumpier', 'year': 1995}]


# Evaluation

In [5]:
def _cross_validate_model(dataset, algo, n_splits=5):
    """
    Führt eine Kreuzvalidierung für ein gegebenes Modell mit Surprise durch und berechnet den durchschnittlichen RMSE.

    :param dataset: Der Surprise-Datensatz.
    :param algo: Das Modell, das validiert werden soll.
    :param n_splits: Die Anzahl der Folds für die Kreuzvalidierung.
    :return: Der durchschnittliche RMSE über alle Folds.
    """
    # Führe die Kreuzvalidierung durch
    results = cross_validate(
        algo, dataset, measures=["RMSE", "MAE"], cv=n_splits, verbose=True
    )

    # Durchschnittlicher RMSE über alle Folds
    avg_rmse = results["test_rmse"].mean()
    print(f"Durchschnittlicher RMSE über {n_splits}-Folds: {avg_rmse}")

    # Durchschnittlicher MAE über alle Folds
    avg_mae = results["test_mae"].mean()
    print(f"Durchschnittlicher MAE über {n_splits}-Folds: {avg_mae}")

    return avg_rmse


algo = KNNBasic(k=50, sim_options={"name": "msd", "user_based": False})

# Rufe die Funktion mit dem DataFrame und dem SVD-Modell als Argumenten auf
_cross_validate_model(full_data, algo, n_splits=5)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9111  0.8996  0.9117  0.9101  0.9026  0.9070  0.0050  
MAE (testset)     0.6980  0.6924  0.7021  0.7015  0.6961  0.6980  0.0036  
Fit time          1.52    1.36    1.31    1.33    1.30    1.36    0.08    
Test time         3.27    3.05    3.00    3.19    3.39    3.18    0.14    
Durchschnittlicher RMSE über 5-Folds: 0.9070271570287334
Durchschnittlicher MAE über 5-Folds: 0.6980303290543046


0.9070271570287334

# Grid Search

In [6]:
def _do_a_grid_search(dataset):
    """
    This function does a grid search to find the best hyperparameters for the neighborhood.
    """
    trainset, testset = train_test_split(dataset, test_size=0.2)
    param_grid = {
        "k": [10, 20, 30, 40, 50],
        "sim_options": {
            "name": ["cosine", "pearson", "msd", "pearson_baseline"],
            "user_based": [True, False],
        },
    }
    grid_search = GridSearchCV(KNNBasic, param_grid, measures=["rmse"], cv=3)
    grid_search.fit(dataset)
    print(grid_search.best_params["rmse"])
    print(grid_search.best_score["rmse"])

    # Bewertung auf Testdaten
    best_params = grid_search.best_params["rmse"]
    algo = KNNBasic(k=best_params["k"])
    algo.fit(trainset)
    predictions = algo.test(testset)
    test_rmse = accuracy.rmse(predictions)
    print(f"RMSE auf Testdaten: {test_rmse}")

    model = KNNBasic(**grid_search.best_params["rmse"], random_state=0)
    _cross_validate_model(
        dataset,
        model,
        n_splits=5,
    )


#_do_a_grid_search(full_data)
# RESULTS:
# {'k': 50, 'sim_options': {'name': 'msd', 'user_based': False}}
# RMSE 0.9125206230768649
# RMSE auf Testdaten: 0.9461385511621797
# Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
# RMSE (testset)    0.9009  0.9042  0.9090  0.9145  0.9054  0.9068  0.0046  
# MAE (testset)     0.6966  0.6952  0.7005  0.6997  0.6959  0.6976  0.0021  
# Fit time          2.59    3.08    2.59    2.53    2.57    2.67    0.21    
# Test time         7.03    7.65    6.94    6.93    6.88    7.09    0.29    
# Durchschnittlicher RMSE über 5-Folds: 0.9067951057251864
# Durchschnittlicher MAE über 5-Folds: 0.6975876336839881

## Item Based vs. User Based

In [8]:
algo_item_based = KNNBasic(k=50, sim_options={"name": "msd", "user_based": False})
algo_user_based = KNNBasic(k=50, sim_options={"name": "msd", "user_based": True})

print("Item-based:")
_cross_validate_model(full_data, algo_item_based, n_splits=5)

print("User-based:")
_cross_validate_model(full_data, algo_user_based, n_splits=5)

Item-based:
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9085  0.9016  0.9079  0.9076  0.9108  0.9073  0.0030  
MAE (testset)     0.6991  0.6934  0.7001  0.6973  0.7012  0.6982  0.0027  
Fit time          1.48    1.37    1.29    1.44    1.36    1.39    0.06    
Test time         3.08    2.98    3.02    2.96    3.14    3.04    0.07    
Durchschnittlicher RMSE über 5-Folds: 0.9072672650647838
Durchschnittlicher MAE über 5-Folds: 0.6982253262843163
User-based:
Computing the msd similarity matrix...
Done computing similarity

0.9503214921027773