# Import Data


## Import and Install

In [350]:
%pip install pandas

import pandas as pd
import os
import urllib.request
import zipfile
import numpy as np
from surprise import Dataset, Reader, SVD, KNNBasic, accuracy
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from sklearn.linear_model import LinearRegression
from collections import defaultdict


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Get Data

In [351]:
DATA_FILE = 'ml-latest-small'
DATA_URL = f"https://files.grouplens.org/datasets/movielens/${DATA_FILE}.zip"
DATA_DIR = '../data'

data_path = os.path.join(DATA_DIR, DATA_FILE, ".zip")

if not os.path.exists(data_path):
    urllib.request.urlretrieve(DATA_URL, data_path)
    with zipfile.ZipFile(data_path, 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR)

## Load Data

In [352]:
movies_path = os.path.join(DATA_DIR, DATA_FILE, 'movies.csv')
ratings_path = os.path.join(DATA_DIR, DATA_FILE, 'ratings.csv')
tags_path = os.path.join(DATA_DIR, DATA_FILE, 'tags.csv')
links_path = os.path.join(DATA_DIR, DATA_FILE, 'links.csv')

def load_data():
    """
    Lädt die Daten in DataFrames.

    Returns:
    movies_df : DataFrame
        DataFrame mit den Filmen.
    ratings_df : DataFrame
        DataFrame mit den Bewertungen.
    tags_df : DataFrame
        DataFrame mit den Tags.
    links_df : DataFrame
        DataFrame mit den Links.
    """
    movies_df = pd.read_csv(movies_path)
    ratings_df = pd.read_csv(ratings_path)
    tags_df = pd.read_csv(tags_path)
    links_df = pd.read_csv(links_path)
    
    return movies_df, ratings_df, tags_df, links_df




def load_data_as_dataset(df, reader):
    """
    Lädt die Daten in ein Surprise Dataset.

    Parameters:
    df : DataFrame
        DataFrame mit den Bewertungsdaten.
    reader : Reader
        Ein Reader-Objekt von Surprise.

    Returns:
    full_data : Trainset
        Das vollständige Trainset.
    train_set : list
        Die Trainingsdaten.
    test_set : list
        Die Testdaten.
    """
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
    full_data = data.build_full_trainset()
    train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
    return full_data, train_set, test_set

# Laden der Daten
movies_df, ratings_df, tags_df, links_df = load_data()
print("PandaFrames loaded successfully!")


PandaFrames loaded successfully!


# Data Exploration
What data exploration methods do we need?

# Train Models

## Content-Based
This is a function to train a content based recommendation model



In [353]:
def train_content_based_model(movies, ratings, tags, links) :
    print("Training content-based model")

## Collaborative Filtering - Neighborhood


In [354]:
def train_neighborhood_model(movies, ratings, tags, links) :
    print("Training neighborhood model")

## Collaborative Filtering - Matrix Factorization


In [355]:
def get_ratings_dataset(ratings_df):
    """
    Erstellt ein Surprise Dataset mit den Bewertungen für jeden Benutzer.

    Parameters:
    ratings_df : DataFrame
        DataFrame mit den Bewertungen.

    Returns:
    full_rating_dataset : Surprise Dataset
        Das vollständige Dataset
    train_rating_dataset : Surprise Dataset
        Die Trainingsdaten.
    test_rating_dataset : Surprise Dataset
        Die Testdaten.
    """
    full_rating_dataset, train_rating_dataset, test_rating_dataset = load_data_as_dataset(ratings_df, reader = Reader(line_format="user item rating timestamp", sep=","))
    print("Dataset loaded successfully!")
    return full_rating_dataset, train_rating_dataset, test_rating_dataset

def train_matrix_factorization_model(movies, ratings, tags, links):
    """
    Trainiert ein Matrixfaktorisierungsmodell.

    Parameters:
    movies : DataFrame
        DataFrame mit den Filmen.
    ratings : DataFrame
        DataFrame mit den Bewertungen.
    tags : DataFrame
        DataFrame mit den Tags.
    links : DataFrame
        DataFrame mit den Links.

    Returns:
    svd_model : SVD
        Das trainierte Modell.
    """
    full_rating_dataset, train_rating_dataset, test_rating_dataset = get_ratings_dataset(ratings)
    # Use SVD for item-based collaborative filtering
    svd_model = SVD(random_state=42)  # Set user_based to False for item-based collaborative filtering

    # Train the model on the training set
    svd_model.fit(train_rating_dataset)

    evaluation = evaluate_model(svd_model, test_rating_dataset)
    print(f"RMSE: {evaluation:.2f}")

    return svd_model

def get_prediction_for_user(user_id, movie_id, model):
    """
    Gibt die Vorhersage für einen Benutzer und einen Film zurück.

    Parameters:
    user_id : int
        Die ID des Benutzers.
    movie_id : int
        Die ID des Films.
    model : SVD
        Das trainierte Modell.

    Returns:
    prediction : float
        Die Vorhersage.
    """
    prediction = model.predict(user_id, movie_id).est
    return prediction

def get_top_n_user_recommendations(uid, predictions, n=10):
    """
    Get the top N recommendations for a single user.

    Parameters:
    uid : int
        The user id.
    predictions : list
        The list of predictions.
    n : int, optional
        The number of recommendations to return (default is 10).
    """
    user_ratings = [(iid, est) for uid_pred, iid, true_r, est, _ in predictions if uid_pred == uid]
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    return user_ratings[:n]

def print_top_n_recommendations(predictions, n=10):
    """
    Prints the top N recommendations for each user.

    Parameters:
    predictions : list
        The list of predictions.
    n : int, optional
        The number of recommendations to return for each user (default is 10).
    """
    users = {uid for uid, _, _, _, _ in predictions}
    for uid in users:
        user_ratings = get_top_n_user_recommendations(uid, predictions, n)
        print(uid, [iid for (iid, _) in user_ratings])

def evaluate_model(model, test_set):
    """
    Evaluates the model using RMSE.

    Parameters:
    model : SVD
        The trained model.
    test_set : list
        The test data.

    Returns:
    rmse_score : float
        The RMSE score.
    """
    predictions = model.test(test_set)
    rmse_score = rmse(predictions)

    return rmse_score

# # Train the matrix factorization model
# svd_model = train_matrix_factorization_model(movies_df, ratings_df, tags_df, links_df)

# prediction = get_prediction_for_user(1, 1, svd_model)
# print(f"Prediction for user 1 and movie 1: {prediction:.2f}")

## Train all Models

In [356]:
def train_models(movies, ratings, tags, links) :
    content_model = train_content_based_model(movies, ratings, tags, links)
    neighborhood_model = train_neighborhood_model(movies, ratings, tags, links)
    matrix_model = train_matrix_factorization_model(movies, ratings, tags, links)
    return content_model, neighborhood_model, matrix_model

movies_df, ratings_df, tags_df, links_df = load_data() 

content_model, neighborhood_model, matrix_model = train_models(movies=movies_df, ratings=ratings_df, tags=tags_df, links=links_df)

Training content-based model
Training neighborhood model
Dataset loaded successfully!
RMSE: 0.8807
RMSE: 0.88


# Use Models

## Content-Based

In [357]:
# This function should return a list of recommended items with their scores
# [{'movieId': 1, 'score': 0.5}, {'movieId': 2, 'score': 0.4}, {'movieId': 3, 'score': 0.3}]
def make_content_based_recommendations(user, cinema_movies, model) -> []:
    return []

## Collaborative Filtering - Neighborhood


In [358]:
# This function should return a list of recommended items with their scores
# [{'movieId': 1, 'score': 0.5}, {'movieId': 2, 'score': 0.4}, {'movieId': 3, 'score': 0.3}]
def make_neighborhood_recommendations(user_rated_movies, cinema_movies, model) -> []:
    return []

## Collaborative Filtering - Matrix Factorization


In [362]:
# def make_matrix_factorization_recommendations(user_rated_movies, movie_id, model) -> int:

#     # user = [{'movieId': 1, 'rating': 5}, {'movieId': 2, 'rating': 4}, {'movieId': 3, 'rating': 3}]
#     # Extrahiere die Nutzer- und Item-Matrizen
#     U = model.pu
#     V = model.qi

#     # Beispiel-Bewertungen des neuen Nutzers
#     new_user_ratings = [(1, 5), (2, 3), (4, 1)]  # (item_id, rating)

#     # Extrahiere die relevanten Teile der Item-Matrix und die Bewertungen
#     rated_items = [rating[0] for rating in new_user_ratings]
#     ratings = [rating[1] for rating in new_user_ratings]

#     V_rated = V[rated_items, :]

#     # Lineare Regression um den Nutzer-Vektor zu berechnen
#     reg = LinearRegression().fit(V_rated, ratings)
#     u_new = reg.coef_

#     # Berechne die Bewertung für diesen spezifischen Film
#     film_score = np.dot(V[movie_id], u_new)

#     # Beschränke die Bewertung auf den Bereich von 0 bis 5
#     film_score = np.clip(film_score, 0, 5)

#     print(f"Vorhergesagte Bewertung für den Film mit ID {movie_id} für den neuen Nutzer: {film_score}")
#     return 1

def find_similar_user(user_rated_movies, model, n_similar=1):
    """
    Find a similar user based on the given user's ratings using a pre-trained model.

    Parameters:
    user_rated_movies (list of dicts): List of ratings by the user in the form [{'movieId': int, 'rating': float}].
    model (AlgoBase): The pre-trained Surprise model.
    n_similar (int): The number of similar users to find. Default is 1.

    Returns:
    list: List of similar user IDs.
    """
    # Map movie IDs to the internal item IDs used by the model
    trainset = model.trainset
    temp_user_ratings = [(trainset.to_inner_iid(movie['movieId']), movie['rating'])
                         for movie in user_rated_movies if movie['movieId'] in trainset._raw2inner_id_items]

    # Get the latent factors for the items rated by the temporary user
    q_i = np.array([model.qi[item_id] for item_id, _ in temp_user_ratings])
    r_ui = np.array([rating for _, rating in temp_user_ratings])

    # Calculate the implicit factors (biases can be included if the model uses them)
    user_factors = np.linalg.lstsq(q_i, r_ui, rcond=None)[0]

    # Calculate the similarity of the temporary user to all other users
    similarities = []
    for other_inner_user_id in trainset.all_users():
        other_user_factors = model.pu[other_inner_user_id]
        similarity = np.dot(user_factors, other_user_factors)
        similarities.append((similarity, trainset.to_raw_uid(other_inner_user_id)))

    # Sort the similarities in descending order and get the top n_similar users
    similarities.sort(reverse=True, key=lambda x: x[0])
    similar_users = [uid for _, uid in similarities[:n_similar]]
    
    return similar_users[0]

def make_matrix_factorization_recommendations(user_rated_movies, cinema_movies, model) -> []:
    """
    Erstellt Empfehlungen für einen neuen Benutzer basierend auf Matrixfaktorisierung.

    Parameters:
    user_rated_movies : list
        Die Bewertungen des Benutzers.
    cinema_movies : list
        Die Filme im Kino.
    model : SVD
        Das trainierte Modell.

    Returns:
    results : list
        Die Empfehlungen.
    """
    similar_user = find_similar_user(user_rated_movies, model, n_similar=1)
    print(similar_user)
    results = []

    for movie in cinema_movies:
        res = model.predict(similar_user, movie['movieId'])
        results.append({'movieId': movie['movieId'], 'score': res.est})
  
    return results

user_rated_movies = [{'movieId': 1, 'rating': 5}, {'movieId': 2, 'rating': 4}, {'movieId': 3, 'rating': 3}]
cinema_movies = [{'movieId': 1}, {'movieId': 2}, {'movieId': 3}]
make_matrix_factorization_recommendations(user_rated_movies, cinema_movies, matrix_model)

247


[{'movieId': 1, 'score': 4.372892169275733},
 {'movieId': 2, 'score': 3.6107078434906925},
 {'movieId': 3, 'score': 3.3263155367563724}]

## Hybrid Recommendations

In [360]:
def make_recommendations(user_rated_movies, cinema_movies, content_model, collab_model1, collab_model2) -> []:
    content_based_recommendations = make_content_based_recommendations(user_rated_movies, cinema_movies, content_model)
    neighborhood_recommendations = make_neighborhood_recommendations(user_rated_movies, cinema_movies, collab_model1)
    matrix_factorization_recommendations = make_matrix_factorization_recommendations(user_rated_movies, cinema_movies, collab_model2)

    return []