# Import Data


## Import and Install

In [1]:
%pip install --upgrade pip
%pip install pandas
%pip install scikit-surprise
import pandas as pd
import numpy as np
import os
import urllib.request
import zipfile
from surprise import Dataset, Reader, KNNBasic,accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from collections import defaultdict


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Get Data

In [2]:
DATA_URL = 'https://files.grouplens.org/datasets/movielens/ml-latest.zip'
DATA_DIR = '../data'
DATA_FILE = 'ml-latest.zip'

data_path = os.path.join(DATA_DIR, DATA_FILE)

if not os.path.exists(data_path):
    urllib.request.urlretrieve(DATA_URL, data_path)
    with zipfile.ZipFile(data_path, 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR)

## Load Data

In [3]:
def load_data():
    # Define the paths to the files
    movies_path = os.path.join(DATA_DIR, 'ml-latest-small', 'movies.csv')
    ratings_path = os.path.join(DATA_DIR, 'ml-latest-small', 'ratings.csv')
    tags_path = os.path.join(DATA_DIR, 'ml-latest-small', 'tags.csv')
    links_path = os.path.join(DATA_DIR, 'ml-latest-small', 'links.csv')

    # Load the data into pandas DataFrames
    movies_df = pd.read_csv(movies_path)
    ratings_df = pd.read_csv(ratings_path)
    tags_df = pd.read_csv(tags_path)
    links_df = pd.read_csv(links_path)
    
    return movies_df, ratings_df, tags_df, links_df

def load_data_as_dataset(df, reader):
    """
    Lädt die Daten in ein Surprise Dataset.

    Parameters:
    df : DataFrame
        DataFrame mit den Bewertungsdaten.
    reader : Reader
        Ein Reader-Objekt von Surprise.

    Returns:
    full_data : Trainset
        Das vollständige Trainset.
    train_set : list
        Die Trainingsdaten.
    test_set : list
        Die Testdaten.
    """
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
    full_data = data.build_full_trainset()
    train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
    return full_data, train_set, test_set

# Laden der Daten
movies_df, ratings_df, tags_df, links_df = load_data()

# Data Exploration
What data exploration methods do we need?

# Train Models

## Content-Based
This is a function to train a content based recommendation model



In [4]:
def train_content_based_model(movies, ratings, tags, links) :
    print("Training content-based model")

## Collaborative Filtering - Neighborhood


In [6]:

#Wir müssen bei jedem neuen Nutzer neu trainieren --> nicht effizient
#Das Dataset ist zu groß für die Speicherung in einer Variable -> Speicherprobleme bei großen Datensätzen -> nicht möglich
user = [
    {"movieId": 1, "rating": 5},
    {"movieId": 2, "rating": 3},
]

movie_list = [1, 2, 3, 4, 5, 3114, 78499]

def train_model(ratings):
    from surprise import KNNBasic, Dataset, Reader
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    trainset = data.build_full_trainset()
    model = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
    model.fit(trainset)
    return model

def get_nearest_user(model, user_ratings, ratings):

    # Convert user_ratings to Surprise dataset format
    ratings = pd.DataFrame(user_ratings)
    ratings['userId'] = 'new_user'
    reader = Reader(rating_scale=(1, 5))
    user_data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    user_trainset = user_data.build_full_trainset()
    user_inner_id = user_trainset.to_inner_uid('new_user')

    # Find the nearest neighbor to the new user
    nearest_neighbors = model.get_neighbors(user_inner_id, k=1)
    nearest_user_id = model.trainset.to_raw_uid(nearest_neighbors[0])

    # Return the original userId of the nearest neighbor
    return nearest_user_id


def predict_ratings(model, nearest_user_id, movie_list):
    predictions = []
    for movie_id in movie_list:
        prediction = model.predict(nearest_user_id, movie_id)
        predictions.append((movie_id, prediction.est))
    predictions.sort(key=lambda x: x[1], reverse=True)
    return predictions

# Assuming `ratings_df` is your DataFrame containing the ratings
model = train_model(ratings_df)
nearest_user_id = get_nearest_user(model, user, ratings_df)
print(nearest_user_id)
predicted_ratings = predict_ratings(model, nearest_user_id, movie_list)
print(predicted_ratings)

# Assuming [`ratings_df`](command:_github.copilot.openSymbolFromReferences?%5B%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22d%3A%5C%5Cdev%5C%5Cmovie-recommendation-model%5C%5Cnotebooks%5C%5Cmovie-recommendations.ipynb%22%2C%22_sep%22%3A1%2C%22path%22%3A%22%2Fd%3A%2Fdev%2Fmovie-recommendation-model%2Fnotebooks%2Fmovie-recommendations.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22X10sZmlsZQ%3D%3D%22%7D%2C%7B%22line%22%3A39%2C%22character%22%3A11%7D%5D "d:\dev\movie-recommendation-model\notebooks\movie-recommendations.ipynb") is your DataFrame containing the ratings
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Split the data into training and test set (e.g., 75% training, 25% testing)
trainset, testset = train_test_split(data, test_size=0.25)

# Train the model on the training set
model = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
model.fit(trainset)

# Predict ratings for the test set
predictions = model.test(testset)

# Compute and print the accuracy of the model
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

#RMSE (Root Mean Square Error): Measures the average magnitude of the errors in a set of predictions, without considering their direction. Lower RMSE values indicate better fit.
#MAE (Mean Absolute Error): Measures the average magnitude of the errors in a set of predictions, without considering their direction. It’s a linear score, which means all the individual differences are weighted equally. Lower MAE values indicate better fit.
print(f"RMSE: {rmse}, MAE: {mae}")


Computing the cosine similarity matrix...
Done computing similarity matrix.
2
[(78499, 4.112729059889114), (3114, 3.8492793176134312), (1, 3.8375), (2, 3.613428389547348), (3, 3.3112346367300605), (5, 3.0055588221545286), (4, 2.3582630827718467)]
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9707
MAE:  0.7479
RMSE: 0.9707148710376821, MAE: 0.7479303056189172


## Collaborative Filtering - Matrix Factorization


In [None]:
def train_matrix_factorization_model(movies, ratings, tags, links):
    print("Training matrix factorization model")
    

## Train all Models

In [None]:
def train_models(movies, ratings, tags, links) :
    train_content_based_model(movies, ratings, tags, links)
    train_neighborhood_model(movies, ratings, tags, links)
    train_matrix_factorization_model(movies, ratings, tags, links)

movies_df, ratings_df, tags_df, links_df = load_data()
train_models(movies=movies_df, ratings=ratings_df, tags=tags_df, links=links_df)

Training content-based model
Training neighborhood model
Training matrix factorization model


# Use Models

## Content-Based

In [None]:
# This function should return a list of recommended items with their scores
# [{'movieId': 1, 'score': 0.5}, {'movieId': 2, 'score': 0.4}, {'movieId': 3, 'score': 0.3}]
def make_content_based_recommendations(user, model) -> []:
    return []

## Collaborative Filtering - Neighborhood


In [None]:
# This function should return a list of recommended items with their scores
# [{'movieId': 1, 'score': 0.5}, {'movieId': 2, 'score': 0.4}, {'movieId': 3, 'score': 0.3}]
def make_neighborhood_recommendations(user, model) -> []:
    return []

## Collaborative Filtering - Matrix Factorization


In [None]:
# This function should return a list of recommended items with their scores
# [{'movieId': 1, 'score': 0.5}, {'movieId': 2, 'score': 0.4}, {'movieId': 3, 'score': 0.3}]
def make_matrix_factorization_recommendations(user, model) -> []:
    # add the user to the model
    # get the recommendations for the user
    return []

## Hybrid Recommendations

In [None]:
def make_recommendations(user, content_model, collab_model1, collab_model2) -> []:
    content_based_recommendations = make_content_based_recommendations(user, content_model)
    neighborhood_recommendations = make_neighborhood_recommendations(user, collab_model1)
    matrix_factorization_recommendations = make_matrix_factorization_recommendations(user, collab_model2)

    # Combine the recommendations from the three models
    return []

### Train the models to estimate a score for a recommendation

In [None]:
# user contains information about the user -> details tbd
# probably a list of movies the user has rated and the ratings given

# movie_list contains a list of movie ids
def recommendations_from_list(user, movie_list, content_model, collab_model1, collab_model2):
    scores = {}
    for movie in movie_list:
        try: 
            content_score = content_model.estimate(user, movie)
        except: 
            content_score = 0
        try:
            collab_score1 = collab_model1.estimate(user, movie)
        except: 
            collab_score1 = 0
        try:
            collab_score2 = collab_model2.estimate(user, movie)
        except: 
            collab_score2 = 0

        combined_score = (content_score + collab_score1 + collab_score2) / 3
        scores[movie] = combined_score
        # Create a combined score
        combined_score = (content_score + collab_score1 + collab_score2) / 3
        scores[movie] = combined_score
    # Sort the scores with best recommendations first
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_scores