In [1]:
import os
import urllib.request
import zipfile

DATA_URL = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
DATA_DIR = '../data'
DATA_FILE = 'ml-latest-small.zip'

data_path = os.path.join(DATA_DIR, DATA_FILE)

if not os.path.exists(data_path):
    urllib.request.urlretrieve(DATA_URL, data_path)
    with zipfile.ZipFile(data_path, 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR)

In [2]:
%pip install pandas

import pandas as pd

# Define the paths to the files
movies_path = os.path.join(DATA_DIR, 'ml-latest-small', 'movies.csv')
ratings_path = os.path.join(DATA_DIR, 'ml-latest-small', 'ratings.csv')
tags_path = os.path.join(DATA_DIR, 'ml-latest-small', 'tags.csv')
links_path = os.path.join(DATA_DIR, 'ml-latest-small', 'links.csv')

# Load the data into pandas DataFrames
movies_df = pd.read_csv(movies_path)
ratings_df = pd.read_csv(ratings_path)
tags_df = pd.read_csv(tags_path)
links_df = pd.read_csv(links_path)

# Display the first few rows of each DataFrame
print(movies_df.head())
print(ratings_df.head())
print(tags_df.head())
print(links_df.head())

Note: you may need to restart the kernel to use updated packages.
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60

In [3]:
%pip install scikit-learn

from sklearn.metrics.pairwise import cosine_similarity

# Create a user-movie matrix
user_movie_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating')

# Compute the cosine similarity matrix using the user-movie matrix
user_similarity = cosine_similarity(user_movie_matrix.fillna(0))

# Convert the array into a DataFrame
user_similarity_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)

# Print the first few rows of the DataFrame
print(user_similarity_df.head())

Note: you may need to restart the kernel to use updated packages.
userId       1         2         3         4         5         6         7    \
userId                                                                         
1       1.000000  0.027283  0.059720  0.194395  0.129080  0.128152  0.158744   
2       0.027283  1.000000  0.000000  0.003726  0.016614  0.025333  0.027585   
3       0.059720  0.000000  1.000000  0.002251  0.005020  0.003936  0.000000   
4       0.194395  0.003726  0.002251  1.000000  0.128659  0.088491  0.115120   
5       0.129080  0.016614  0.005020  0.128659  1.000000  0.300349  0.108342   

userId       8         9         10   ...       601       602       603  \
userId                                ...                                 
1       0.136968  0.064263  0.016875  ...  0.080554  0.164455  0.221486   
2       0.027257  0.000000  0.067445  ...  0.202671  0.016866  0.011997   
3       0.004941  0.000000  0.000000  ...  0.005048  0.004892  0.024992  

In [4]:
def recommend_movies(user_id, num_recommendations):
    # Ähnlichkeitswerte für den Benutzer abrufen
    similarity_scores = user_similarity_df[user_id]

    # Benutzer nach Ähnlichkeit zum Eingabebenutzer sortieren und die Top 10 auswählen
    similar_users = similarity_scores.sort_values(ascending=False).index[1:11]

    # Filme abrufen, die von den Benutzern in den Top 10 gesehen wurden
    similar_users_movies = ratings_df[ratings_df['userId'].isin(similar_users)]

    # Durchschnittliche Bewertung für jeden Film berechnen, der von den ähnlichen Benutzern gesehen wurde
    average_ratings = similar_users_movies.groupby('movieId')['rating'].mean()

    # Filme nach durchschnittlicher Bewertung in absteigender Reihenfolge sortieren
    sorted_average_ratings = average_ratings.sort_values(ascending=False)

    # Die Top 'num_recommendations' Filme abrufen
    top_movies = sorted_average_ratings.head(num_recommendations)

    # Die vom Benutzer gesehenen Filme abrufen und nach Bewertung sortieren
    user_movies = ratings_df[ratings_df['userId'] == user_id].sort_values(by='rating', ascending=False)

    print("Empfohlene Filme:")
    for movie_id in top_movies.index:
        title = movies_df[movies_df['movieId'] == movie_id]['title'].values[0]
        print(title)

    print("\n\nFilme, die vom Benutzer gesehen wurden:")
    for index, row in user_movies.iterrows():
        title = movies_df[movies_df['movieId'] == row['movieId']]['title'].values[0]
        print(f"{title}, Bewertung: {row['rating']}")


# 5 Filme für Benutzer 1 empfehlen
recommend_movies(2, 5)

Empfohlene Filme:
Intouchables (2011)
City of God (Cidade de Deus) (2002)
Super Troopers (2001)
Louis C.K.: Live at the Beacon Theater (2011)
Titanic (1997)


Filme, die vom Benutzer gesehen wurden:
The Jinx: The Life and Deaths of Robert Durst (2015), Bewertung: 5.0
Mad Max: Fury Road (2015), Bewertung: 5.0
Wolf of Wall Street, The (2013), Bewertung: 5.0
Warrior (2011), Bewertung: 5.0
Step Brothers (2008), Bewertung: 5.0
Inside Job (2010), Bewertung: 5.0
Good Will Hunting (1997), Bewertung: 4.5
Dark Knight, The (2008), Bewertung: 4.5
Inglourious Basterds (2009), Bewertung: 4.5
Town, The (2010), Bewertung: 4.5
Tommy Boy (1995), Bewertung: 4.0
Whiplash (2014), Bewertung: 4.0
Louis C.K.: Hilarious (2010), Bewertung: 4.0
Inception (2010), Bewertung: 4.0
Shutter Island (2010), Bewertung: 4.0
Departed, The (2006), Bewertung: 4.0
Talladega Nights: The Ballad of Ricky Bobby (2006), Bewertung: 4.0
Kill Bill: Vol. 1 (2003), Bewertung: 4.0
Gladiator (2000), Bewertung: 4.0
Dark Knight Rises, The 

# Echtes Modell mithilfe von scikits "surprise" 

## Schritt 1: Modell Trainiern und speichern

In [5]:
# Installation der Surprise-Bibliothek
%pip install scikit-surprise

from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
import pickle

# Laden des MovieLens-Datensatzes
reader = Reader(line_format="user item rating timestamp", sep=',', skip_lines=1)
data = Dataset.load_from_file('../data/ml-latest-small/ratings.csv', reader=reader)

# Verwenden von SVD zur Matrixfaktorisierung
svd = SVD()

# Kreuzvalidierung des Modells
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Training des Modells
trainset, testset = train_test_split(data, test_size=0.2)
svd.fit(trainset)

# Speichern des Modells
with open('../models/svd_model.pkl', 'wb') as f:
    pickle.dump(svd, f)



Note: you may need to restart the kernel to use updated packages.
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8652  0.8738  0.8836  0.8722  0.8679  0.8725  0.0063  
MAE (testset)     0.6641  0.6693  0.6796  0.6710  0.6692  0.6706  0.0050  
Fit time          0.51    0.53    0.47    0.49    0.52    0.50    0.02    
Test time         0.13    0.06    0.05    0.09    0.05    0.08    0.03    


## Schritt 2: Laden und Empfehlungen für neue Nutzer generieren

In [6]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pickle

# Laden des gespeicherten Modells
with open('../models/svd_model.pkl', 'rb') as f:
    svd = pickle.load(f)

# Laden der Filmdaten
movies_df = pd.read_csv('../data/ml-latest-small/movies.csv')

# TF-IDF-Vektorisierung der Filmgenres
# tfidf = TfidfVectorizer(stop_words='english')
# tfidf_matrix = tfidf.fit_transform(movies_df['genres'])

# TF-IDF-Vektorisierung der Filmgenres
tfidf = TfidfVectorizer(tokenizer=lambda x: x.split('|'))
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])

# Berechnung der Ähnlichkeitsmatrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Empfehlungen generieren
def get_top_n_recommendations(user_id, svd, movies_df, watched_movies, n=10):
    all_movie_ids = set(movies_df['movieId'])
    watched_movie_ids = set(watched_movies.keys())
    movie_ids_to_predict = all_movie_ids - watched_movie_ids
    
    # Vorhersagen generieren
    predictions = [svd.predict(user_id, str(mid)) for mid in movie_ids_to_predict]
    
    # Sortieren nach vorhergesagter Bewertung
    predictions.sort(key=lambda x: x.est, reverse=True)
    
    # Top-N-Empfehlungen basierend auf kollaborativer Filterung
    top_n_predictions = predictions[:n]
    
    # Inhalte-basierte Filterung
    watched_movie_indices = [movies_df[movies_df['movieId'] == mid].index[0] for mid in watched_movie_ids]
    sim_scores = list(enumerate(cosine_sim[watched_movie_indices].mean(axis=0)))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = [(movies_df.iloc[idx]['movieId'], score) for idx, score in sim_scores if movies_df.iloc[idx]['movieId'] in movie_ids_to_predict]
    
    # Kombinieren der Ergebnisse
    hybrid_recommendations = []
    for movie_id, sim_score in sim_scores:
        for pred in top_n_predictions:
            if int(pred.iid) == movie_id:
                hybrid_recommendations.append((movies_df[movies_df['movieId'] == movie_id].iloc[0]['title'], 
                                               movies_df[movies_df['movieId'] == movie_id].iloc[0]['genres'], 
                                               pred.est * 0.5 + sim_score * 0.5))
                break
        if len(hybrid_recommendations) >= n:
            break
    
    return hybrid_recommendations

# Beispiel: Neue Bewertungen des Benutzers
new_user_id = '99999'  # ID für neuen Benutzer
watched_movies = {112852: 5.0, 2959: 2.0, 89745: 5.0, 122912: 5.0}  # Gesehene Filme und Bewertungen

# Generieren von Empfehlungen basierend auf neuen Bewertungen
top_n_recommendations = get_top_n_recommendations(new_user_id, svd, movies_df, watched_movies)

# Empfehlungen als Tabelle anzeigen
recommendations_df = pd.DataFrame(top_n_recommendations, columns=['Titel', 'Genre', 'Vorhergesagte Bewertung'])
print(f"\nEmpfohlene Filme für Benutzer {new_user_id} nach Bewertung bestimmter Filme:")
display(recommendations_df)





Empfohlene Filme für Benutzer 99999 nach Bewertung bestimmter Filme:


Unnamed: 0,Titel,Genre,Vorhergesagte Bewertung
0,"Great Escape, The (1963)",Action|Adventure|Drama|War,2.338865
1,"Boondock Saints, The (2000)",Action|Crime|Drama|Thriller,2.364824
2,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,2.32534
3,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,2.300769
4,Lawrence of Arabia (1962),Adventure|Drama|War,2.308797
5,"Shawshank Redemption, The (1994)",Crime|Drama,2.282325
6,Goodfellas (1990),Crime|Drama,2.217453
7,Cool Hand Luke (1967),Drama,2.19164
8,Casablanca (1942),Drama|Romance,2.162646
9,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,2.150834
