## Task 3: Evaluierung 

In [30]:
# import functions
from p import stemming_tokenizer, recommendations_final, recommendations

# import libraries
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

## Daten einlesen & bereinigen
Wir lesen hier die Dateien ein und führen sie zusammen anhand der tmdbId --> So bekommen wir die entsprechende movieIds

In [31]:
df_links = pd.read_csv("ml-25m/links.csv")
df_tmdb_movies = pd.read_csv("tmdb_movies.csv", sep="\t")

# rename id column so that we can merge later
df_tmdb_movies.rename(columns={'id':'tmdbId'}, inplace=True)

# merge movielens movies with tmdb movies
df_movies = pd.merge(df_links, df_tmdb_movies, on='tmdbId')
df_movies.head()

Unnamed: 0.1,movieId,imdbId,tmdbId,Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,895,105729,79782.0,0,False,/s2bpgVhpWODDfoADW78IpMDCMTR.jpg,,1783810,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,...,2010-06-11,0,110,"[{'english_name': 'Czech', 'iso_639_1': 'cs', ...",Released,,Venice,False,7.0,13
1,895,105729,79782.0,13349,False,/s2bpgVhpWODDfoADW78IpMDCMTR.jpg,,1783810,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,...,2010-06-11,0,110,"[{'english_name': 'Czech', 'iso_639_1': 'cs', ...",Released,,Venice,False,7.0,13
2,181393,1684935,79782.0,0,False,/s2bpgVhpWODDfoADW78IpMDCMTR.jpg,,1783810,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,...,2010-06-11,0,110,"[{'english_name': 'Czech', 'iso_639_1': 'cs', ...",Released,,Venice,False,7.0,13
3,181393,1684935,79782.0,13349,False,/s2bpgVhpWODDfoADW78IpMDCMTR.jpg,,1783810,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,...,2010-06-11,0,110,"[{'english_name': 'Czech', 'iso_639_1': 'cs', ...",Released,,Venice,False,7.0,13
4,1115,114472,141210.0,1,False,,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...",,...,2012-10-12,0,6,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,The Sleepover,False,6.6,8


Hier extrahieren wir die relevanten Daten aus den angegebenen Spalten, weil sie dort verschachtelt sind.

In [32]:
# extract data from dictionaries and list, separate key values by '|'

def extract_values(data):
    return data.apply(lambda x: '|'.join([i['name'] for i in eval(x)]))

df_movies['genres'] = extract_values(df_movies["genres"])
df_movies['spoken_languages'] = extract_values(df_movies["spoken_languages"])
df_movies['production_companies'] = extract_values(df_movies['production_companies'])
df_movies['production_countries'] = extract_values(df_movies['production_countries'])

Hier haben wir die Spalten definiert und geordnet, die wir benötigen (alle anderen wurden automatisch entfernt)

In [33]:
col_order = [
    "movieId", 
    "title", 
    "genres", 
    "overview", 
    "release_date", 
    "runtime", 
    "original_language",
    "spoken_languages", 
    "production_companies",
    "production_countries",
]

df_movies = df_movies[col_order]

Wir entfernen Filme mit einer Laufzeit von unter 30 Minuten und über 300 Minuten, da wir für unseren Recommender keine Kurz - oder Langfilme beachten wollen.

In [34]:
# drop rows with runtime under 30 --> remove short films because they are not relevant in our opinion
df_movies = df_movies.drop(df_movies[df_movies["runtime"] < 30].index)

# drop rows with runtime over 300 --> remove long films because they are not relevant in our opinion
df_movies = df_movies.drop(df_movies[df_movies["runtime"] > 300].index)

Hier wurden die Daten weiter bereinigt; MovieId Duplikate und leere Filmbeschreibungen entfernt 

In [35]:
# remove duplicate rows (there are some duplicate movies with different movieId's, thats why we use difference)
df_movies = df_movies.drop_duplicates(subset = df_movies.columns.difference(["movieId"]))

# remove all overviews that are nan
df_movies = df_movies.dropna(subset=["overview"])

Jetzt exportieren wir den bereinigten Dataframe, damit wir die enthaltenen Daten später bei der explorativen Datenanalyse untersuchen können.

In [36]:
# export df_movies to csv 
df_movies.to_csv('ml-25m/movies_clean.csv', index = False)

#### Movie Ratings
Hier lesen wir die Filmbewertungen ein.

So können wir dann Bewertungen plausibilisieren --> falls diese Daten unmöglich sind (zb eine Bewertung wurde vor der Veröffentlichung vom Film erstellt), werden sie gelöscht.

In [37]:
df_ratings = pd.read_csv("ml-25m/ratings.csv")

Wir haben Filme von der TMDB API abgefragt und gespeichert (df_movies) und wir wollen jetzt nur die Bewertungen behalten, deren Filme dich in df_movies befinden. Darum schauen wir, ob die movieIds übereinstimmen, falls ja dann werden diese Bewertungen beibehalten. Alle andere Bewertungen werden automatisch gelöscht, da sie irrelevant sind.

In [38]:
# only keep movie ratings where the movieId is in df_movies
df_ratings = df_ratings[df_ratings["movieId"].isin(list(df_movies["movieId"]))]

Danach konvertieren wir die "timestamp" Spalte zu einem "datetime" Objekt und speichern das in einer neuen Spalte namens "date". Die Spalte "timestamp" wird anschliessend gelöscht.

In [39]:
# convert timestamp to datetime
df_ratings["date"] = [datetime.fromtimestamp(x) for x in df_ratings["timestamp"]]

# drop timestamp column
df_ratings.drop("timestamp", axis = 1, inplace = True)

Jetzt werden beide Dataframes zusammengeführt.

In [40]:
# merge movies and ratings on movieId
df_movies_ratings = pd.merge(df_ratings, df_movies, on = "movieId")

Wir müssen jetzt die Daten plausibilisieren.

Wir setzen dies um, indem wir Bewertungen löschen, welche vor dem Erscheinungsdatum abgegeben wurden.

In [41]:
# drop rows where the ratings were submitted before the movie released
df_movies_ratings = df_movies_ratings[df_movies_ratings["date"] > df_movies_ratings["release_date"]]

# drop columns because they are useless now
df_movies_ratings.drop(["date", "release_date"], axis = 1, inplace = True)

Hier ist ein weiteres Beispiel von Daten zu plausibilisieren; Wir schauen, ob es Users gibt, welche den gleichen Film mehrmals bewertet haben.

In [42]:
# No user has rated the same movie twice
df_movies_ratings[df_movies_ratings.duplicated(subset = ["userId", "movieId"])]

Unnamed: 0,userId,movieId,rating,title,genres,overview,runtime,original_language,spoken_languages,production_companies,production_countries


Export von Datenbanken für seperates EDA ipynb.

In [43]:
df_movies_ratings.to_csv('ml-25m/df_movies_ratings.csv', index=False)

### NLP Implementierung

Was macht tf-idf mit Wörter, die überall vorkommen? 
- diese Wörter werden tief gewichtet

Was passiert mit Wörter die sehr selten sind? kann das Wort überahupt zur Ähnlichkeit beitragen?
- Wörter, die selten vorkommen haben ein höheres Score (höher bewertet)

Synonyme: Word2Vec (zb Auto --> PKW)

Verteilung der Ähnlichkeiten

aus dem text vektoren machen

was passiert mit dem cosinus similarity wenn wir die seltesten wörter abschneiden? besser? schlechter?

auf grund der häufigkeit matrix reduzieren

Source: https://medium.com/@cmukesh8688/tf-idf-vectorizer-scikit-learn-dbc0244a911a

Source: https://medium.com/@cmukesh8688/tf-idf-vectorizer-scikit-learn-dbc0244a911a

 Welche Attribute wollen wir für die Berechnung verwenden?
- Titel
- Overview
- Original Language
- Spoken Language
- Production Companies
- Production Countries

Für grössere Funktionen haben wir einen "helper" Datei erstellt und importieren die Funktionen.

Bevor man mit einem NLP Algorithmus beginnt, müssen die Daten zuerst darauf vorbereitet werden (Pre-processed) - Dafür haben wir im File p.py eine Funktion "stemming_tokenizer" erstellt, die verschiedene Varianten eines Wortes auf ihren gemeinsamen Wortstamm zurückführt. Somit können einzelne Wörter und auch ganze Texte besser verglichen werden. Diese Funktion löscht zudem auch noch unnötige Symbole wie Kommas und Punkte und auch "stop words". Das sind z.B. Wörter wie "the", "a" ect.

In [46]:
# copy dataframe to avoid running the whole notebook again
df_movies_nlp = df_movies.copy().reset_index(drop = True)

# loop through the chosen columns and stem the words
for col in ["overview", "spoken_languages", "original_language", "production_companies", "production_countries"]:
    df_movies_nlp[col] = stemming_tokenizer(df_movies_nlp[col])

# create new title column with stemmed title (we don't want to stem the original title)
df_movies_nlp["new_title"] = stemming_tokenizer(df_movies_nlp["title"])

Wir programmieren eine Funktion, welche zufällig die Anzahl angegebene Spalten hinzufügt, um systematisch herauszufinden, welche Kombination von Spalten (Daten) die höchsten Scores / besten Vorschläge für die Filme hat. Die Funktion heisst "create_combinations" und ist im File "helper_file.py"

Hier rufen wir die Funktion auf und dabei wird ein Dataframe erstellt, die alle Filme mit 10 Filmvorschlägen ausgibt. Das Dataframe wird danach als csv exportiert und wieder eingelesen, damit wir die Funktion nicht immer ausführen müssen.

In [12]:
#df_recommendations = recommendations(df_movies_nlp, df_movies)

#df_recommendations.sort_values(by = ["movie", "score_1"], ascending = False)

# export dataframe to csv
#df_recommendations.to_csv("rec.csv", index = False)

In [13]:
rec = pd.read_csv("rec.csv", index_col = "Unnamed: 0")
rec.head()

Unnamed: 0,combination,movie,recommended_movie_1,score_1,recommended_movie_2,score_2,recommended_movie_3,score_3,recommended_movie_4,score_4,...,recommended_movie_6,score_6,recommended_movie_7,score_7,recommended_movie_8,score_8,recommended_movie_9,score_9,recommended_movie_10,score_10
0,"['new_title', 'overview', 'spoken_languages', ...",Venice,Panic Attack,0.154583,7 Things You Don't Know About Men,0.14871,Embassy,0.148499,Letters to Santa,0.139134,...,Women of Mafia 2,0.137449,Gods,0.134155,Hardkor Disko,0.130372,Breakfast in Bed,0.130297,Walpurgis Night,0.129467
1,"['new_title', 'overview', 'spoken_languages', ...",A Place at the Table,Food and Shelter,0.229952,Food Stamped,0.085962,The Anonymous People,0.084057,Mixed Doubles,0.074443,...,Food Choices,0.071564,Farmageddon,0.063199,Growing Cities,0.062772,Eating Animals,0.062322,Permanent,0.061547
2,"['new_title', 'overview', 'spoken_languages', ...",Kingdom Come,"Chronic-Con, Episode 420: A New Dope",0.069982,Kingdom Come,0.069428,Kevin Smith: Too Fat For 40,0.061588,Rats,0.052913,...,Barbie: Princess Charm School,0.04755,The Special Relationship,0.045128,"The Death of ""Superman Lives"": What Happened?",0.045053,Selma,0.042078,Wonder,0.042025
3,"['new_title', 'overview', 'spoken_languages', ...","Camille Claudel, 1915",Rodin,0.173721,Ducoboo 2: Crazy Vacation,0.124234,Oh Mercy,0.117102,Camille Rewinds,0.108202,...,Trophy Wife,0.096784,School Life,0.093823,House of Pleasures,0.091848,Eva,0.091395,My Golden Days,0.090107
4,"['new_title', 'overview', 'spoken_languages', ...",My Kingdom,White Vengeance,0.09808,The Final Master,0.080325,The Looming Storm,0.06923,Touch of the Light,0.066363,...,Buddha Mountain,0.058536,Bangkok Revenge,0.057755,The Black Devil and the White Prince,0.057549,Sword Master,0.056973,Master Z: Ip Man Legacy,0.056659


Wir erstellen hier eine neue Spalte, um die unique Kombinationen indexieren zu können.

In [14]:
# # create new column based on groupby combinations
rec["set_combination"] = rec.groupby("combination").ngroup()
rec["set_combination"].head()

Von allen Kombinationen zeigen wir die Verteilung von den Scores vom Recommended Movie 1

In [15]:
# facet wrap density plot for each combination
g = sns.FacetGrid(rec, col = "set_combination", col_wrap = 4, height = 3, aspect = 1.5)
g.map(sns.kdeplot, "score_1", shade = True)
g.set_titles(col_template = "{col_name}")
g.set_axis_labels("Score", "Density")
g.fig.suptitle("Density Plot of Score_1 by Combination", y = 1.05)
plt.show()

Test auf Film "Fast & Furious 6"

In [16]:
# # create subset for "fast & furious 6"
rec_fast6 = rec[rec["movie"] == "Fast & Furious 6"]

# # display row with the highest score_1
rec_fast6[rec_fast6["score_1"] == rec_fast6["score_1"].max()]

### Kombination Entscheidung
Wir entscheiden uns für die Kombination 19, weil die Verteilung die höchsten Werte annimmt. Overview kommt 1 Mal vor, das bedeutet, dass die Wörter im Overview höcher bewertet werden (das Ziel ist es ja eher anhand des Overviews Filme vorzuschlagen)

In [136]:
# cols from combination 19
cols = ['new_title', 'overview', 'spoken_languages', 'original_language', 'production_companies', 'production_countries', 'new_title', 'new_title', 'overview', 'spoken_languages', 'spoken_languages', 'spoken_languages', 'spoken_languages', 'original_language', 'original_language', 'original_language', 'original_language', 'original_language', 'original_language', 'original_language', 'production_companies', 'production_companies', 'production_companies', 'production_companies', 'production_companies', 'production_companies', 'production_companies', 'production_companies', 'production_companies', 'production_companies', 'production_countries', 'production_countries', 'production_countries', 'production_countries', 'production_countries', 'production_countries', 'production_countries', 'production_countries']

## LSA
LSA (LAtent Semantic Analysis) auch bekannt als LSI (Latent Semantic Index) verwendet ein- Bag-of-Word-Modell (BoW), das zu einer Term-Dokument-Matrix führt (Vorkommen von Begriffen in einem Dokument)

Die Zeilen stehen für Begriffe und die Spalten für Dokumente. LSA lernt latente Themen, indem eine Matrixzerlegung der Dokument-Term-Matrix unter Verwendung der Singulärwertzerlegung durchgeführt wird. LSA Wird in der Regel zur Dimensionsreduzierung eingesetzt.

### SVD (Singular Value Decomposition) - Singulärwertzerlegung
SVD ist eine Matrixfaktorisierungsmethode, die eine Matrix als Produkt zweier Matrizen darstellt.

A: Input data matrix
m x n matrix (m documents, n terms)

U: Left singular vectors
m x r matrix (m documents, r concepts)

Sigma: Singular values
r x r diagonal matrix

V: Right singular vectors
n x r matrix (n terms, r concepts)

In [None]:
# call funtion in p.py
from p import lsa_final

recommendations_lsa = lsa_final(["Fast & Furious 6", "Interstellar"], df_movies_nlp, cols)
print(recommendations_lsa)

## Doc2Vec

Vector size: The vector size is the length of the learned word vectors. A larger vector size can capture more information about the words and documents, but may also require more computational resources and data to train.

Window size: The window size is the number of words around a target word that are used to predict the target word. A larger window size can capture more context for each word, but may also require more data to train.

Number of epochs: The number of epochs is the number of times the model is trained on the dataset. A larger number of epochs can improve the model's performance, but may also increase the training time.

Minimum word count: The minimum word count is the minimum number of times a word must appear in the dataset to be included in the model. A higher minimum word count can reduce the size of the model and improve its performance, but may also exclude useful information.

Sample: This parameter controls the downsampling of frequent words during training. Words that appear very frequently in the dataset, such as stop words, can dominate the training process and reduce the model's ability to learn meaningful embeddings for other words. Setting a high sample value can help mitigate this issue.

In [129]:

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.model_selection import train_test_split
from gensim.utils import simple_preprocess
from sklearn.model_selection import cross_val_score

# Create a list of TaggedDocument objects
documents = [TaggedDocument(words=row['overview'], tags=[row['title']]) for _, row in df_movies_nlp.iterrows()]

# Split movie data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(documents, df_movies_nlp['title'], test_size=0.2)

In [131]:
def create_doc2vec_model(documents, n_vector_size, n_window, n_epochs):
    # Create a Doc2Vec model with vector size of 150
    model = Doc2Vec(documents, vector_size=150, window=10, min_count=1, epochs=100, sample=1e-5, negative=5, workers=4)
    # Build a Volabulary
    model.build_vocab(documents)
    # Train the model
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

    return model

# model = create_doc2vec_model(X_train)

In [134]:

def get_recommendations(model, df, cols, title, n=5):

    df["doc2vec"] = df[cols].apply(" ".join, axis=1)

    if title not in df_movies_nlp['title'].values:
        return []

    # Find the index of the movie
    idx = df[df['title'] == title].index[0]

    # Pre-process the movie description
    words = simple_preprocess(df.loc[idx, 'doc2vec'])
    
    # Get the movie embedding
    movie_vec = model.infer_vector(words)
    
    # Find the most similar movies
    sims = model.docvecs.most_similar([movie_vec], topn=n)
    
    # Return the titles of the most similar movies
    return sims


#print(get_recommendations(df_movies_nlp, cols, "Fast & Furious 6"))

In [135]:
for i in range(10):
    # create model and change parameters
    vs = 100
    window = 5
    epochs = 20

    model = create_doc2vec_model(X_train, vs, window, epochs)

    print(get_recommendations(model, df_movies_nlp, cols, "Fast & Furious 6"))

    vs += 5

    window += 5

    epochs += 20



  sims = model.docvecs.most_similar([movie_vec], topn=n)


[('The Garden of Sinners: Recalled Out Summer', 0.23893490433692932), ('Romans', 0.22142668068408966), ('Tsukiji Wonderland', 0.21877647936344147), ('The Honor List', 0.21861490607261658), ('Skin Trade', 0.21815046668052673)]
[('El hada buena - Una fábula peronista', 0.2569657862186432), ('Rejoice and Shout', 0.2566133141517639), ('The D Train', 0.2507208287715912), ('12 Citizens', 0.24935105443000793), ('Chef vs. Science: The Ultimate Kitchen Challenge', 0.24759642779827118)]
[('The Jetsons & WWE: Robo-WrestleMania!', 0.23363178968429565), ('I, Dalio', 0.2315162867307663), ('Alien Opponent', 0.22672204673290253), ('LOL', 0.22607222199440002), ('That Summer', 0.21804308891296387)]
[('Love and Other Cults', 0.22718995809555054), ("The Night Visitor 2: Heather's Story", 0.21141240000724792), ('Janeane Garofalo: If You Will', 0.20786845684051514), ('Claire Darling', 0.2065325230360031), ('Chaplin in Bali', 0.20568905770778656)]
[('Chasing Tyson', 0.2671651244163513), ('Come As You Are', 0

## Train Test Split
Wir nehmen den Datensatz und schneiden einen Teil davon ab. Danach nehmen wir alle Filme und pro User nehmen wir 50 Ratings ins Training und den Rest ins Test

for every user, take the movies that the user liked (rating 3.5+)
--> search for movies that are similar to the ones this user liked --> recommend 

In [19]:
# get users with more than 50 ratings
users = df_movies_ratings["userId"].value_counts()[df_movies_ratings["userId"].value_counts() > 50].index.tolist()

df_ratings_50 = df_movies_ratings[df_movies_ratings["userId"].isin(users)]

In [None]:
train = df_ratings_50.groupby("userId").sample(frac=0.8, random_state=42)
test =df_ratings_50.drop(train.index)

Wir erstellen hier eine User-Item-Matrix mit dem userId als Index und movieId als Spalten und die jeweiligen Filmbewertungen als Werte vom Train

Warum machen wir das?

In [21]:
# create user-item-movie matrix with userid as index and movieid as columns and rating as values
df_user_movie_matrix_train = train.pivot(index = "userId", columns = "movieId", values = "rating").fillna(0)

Jetzt wird aus dem User-Item-Matrix eine bnäre User-Liked-Matrix erzeugt, sprich alle Filmbewertungen über 3 werden zu 1 (liked) und alle Filmbewertungen gleich und unter 3 werden zu 0 (not liked)

Warum machen wir das?

In [22]:
# if rating is greater than 3, set value to 1, else set value to 0
df_user_movie_matrix_train[df_user_movie_matrix_train <= 3] = 0
df_user_movie_matrix_train[df_user_movie_matrix_train > 3] = 1

df_user_movie_matrix_train

movieId,895,2679,4249,4484,5904,47237,47962,71677,72491,73319,...,209053,209063,209073,209085,209119,209121,209129,209131,209133,209163
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# create user-item-movie matrix with userid as index and movieid as columns and rating as values
df_user_movie_matrix_test = test.pivot(index = "userId", columns = "movieId", values = "rating").fillna(0)

In [None]:
# if rating is greater than 3, set value to 1, else set value to 0
df_user_movie_matrix_test[df_user_movie_matrix_test <= 3] = 0
df_user_movie_matrix_test[df_user_movie_matrix_test > 3] = 1

df_user_movie_matrix_test

In [None]:
# get all liked train movies per user
liked_movies_train = {}
for user in df_user_movie_matrix_train.index:
    liked_movies_train[user] = df_user_movie_matrix_train.columns[df_user_movie_matrix_train.loc[user] == 1].tolist()

In [None]:
# wich users have empty liked movies
empty_liked_movies_train = []
for user in liked_movies_train:
    if len(liked_movies_train[user]) == 0:
        empty_liked_movies_train.append(user)

In [None]:
## get all liked test movies per user
liked_movies_test = {}
for user in df_user_movie_matrix_test.index:
    liked_movies_test[user] = df_user_movie_matrix_test.columns[df_user_movie_matrix_test.loc[user] == 1].tolist()

In [None]:
## wich users have empty liked movies
empty_liked_movies_test = []
for user in liked_movies_test:
    if len(liked_movies_test[user]) == 0:
        empty_liked_movies_test.append(user)

In [None]:
# combine both lists and unique them
empty_liked_movies = list(set(empty_liked_movies_train + empty_liked_movies_test))

#remove empty liked movies from both dictionaries
for user in empty_liked_movies:
    liked_movies_train.pop(user, None)
    liked_movies_test.pop(user, None)

In [None]:
# get name of movies in dictionary
liked_movies_train_name = {}
for user in liked_movies_train:
    liked_movies_train_name[user] = df_movies[df_movies["movieId"].isin(liked_movies_train[user])]["title"].tolist()

In [None]:
# get name of movies in dictionary
liked_movies_test_name = {}
for user in liked_movies_test:
    liked_movies_test_name[user] = df_movies[df_movies["movieId"].isin(liked_movies_test[user])]["title"].tolist()

In [None]:
from p import tfidf_rec_user
testing_movie = tfidf_rec_user(df_movies_nlp, liked_movies_train_name, cols)

In [None]:
## check per user if movie is in recommended movies
hits = 0
precision = []

for key, values in liked_movies_test_name.items():
    for value in values:
        ## check if value is in recommended movies and at which position
        if value in testing_movie[key]:
            precision.append(testing_movie[key].index(value))

print(precision)

In [None]:
from p import count

test = count(precision)

plt.bar(test.keys(), test.values(), color='steelblue')
plt.show()

In [None]:
from p import recommendations_tfidf_final
print(recommendations_tfidf_final, ["Fast & Furious 6", "Toy Story 3"], cols)

Statistische Auswertungen
Wie viele Recommendations sind gut?

HIT, Precision