# Content-based Filtering

### Loading Libraries

In [7]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import warnings
import numpy as np
warnings.filterwarnings('ignore')

### Loading Datasets

In [8]:
movies = pd.read_csv('../data/ml-25m/movies.csv')
movies_with_genres = pd.read_csv('../data/movies_with_genres.csv')
movies_with_year = pd.read_csv('../data/movies_with_year.csv', sep=';')
print(movies.columns)

Index(['movieId', 'title', 'genres'], dtype='object')


### Titel bereinigen

In [9]:
# Extrahiere das Erscheinungsjahr aus dem Titel und füge es als neue Spalte hinzu
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)')

# Entferne das Erscheinungsjahr aus dem Titel
movies['title'] = movies['title'].str.replace(r'\s*\(\d{4}\)', '', regex=True)

movies without year

In [10]:
# Filme ohne Erscheinungsjahr ausgeben
movies_without_year = movies[movies['year'].isna()]


print("the number of movies without year is: ", movies['year'].isna().sum())

# Füge die fehlenden Jahre aus der neuen CSV-Datei ein, basierend auf den Titeln
movies['year'] = movies.apply(
    lambda row: movies_with_year[movies_with_year['title'] == row['title']]['year'].values[0]
    if pd.isnull(row['year']) and row['title'] in movies_with_year ['title'].values
    else row['year'],
    axis=1
)

# Filme ohne Erscheinungsjahr ausgeben
movies_without_year1 = movies[movies['year'].isna()]

print("the number of movies without year is: ", movies['year'].isna().sum())

movies_without_year1.to_csv('movies_without_year1.csv', index=False)
print("Movies without year saved to 'movies_without_year1.csv'")


# Entferne Filme ohne Erscheinungsjahr
movies = movies.dropna(subset=['year'])
movies.isnull().sum()

print(movies.head(5))

the number of movies without year is:  410
the number of movies without year is:  83
Movies without year saved to 'movies_without_year1.csv'
   movieId                        title  \
0        1                    Toy Story   
1        2                      Jumanji   
2        3             Grumpier Old Men   
3        4            Waiting to Exhale   
4        5  Father of the Bride Part II   

                                        genres  year  
0  Adventure|Animation|Children|Comedy|Fantasy  1995  
1                   Adventure|Children|Fantasy  1995  
2                               Comedy|Romance  1995  
3                         Comedy|Drama|Romance  1995  
4                                       Comedy  1995  


### Genres aufbereiten


In [11]:
print("Anzahl der Filme mit '(no genres listed)':", movies[movies['genres'] == '(no genres listed)'].shape[0])

# Zusammenführen der DataFrames basierend auf den Titeln und Jahren
movies_updated = pd.merge(movies, movies_with_genres[['title', 'year', 'genres']], on=['title', 'year'], how='left', suffixes=('', '_new'))

# Aktualisieren der Genres im movies DataFrame nur für die übereinstimmenden Einträge
movies_updated['genres'] = movies_updated['genres_new'].combine_first(movies_updated['genres'])

# Entferne die temporäre Spalte
movies_updated = movies_updated.drop(columns=['genres_new'])
movies = movies_updated

print("Anzahl der Filme mit '(no genres listed)':", movies[movies['genres'] == '(no genres listed)'].shape[0])

# Entferne "(no genres listed)" aus der Genre-Liste
movies['genres'] = movies['genres'].replace('(no genres listed)', '')

# Trenne die Genres in separate Listen
genre_list = movies['genres'].str.split('|')

# Finde alle einzigartigen Genres
all_genres = set(genre for sublist in genre_list for genre in sublist if genre)

# Erstelle für jedes Genre eine Spalte und fülle sie mit binären Werten
for genre in all_genres:
    movies[genre] = movies['genres'].apply(lambda x: int(genre in x.split('|')))

# Entferne die ursprüngliche 'genres' Spalte
movies = movies.drop(columns=['genres'])

print(movies.head(1))

Anzahl der Filme mit '(no genres listed)': 5002
Anzahl der Filme mit '(no genres listed)': 1257
   movieId                        title  year  Sport  Western  Drama  Comedy  \
0        1                    Toy Story  1995      0        0      0       1   
1        2                      Jumanji  1995      0        0      0       0   
2        3             Grumpier Old Men  1995      0        0      0       1   
3        4            Waiting to Exhale  1995      0        0      1       1   
4        5  Father of the Bride Part II  1995      0        0      0       1   

   Sci-Fi  Horror  Documentary  ...  Fantasy  Reality-TV  Thriller  War  \
0       0       0            0  ...        1           0         0    0   
1       0       0            0  ...        1           0         0    0   
2       0       0            0  ...        0           0         0    0   
3       0       0            0  ...        0           0         0    0   
4       0       0            0  ...        0    

### Nach Null-Werten suchen

In [12]:
movies.info()
movies.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62340 entries, 0 to 62339
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   movieId      62340 non-null  int64 
 1   title        62340 non-null  object
 2   year         62340 non-null  object
 3   Sport        62340 non-null  int64 
 4   Western      62340 non-null  int64 
 5   Drama        62340 non-null  int64 
 6   Comedy       62340 non-null  int64 
 7   Sci-Fi       62340 non-null  int64 
 8   Horror       62340 non-null  int64 
 9   Documentary  62340 non-null  int64 
 10  Crime        62340 non-null  int64 
 11  Film-Noir    62340 non-null  int64 
 12  Adventure    62340 non-null  int64 
 13  Musical      62340 non-null  int64 
 14  Biography    62340 non-null  int64 
 15  Animation    62340 non-null  int64 
 16  Mystery      62340 non-null  int64 
 17  History      62340 non-null  int64 
 18  Music        62340 non-null  int64 
 19  Children     62340 non-nu

movieId        0
title          0
year           0
Sport          0
Western        0
Drama          0
Comedy         0
Sci-Fi         0
Horror         0
Documentary    0
Crime          0
Film-Noir      0
Adventure      0
Musical        0
Biography      0
Animation      0
Mystery        0
History        0
Music          0
Children       0
Romance        0
News           0
Fantasy        0
Reality-TV     0
Thriller       0
War            0
Family         0
Short          0
Adult          0
Game-Show      0
IMAX           0
Action         0
dtype: int64

### Feature Set: Genre

In [13]:
def normalize_title(title):
    articles = ['the', 'a', 'an']
    words = title.strip().split()
    if words[-1].strip(",").lower() in articles:
        return title.strip().lower()
    if words[0].lower() in articles:
        return ', '.join(words[1:]) + ', ' + words[0].capitalize()
    return title.lower()

def alternate_title_format(title):
    articles = ['the', 'a', 'an']
    words = title.strip().split()
    if words[0].lower() in articles:
        return ', '.join(words[1:]) + ', ' + words[0].capitalize()
    if words[-1].strip(",").lower() in articles:
        return words[-1].capitalize() + ' ' + ' '.join(words[:-1]).replace(',', '')
    return title.lower()

def get_movie_features(movies, title, year):
    # Bereinigen und Normalisieren des Titels
    normalized_title = normalize_title(title)
    alternate_title = alternate_title_format(normalized_title)

    
    # Normalisieren der Titelspalte des DataFrames
    movies['title'] = movies['title'].str.strip().str.lower()
    
    # Sicherstellen, dass die Jahr-Spalte als numerisch behandelt wird
    movies['year'] = pd.to_numeric(movies['year'], errors='coerce')

    # Filtern der Filme nach Titel und Jahr
    filtered_movies = movies[((movies['title'] == normalized_title.lower()) | (movies['title'] == alternate_title.lower())) & (movies['year'] == year)]
    if not filtered_movies.empty:
        return filtered_movies.iloc[0]
    else:
        print(f"{title} ({year}) not found")
        return None


def recommend_movies(user_rated_movies, cinema_movies):
    
    # Schritt 1: Extrahieren der Merkmale der vom Nutzer bewerteten Filme
    user_movie_features = []
    has_positive_ratings = any(movie['rating'] >= 2.5 for movie in user_rated_movies)

    for movie in user_rated_movies:
        if has_positive_ratings and movie['rating'] >= 2.5:
            features = get_movie_features(movies, movie['title'], movie['year'])
            if features is not None:
                user_movie_features.append(features.drop(['movieId', 'title', 'year']).values)
        elif not has_positive_ratings and movie['rating'] < 2.5:
            features = get_movie_features(movies, movie['title'], movie['year'])
            if features is not None:
                user_movie_features.append(features.drop(['movieId', 'title', 'year']).values)
    
    # Schritt 2: Feature-Vektorisierung
    if not user_movie_features:
        raise ValueError("Keine positiv bewerteten Filme vorhanden.")
    
    user_profile = np.mean(user_movie_features, axis=0)
    
    # Schritt 3: Ähnlichkeitsberechnung
    kino_movie_features = []
    for kino_movie in cinema_movies:
        features = get_movie_features(movies, kino_movie['title'], kino_movie['year'])
        if features is not None:
            kino_movie_features.append(features.drop(['movieId', 'title', 'year']).values)
    
    if not kino_movie_features:
        raise ValueError("Keine Kino-Filme mit passenden Features gefunden.")
    
    similarities = cosine_similarity([user_profile], kino_movie_features)[0]
    
    # Schritt 4: Sortierung und Ausgabe
    cinema_movies_with_similarity = []
    for i, kino_movie in enumerate(cinema_movies):
        kino_movie_with_similarity = kino_movie.copy()
        kino_movie_with_similarity['similarity'] = similarities[i]
        cinema_movies_with_similarity.append(kino_movie_with_similarity)
    
    if has_positive_ratings:
        sorted_cinema_movies = sorted(cinema_movies_with_similarity, key=lambda x: x['similarity'], reverse=True)
    else:
        sorted_cinema_movies = sorted(cinema_movies_with_similarity, key=lambda x: x['similarity'])
    
    return sorted_cinema_movies

### Test

In [14]:
# Beispiel-Daten
user_rated_movies = [
    {"title": "Mad Max: Fury Road", "year": 2015, "rating": 0.5},
    {"title": "Pride and Prejudice", "year": 2003, "rating": 1.5},
    {"title": "La La Land", "year": 2016, "rating": 2.0}
]
cinema_movies = [
    {"title": "Black Panther", "year": 2017},
    {"title": "Avengers: Infinity War - Part I", "year": 2018},
    {"title": "A Star Is Born", "year": 2018},
    {"title": "Bohemian Rhapsody", "year": 2018},
    {"title": "Deadpool 2", "year": 2018},
    {"title": "Spider-Man: Into the Spider-Verse", "year": 2018},
    {"title": "Crazy Rich Asians", "year": 2018},
    {"title": "Mission: Impossible - Fallout", "year": 2018},
    {"title": "Solo: A Star Wars Story", "year": 2018},
    {"title": "Aquaman", "year": 2018},
    {"title": "The Meg", "year": 2018},
    {"title": "Ready Player One", "year": 2018},
    {"title": "Jurassic World: Fallen Kingdom", "year": 2018}
]

# Funktion aufrufen
recommended_cinema_movies = recommend_movies(user_rated_movies, cinema_movies)
for movie in recommended_cinema_movies:
    print(f"Title: {movie['title']}, Year: {movie['year']}, Similarity: {movie['similarity']:.4f}")

Title: Bohemian Rhapsody, Year: 2018, Similarity: 0.2294
Title: Aquaman, Year: 2018, Similarity: 0.3974
Title: Crazy Rich Asians, Year: 2018, Similarity: 0.4588
Title: The Meg, Year: 2018, Similarity: 0.4588
Title: A Star Is Born, Year: 2018, Similarity: 0.4867
Title: Ready Player One, Year: 2018, Similarity: 0.5298
Title: Spider-Man: Into the Spider-Verse, Year: 2018, Similarity: 0.5735
Title: Solo: A Star Wars Story, Year: 2018, Similarity: 0.5735
Title: Black Panther, Year: 2017, Similarity: 0.6623
Title: Avengers: Infinity War - Part I, Year: 2018, Similarity: 0.6623
Title: Deadpool 2, Year: 2018, Similarity: 0.6623
Title: Mission: Impossible - Fallout, Year: 2018, Similarity: 0.6623
Title: Jurassic World: Fallen Kingdom, Year: 2018, Similarity: 0.7182
