In [1]:
import pandas as pd


In [6]:
# Load the raw TMDB movie CSV (path is relative to the working directory).
df_movies = pd.read_csv("../data/TMDB_movie_dataset_v11.csv")

In [7]:
# Load the raw TMDB TV-show CSV (path is relative to the working directory).
df_shows = pd.read_csv("../data/TMDB_tv_dataset_v3.csv")

Movie dataset preprocessing

In [8]:
# Index the movies by their "id" column. The guard plus reassignment (rather
# than inplace=True) lets this cell be re-run: once "id" has become the index
# it is no longer a column, and calling set_index("id") again raises KeyError.
if "id" in df_movies.columns:
    df_movies = df_movies.set_index("id")

In [9]:
# Give the movie frame the same country column name the show frame uses
# ("origin_country"). rename() returns a new frame; reassigning avoids
# mutating the loaded frame in place.
df_movies = df_movies.rename(columns={"production_countries": "origin_country"})

In [10]:
# Every column listed here is used by the model, so a row that is missing any
# of them cannot be filled with a default value and is dropped instead.
drop_na = [
    "title",
    "vote_average",
    "vote_count",
    "popularity",
    "genres",
    "overview",
    "keywords",
]
df_movies = df_movies.dropna(subset=drop_na)

In [11]:
def determine_type(row):
    """Classify a movie row as 'Anime' or 'Classic'.

    A row is 'Anime' only when its ``origin_country`` value is exactly the
    string 'Japan' and 'Animation' occurs in its ``genres`` value (a
    substring test for a string, a membership test for a list). Every other
    row, including one with missing values, is 'Classic'.

    Args:
        row: Object exposing ``.get(key)``, e.g. the row handed over by
            ``DataFrame.apply(..., axis=1)`` or a plain dict.

    Returns:
        str: 'Anime' or 'Classic'.
    """
    origin = row.get('origin_country')
    genres = row.get('genres')

    # Missing cells show up as None or float NaN. Neither can match, and
    # testing `'Animation' in <float>` would raise TypeError, so bail out
    # before the comparison.
    if not isinstance(origin, str):
        return 'Classic'
    if not isinstance(genres, (str, list, tuple, set, frozenset)):
        return 'Classic'

    # Exact match on the country name: a multi-country value such as
    # 'Japan, France' does not count as Japanese here.
    if origin == 'Japan' and 'Animation' in genres:
        return 'Anime'
    return 'Classic'

# Create the new "content_category" column by classifying every movie row
df_movies['content_category'] = df_movies.apply(determine_type, axis=1)

In [12]:
def normalize_genres(genres):
    """Split a comma-separated genre string into a list of genre names.

    Each piece is stripped of surrounding whitespace and empty pieces
    (from a trailing or doubled comma) are dropped. A value that is not a
    string is returned unchanged, so applying the function to an
    already-split list is a no-op.

    Args:
        genres: Raw genre value; normally a string such as "Action, Drama".

    Returns:
        A list of genre names for string input, otherwise ``genres`` itself.
    """
    if not isinstance(genres, str):
        return genres
    stripped = (piece.strip() for piece in genres.split(","))
    return [name for name in stripped if name]
# Replace each movie's raw genre string with a list of genre names.
df_movies["genres"] = df_movies["genres"].apply(normalize_genres)

In [None]:
# Show the movies that were classified as Anime.
df_movies.loc[df_movies["content_category"] == "Anime"]

In [14]:
# Columns kept from the movie data for the merged feature table.
movie_features = [
    "title",
    "vote_average",
    "vote_count",
    "popularity",
    "keywords",
    "genres",
    "overview",
    "origin_country",
    "content_category",
]
df_movie_features = df_movies[movie_features]

Show dataset preprocessing

In [15]:
# Index the shows by their "id" column. The guard plus reassignment (rather
# than inplace=True) lets this cell be re-run: once "id" has become the index
# it is no longer a column, and calling set_index("id") again raises KeyError.
if "id" in df_shows.columns:
    df_shows = df_shows.set_index("id")

In [None]:
# Inspect the column names available in the show data.
df_shows.columns

In [17]:
# The show data calls its title column "name"; rename it to "title" so it
# lines up with the movie frame. rename() returns a new frame; reassigning
# avoids mutating the loaded frame in place.
df_shows = df_shows.rename(columns={"name": "title"})

In [18]:
# Columns that must be present in a show row for it to be kept.
drop_na = [
    "title",
    "vote_average",
    "vote_count",
    "popularity",
    "genres",
    "overview",
]

In [19]:
# Drop every show row that has a missing value in any of the drop_na columns.
df_shows = df_shows.dropna(subset=drop_na)

In [20]:
import re

def clean_genres(genre_str):
    """Turn a raw genre value into a list of clean genre names.

    A string is split on commas and on '&' (so "Action & Adventure" yields
    two genres); each piece is stripped and empty pieces are dropped.
    A list or tuple is cleaned element by element, so calling the function
    on its own output gives the same list back instead of discarding it.
    Any other value (None, NaN, ...) yields an empty list.

    Args:
        genre_str: Raw genre value, normally a string such as
            "Action & Adventure, Drama".

    Returns:
        list[str]: The individual genre names.
    """
    # Already-split input: clean each element and flatten the results.
    if isinstance(genre_str, (list, tuple)):
        return [name for item in genre_str for name in clean_genres(item)]

    if not isinstance(genre_str, str):
        return []

    # Treat " & " as just another separator between genres.
    comma_separated = re.sub(r'\s*&\s*', ', ', genre_str)

    return [piece.strip() for piece in comma_separated.split(",") if piece.strip()]

In [21]:
# Replace each show's raw genre value with a list of genre names.
df_shows["genres"] = df_shows["genres"].apply(clean_genres)

In [22]:
def determine_type(row):
    """Classify a show row as 'Anime' or 'Classic'.

    NOTE: this redefines the ``determine_type`` from the movie section of
    this notebook; this version compares against the country code 'JP'
    rather than the country name 'Japan'.

    A row is 'Anime' only when its ``origin_country`` value is exactly the
    string 'JP' and 'Animation' occurs in its ``genres`` value. Every other
    row, including one with missing values, is 'Classic'.

    Args:
        row: Object exposing ``.get(key)``, e.g. the row handed over by
            ``DataFrame.apply(..., axis=1)`` or a plain dict.

    Returns:
        str: 'Anime' or 'Classic'.
    """
    origin = row.get('origin_country')
    genres = row.get('genres')

    # Missing cells show up as None or float NaN. Neither can match, and
    # testing `'Animation' in <float>` would raise TypeError, so bail out
    # before the comparison.
    if not isinstance(origin, str):
        return 'Classic'
    if not isinstance(genres, (str, list, tuple, set, frozenset)):
        return 'Classic'

    # Exact match on the country code: a multi-country value such as
    # 'JP, US' does not count as Japanese here.
    if origin == 'JP' and 'Animation' in genres:
        return 'Anime'
    return 'Classic'

# Create the new "content_category" column by classifying every show row
df_shows['content_category'] = df_shows.apply(determine_type, axis=1)

In [23]:
# Columns kept from the show data for the merged feature table.
show_features = [
    "title",
    "vote_average",
    "vote_count",
    "popularity",
    "genres",
    "overview",
    "origin_country",
    "content_category",
]
df_show_features = df_shows[show_features]

In [None]:
# Inspect the genres column of the selected show features.
df_show_features["genres"]

Merge 2 feature datasets

In [None]:
# Tag every row with its source before stacking. assign() returns a new frame,
# so the column-subset frames are never written to in place (the pattern that
# triggers pandas' SettingWithCopyWarning) and the cell can be re-run safely.
df_show_features = df_show_features.assign(type="TV")
df_movie_features = df_movie_features.assign(type="Movie")

# Rows are stacked as-is, each frame keeping its own "id" index. The show
# features have no "keywords" column, so that column is NaN for show rows.
# NOTE(review): movie and show ids may collide in the merged index; confirm
# whether downstream code needs unique labels.
merged_feature_df = pd.concat([df_movie_features, df_show_features], axis=0)

In [72]:
# Save the merged feature table in ../data, the directory the input CSVs
# are read from.
merged_feature_df.to_pickle("../data/merged_feature_df.pkl")