The goal of this notebook is to experiment with filtering the reviewed entries in the original
datasets (e.g. movies in MovieLens) based on their genres and other characteristics.

In [39]:
import polars as pl

# Relative path to the MovieLens CSV that the following cells load.
MOVIE_LENS_DATA = "../data/original_datasets/MovieLens.csv"

In [None]:
# Load MovieLens, keep only the columns used for filtering, and retain a
# single row per distinct title.
df = (
    pl.read_csv(MOVIE_LENS_DATA)
    .select(["title", "genre", "runtime_minutes", "year"])
    .unique("title")
)
df

In [None]:
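# Disabled cell: extracts a four-digit "(YYYY)" group from `title` into an Int64
# `year` column, then strips that "(YYYY)" suffix from the title. Presumably no
# longer needed because a `year` column is already selected above -- confirm.
#pattern = r"\((\d{4})\)"
#df = df.with_columns(
#    pl.col("title").str.extract(pattern, 1).cast(pl.Int64).alias("year")
#)
#df = df.with_columns(
#    pl.col("title").str.replace(r"\s*\(\d{4}\)", "").alias("title")
#)
#df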

In [None]:
def filter_movies(df: pl.DataFrame, attributes: dict[str, str]) -> pl.DataFrame:
    """Return the rows of ``df`` that satisfy every requested movie attribute.

    Args:
        df: Movie table. Only the columns named by recognised keys of
            ``attributes`` are read.
        attributes: Optional constraints keyed by column name. Recognised
            keys are ``"year"`` and ``"runtime_minutes"`` (compared with
            ``==``) and ``"genre"`` (matched with polars ``str.contains``,
            so the value is a pattern that may occur anywhere in the genre
            string). Any other key is ignored.

    Returns:
        The filtered frame, or ``df`` itself when no recognised key is given.

    Note:
        ``"year"`` is an equality test, so a decade label such as ``"90s"``
        is compared literally and is not expanded into a range of years.
    """
    filtered_df = df
    # Exact-match columns, applied in a fixed order.
    for column in ("year", "runtime_minutes"):
        if column in attributes:
            filtered_df = filtered_df.filter(pl.col(column) == attributes[column])
    # Genre is a containment match so one movie can match on any of its genres.
    if "genre" in attributes:
        filtered_df = filtered_df.filter(pl.col("genre").str.contains(attributes["genre"]))
    return filtered_df

# Demo: filter for genre "Comedy" and year "90s", then show title and genre.
# NOTE(review): filter_movies compares `year` with `==`, so "90s" has to equal
# the stored year value literally -- confirm this is the intended behaviour.
filtered_df = filter_movies(
    df,
    {"genre": "Comedy", "year": "90s"},
)
filtered_df.select(["title", "genre"])

In [None]:
# Relative path to the BookCrossing CSV.
BOOK_CROSSING_DATA = "../data/original_datasets/BookCrossing.csv"

# Columns kept for filtering; `book_id` is used only for de-duplication.
columns_of_interest = [
    "book_title",
    "book_author",
    "year_of_publication",
    "language",
    "category",
    "country",
]
# De-duplicate on `book_id` first, because that column is dropped by the select.
df = (
    pl.read_csv(BOOK_CROSSING_DATA)
    .unique("book_id")
    .select(columns_of_interest)
)
df

In [None]:
def filter_books(df: pl.DataFrame, attributes: dict[str, str]) -> pl.DataFrame:
    """Return the rows of ``df`` that equal every requested book attribute.

    Args:
        df: Book table. Only the columns named by recognised keys of
            ``attributes`` are read.
        attributes: Optional constraints keyed by column name. Recognised
            keys are ``"year_of_publication"``, ``"language"``, ``"country"``
            and ``"category"``; all are compared with ``==``. The category
            value ``"body-mind-and-spirit"`` is translated to the label
            ``"Body, Mind & Spirit"`` before comparison. Other keys are
            ignored.

    Returns:
        The filtered frame, or ``df`` itself when no recognised key is given.
    """
    result = df
    # Plain equality columns, in the order they are applied.
    for column in ("year_of_publication", "language", "country"):
        if column in attributes:
            result = result.filter(pl.col(column) == attributes[column])
    if "category" in attributes:
        requested = attributes["category"]
        # This slug is the only value rewritten to a different label.
        label = "Body, Mind & Spirit" if requested == "body-mind-and-spirit" else requested
        result = result.filter(pl.col("category") == label)
    return result

# Demo: filter on publication year "90s", language "en" and the
# "body-mind-and-spirit" category slug.
# NOTE(review): filter_books compares `year_of_publication` with `==`, so "90s"
# has to equal the stored value literally -- confirm this is intended.
filtered_df = filter_books(
    df,
    {
        "year_of_publication": "90s",
        "language": "en",
        "category": "body-mind-and-spirit",
    },
)
filtered_df

In [None]:
# Relative path to the Yelp CSV.
YELP_DATA = "../data/original_datasets/Yelp.csv"

# NOTE(review): `select([""])` requests a column whose name is the empty
# string; this looks like an unfinished placeholder -- confirm which columns
# are actually wanted here.
df = (
    pl.read_csv(YELP_DATA)
    .unique("business_id")
    .select([""])
)
df.columns