The goal of this notebook is to experiment with filtering the reviewed entries on the original
datasets (e.g. movies on MovieLens) based on their genres and other characteristics.

In [1]:
import polars as pl

MOVIE_LENS_DATA = "../data/original_datasets/MovieLens.csv"

In [2]:
df = pl.read_csv(MOVIE_LENS_DATA)
df = df.select(["title", "genre", "runtime_minutes", "year"]).unique("title")
df

title,genre,runtime_minutes,year
str,str,str,str
"""Third Man, The (1949)""","""Mystery|Thriller""","""Long""","""40s"""
"""Horse Whisperer, The (1998)""","""Drama""","""Long""","""90s"""
"""All Dogs Go to Heaven 2 (1996)""","""Animation|Children's|Musical""","""Long""","""90s"""
"""Birdy (1984)""","""Drama|War""","""Long""","""80s"""
"""Stir of Echoes (1999)""","""Thriller""","""Long""","""90s"""
…,…,…,…
"""What Dreams May Come (1998)""","""Drama|Romance""","""Long""","""90s"""
"""Pocahontas (1995)""","""Animation|Children's|Musical|R…","""Long""","""90s"""
"""Beyond Silence (1996)""","""Drama""","""Long""","""90s"""
"""Outside Ozona (1998)""","""Drama|Thriller""","""Long""","""90s"""


In [3]:
#pattern = r"\((\d{4})\)"
#df = df.with_columns(
#    pl.col("title").str.extract(pattern, 1).cast(pl.Int64).alias("year")
#)
#df = df.with_columns(
#    pl.col("title").str.replace(r"\s*\(\d{4}\)", "").alias("title")
#)
#df

In [4]:
def filter_movies(df: pl.DataFrame, attributes: dict[str, str]) -> pl.DataFrame:
    filtered_df = df
    if "year" in attributes:
        filtered_df = filtered_df.filter(pl.col("year") == attributes["year"])
    if "runtime_minutes" in attributes:
        filtered_df = filtered_df.filter(pl.col("runtime_minutes") == attributes["runtime_minutes"])
    if "genre" in attributes:
        filtered_df = filtered_df.filter(pl.col("genre").str.contains(attributes["genre"]))
    return filtered_df

filtered_df = filter_movies(df, {"genre": "Comedy", "year": "90s"})
filtered_df.select(["title", "genre"])

title,genre
str,str
"""Emma (1996)""","""Comedy|Drama|Romance"""
"""Mickey Blue Eyes (1999)""","""Comedy|Romance"""
"""Patch Adams (1998)""","""Comedy|Drama"""
"""It's in the Water (1998)""","""Comedy"""
"""Waterboy, The (1998)""","""Comedy"""
…,…
"""Mystery Men (1999)""","""Action|Adventure|Comedy"""
"""Almost Heroes (1998)""","""Adventure|Comedy"""
"""Lay of the Land, The (1997)""","""Comedy|Drama"""
"""Smoke Signals (1998)""","""Comedy|Drama"""


In [5]:
BOOK_CROSSING_DATA = "../data/original_datasets/BookCrossing.csv"

columns_of_interest = ["book_title", "book_author", "year_of_publication", "language", "category", "country"]
df = pl.read_csv(BOOK_CROSSING_DATA).unique("book_id").select(columns_of_interest)
df

book_title,book_author,year_of_publication,language,category,country
str,str,str,str,str,str
"""The Ghost at Dawn's House (Bab…","""Ann M. Martin""","""90s""","""en""","""Juvenile Fiction""","""usa"""
"""Prevention's Healing With Vita…","""Alice Feinstein""","""90s""","""en""","""other""","""usa"""
"""The Russia House""","""John Le Carre""","""2000s""","""en""","""Fiction""","""australia"""
"""Kutath (The Faded Sun, Book 3)""","""C.J. Cherryh""","""90s""","""en""","""other""","""usa"""
"""The Best American Crime Writin…","""OTTO PENZLER""","""2000s""","""en""","""Social Science""","""usa"""
…,…,…,…,…,…
"""Making the words stand still""","""Donald E Lyman""","""80s""","""en""","""other""","""usa"""
"""Les Adieux Ã?Â la reine - Pri…","""Chantal Thomas""","""2000s""","""fr""","""other""","""switzerland"""
"""The Apocrypha: An American Tra…","""Edgar Johnson Goodspeed""","""80s""","""en""","""Religion""","""usa"""
"""The Penguin Book of Australian…","""Harry Heseltine""","""90s""","""en""","""Fiction""","""australia"""


In [6]:
def filter_books(df: pl.DataFrame, attributes: dict[str, str]) -> pl.DataFrame:
    filtered_df = df
    if "year_of_publication" in attributes:
        filtered_df = filtered_df.filter(pl.col("year_of_publication") == attributes["year_of_publication"])
    if "language" in attributes:
        filtered_df = filtered_df.filter(pl.col("language") == attributes["language"])
    if "country" in attributes:
        filtered_df = filtered_df.filter(pl.col("country") == attributes["country"])
    if "category" in attributes:
        if attributes["category"] == "body-mind-and-spirit":
            filtered_df = filtered_df.filter(pl.col("category") == "Body, Mind & Spirit")
        else:
            filtered_df = filtered_df.filter(pl.col("category") == attributes["category"])
    return filtered_df

filtered_df = filter_books(df, {"year_of_publication": "90s", "language": "en", "category": "body-mind-and-spirit"})
filtered_df

book_title,book_author,year_of_publication,language,category,country
str,str,str,str,str,str
"""Ancient Ways: Reclaiming Pagan…","""Pauline Campanelli""","""90s""","""en""","""Body, Mind & Spirit""","""canada"""
"""Hidden Mysteries: Ets, Ancient…","""Joshua D. Stone""","""90s""","""en""","""Body, Mind & Spirit""","""united kingdom"""
"""The Complete Illustrated Guide…","""Inge Dougans""","""90s""","""en""","""Body, Mind & Spirit""","""usa"""
"""Astrology for the Millions (Ll…","""Lewi Grant""","""90s""","""en""","""Body, Mind & Spirit""","""usa"""
"""When Time Began: : Book V of t…","""Zecharia Sitchin""","""90s""","""en""","""Body, Mind & Spirit""","""usa"""
…,…,…,…,…,…
"""Abduction: Human Encounters Wi…","""John E. Mack""","""90s""","""en""","""Body, Mind & Spirit""","""usa"""
"""Day Of Deception""","""John C. Hagee""","""90s""","""en""","""Body, Mind & Spirit""","""usa"""
"""The Sorcerers' Crossing (Arkan…","""Taisha Abelar""","""90s""","""en""","""Body, Mind & Spirit""","""usa"""
"""Enochian Magic: A Practical Ma…","""Gerald J. Schueler""","""90s""","""en""","""Body, Mind & Spirit""","""romania"""


In [22]:
import pathlib
import polars as pl


class EntryBrowser:
    __DATA_PATH = pathlib.Path("../data/original_datasets/")
    __MOVIE_DATA = __DATA_PATH / "MovieLens.csv"
    __BOOK_DATA = __DATA_PATH / "BookCrossing.csv"


    movie_df: pl.DataFrame
    book_df: pl.DataFrame


    def __init__(self):
        self.movie_df = pl.read_csv(EntryBrowser.__MOVIE_DATA) \
            .select(["title", "genre", "runtime_minutes", "year"]).unique("title")
        self.book_df = pl.read_csv(EntryBrowser.__BOOK_DATA).unique("book_id") \
            .select(["book_title", "book_author", "year_of_publication", "language", "category", "country"])
 

    def filter_movies(self, attributes: dict[str, str]) -> pl.DataFrame:
        df = self.movie_df
        if "year" in attributes:
            df = df.filter(pl.col("year") == attributes["year"])
        if "runtime_minutes" in attributes:
            df = df.filter(pl.col("runtime_minutes") == attributes["runtime_minutes"])
        if "genre" in attributes:
            df = df.filter(pl.col("genre").str.contains(attributes["genre"]))
        return df
    

    def movie_attributes(self) -> dict[str, list[str]]:
        df = self.movie_df
        cols = [c for c in df.columns if c != "title"]
        attributes = {}

        for c in cols:
            attributes[c] = df.unique(c)[c]

        genres = set()
        for genre_list in attributes["genre"]:
            genre_iterator: list[str] = genre_list.split("|")
            for g in genre_iterator:
                genres.add(g)
        attributes["genre"] = genres
        
        return attributes
    

    def filter_books(self, attributes: dict[str, str]) -> pl.DataFrame:
        df = self.book_df
        if "year_of_publication" in attributes:
            df = df.filter(pl.col("year_of_publication") == attributes["year_of_publication"])
        if "language" in attributes:
            df = df.filter(pl.col("language") == attributes["language"])
        if "country" in attributes:
            df = df.filter(pl.col("country") == attributes["country"])
        if "category" in attributes:
            if attributes["category"] == "body-mind-and-spirit":
                df = df.filter(pl.col("category") == "Body, Mind & Spirit")
            else:
                df = df.filter(pl.col("category") == attributes["category"])
        return df
    
    def book_attributes(self) -> dict[str, list[str]]:
        df = self.book_df.unique("book_title")
        cols = [c for c in df.columns if c != "book_title" and c != "book_author"]
        
        attributes = {}
        for c in cols:
            attributes[c] = df.unique(c)[c]
        return attributes

In [23]:
entry_browser = EntryBrowser()
entry_browser.movie_df.unique("year")

title,genre,runtime_minutes,year
str,str,str,str
"""Goofy Movie, A (1995)""","""Animation|Children's|Comedy|Ro…","""Long""","""90s"""
"""Peter Pan (1953)""","""Animation|Children's|Fantasy|M…","""Long""","""50s"""
"""Tigerland (2000)""","""Drama""","""Long""","""2000s"""
"""Farmer's Wife, The (1928)""","""Comedy""","""Long""","""20s"""
"""Cat Ballou (1965)""","""Comedy|Western""","""Long""","""60s"""
"""Dog's Life, A (1920)""","""Comedy""","""Short""","""10s"""
"""Autopsy (Macchie Solari) (1975…","""Horror""","""Long""","""70s"""
"""Impact (1949)""","""Crime|Drama""","""Long""","""40s"""
"""Alligator (1980)""","""Action|Horror|Sci-Fi""","""Long""","""80s"""
"""Elstree Calling (1930)""","""Comedy|Musical""","""Long""","""30s"""


In [24]:
entry_browser.movie_attributes()

{'genre': {'Action',
  'Adventure',
  'Animation',
  "Children's",
  'Comedy',
  'Crime',
  'Documentary',
  'Drama',
  'Fantasy',
  'Film-Noir',
  'Horror',
  'Musical',
  'Mystery',
  'Romance',
  'Sci-Fi',
  'Thriller',
  'War',
  'Western'},
 'runtime_minutes': shape: (3,)
 Series: 'runtime_minutes' [str]
 [
 	"Long"
 	"Very Long"
 	"Short"
 ],
 'year': shape: (10,)
 Series: 'year' [str]
 [
 	"70s"
 	"20s"
 	"80s"
 	"10s"
 	"30s"
 	"60s"
 	"2000s"
 	"90s"
 	"50s"
 	"40s"
 ]}

In [29]:
def book_attributes(df: pl.DataFrame) -> dict[str, list[str]]:
    df = df.unique("book_title")
    cols = [c for c in df.columns if c != "book_title" and c != "book_author"]
    
    attributes = {}
    for c in cols:
        attributes[c] = df.unique(c)[c]
    return attributes

book_attributes(entry_browser.book_df)

{'year_of_publication': shape: (13,)
 Series: 'year_of_publication' [str]
 [
 	"10s"
 	"70s"
 	"1900s"
 	"30s"
 	"40s"
 	…
 	"20s"
 	"2000s"
 	"90s"
 	"1300s"
 	"80s"
 ],
 'language': shape: (9,)
 Series: 'language' [str]
 [
 	"pt"
 	"en"
 	"es"
 	"fr"
 	"it"
 	"other"
 	"nl"
 	"de"
 	"da"
 ],
 'category': shape: (30,)
 Series: 'category' [str]
 [
 	"Nature"
 	"Poetry"
 	"Health & Fitness"
 	"Juvenile Nonfiction"
 	"True Crime"
 	…
 	"Juvenile Fiction"
 	"Fiction"
 	"Self-Help"
 	"Religion"
 	"Business & Economics"
 ],
 'country': shape: (28,)
 Series: 'country' [str]
 [
 	"italy"
 	"spain"
 	"germany"
 	"canada"
 	"norway"
 	…
 	"austria"
 	"brazil"
 	"finland"
 	"romania"
 	"dominican republic"
 ]}