This notebook tries to represent each added group as its own row of a DataFrame.

In [2]:
import polars as pl
import utils
utils.mount_src()

from hypothesis_row import HypothesisRow
from group import Group


# Relative paths of the two MovieLens CSV files that are loaded with pl.read_csv.
DATA = "../data/movie_lens.csv"
USERS_DATA = "../data/user_counts/movie_lens.csv"

In [29]:
# Read both CSV files into polars frames and hand them to
# HypothesisRow.df_to_list, whose result is kept in `hs`.
df = pl.read_csv(DATA)
user_df = pl.read_csv(USERS_DATA)
hs = HypothesisRow.df_to_list(df, user_df)

In [None]:
def pipeline_dataframe(hs: list[HypothesisRow]) -> pl.DataFrame:
    """Build a DataFrame with one row per child of every hypothesis.

    Each child's ``attributes`` mapping becomes one row. Attributes that a
    child does not define are filled with ``None`` so that every row has the
    same set of keys.

    Args:
        hs: Hypotheses to flatten; each one is iterated through ``.children``
            and each child is read through ``.attributes``.

    Returns:
        A polars DataFrame whose columns are the union of all attribute
        names, ordered by first appearance.
    """
    rows: list[dict] = []
    # A dict is used as an insertion-ordered set: iterating a real set of
    # strings can yield a different order on every interpreter run, which
    # made the resulting column order unpredictable.
    columns: dict = {}

    for h in hs:
        for c in h.children:
            row = dict(c.attributes)
            rows.append(row)
            columns.update(dict.fromkeys(row))

    # Pad every row so that all rows carry the full set of keys.
    for row in rows:
        for col in columns:
            row.setdefault(col, None)

    # Scan all rows when inferring the schema, so a column that is None in
    # the leading rows still gets its type from the later rows.
    return pl.DataFrame(rows, infer_schema_length=None)
# Flatten the loaded hypotheses into a table; the bare name on the last line
# makes the notebook display it as the cell output.
pl_df = pipeline_dataframe(hs)
pl_df

In [None]:
import pandas as pd
import seaborn as sns
from matplotlib.colors import to_hex

# Work on a pandas copy of the polars frame.
df = pl_df.to_pandas()
# Distinct values across all columns: stack() flattens the frame into a single
# Series before unique() is taken.
unique_values = df.stack().unique()
# One colour from the "hsv" palette per distinct value.
palette = sns.color_palette("hsv", len(unique_values))
# Map each value to the CSS declaration that paints its cell background.
color_map = {
    val: f"background-color: {to_hex(color)}"
    for val, color in zip(unique_values, palette)
}
color_map

In [None]:
def style_nominal_values(color_map: dict, val: str):
    """Look up the CSS style registered for ``val``.

    Args:
        color_map: Mapping from cell value to a CSS declaration string.
        val: Cell value to style.

    Returns:
        The mapped CSS declaration, or an empty string when ``val`` has no
        entry in ``color_map``.
    """
    return color_map[val] if val in color_map else ""

# Colour every cell according to the global value -> CSS map.
# Styler.map replaces the deprecated Styler.applymap. The current color_map is
# bound as a default argument because the styling function only runs when the
# table is rendered, by which time the global name may have been rebound.
styled_df = df.style.map(
    lambda val, color_map=color_map: style_nominal_values(color_map, val)
)
styled_df

In [31]:
def generate_nominal_color_map(series: pd.Series) -> dict:
    """Assign one background colour to every distinct value of ``series``.

    Args:
        series: Column whose distinct values (as returned by ``unique()``)
            each receive a colour.

    Returns:
        Dict mapping each distinct value to a ``background-color`` CSS
        declaration, with colours drawn from seaborn's "hsv" palette.
    """
    categories = series.unique()
    colors = sns.color_palette("hsv", len(categories))

    styles = {}
    for category, rgb in zip(categories, colors):
        styles[category] = f"background-color: {to_hex(rgb)}"
    return styles


def style_nominal_values(color_map: dict, val: str):
    """Return the CSS declaration for a cell value.

    Args:
        color_map: Mapping from cell value to a CSS declaration string.
        val: Cell value being styled.

    Returns:
        ``color_map[val]`` when present, otherwise an empty string.
    """
    no_style = ""
    return color_map.get(val, no_style)


def style_ordinal_values(val, min_val, max_val):
    """Shade a cell blue according to where ``val`` lies in a value range.

    Args:
        val: Value of the cell being styled.
        min_val: Lower end of the range.
        max_val: Upper end of the range.

    Returns:
        A ``background-color`` CSS declaration taken from seaborn's light
        blue colormap at the normalised position of ``val``.
    """
    span = max_val - min_val
    # A range with min_val == max_val has zero span; use position 0.0 there
    # instead of dividing by zero.
    norm = (val - min_val) / span if span else 0.0
    color = sns.light_palette("blue", as_cmap=True)(norm)
    return f"background-color: {to_hex(color)}"

In [None]:
# Columns whose values are coloured with a palette built per column.
nominal_cols = ["genre", "gender", "location", "occupation"]

df = pl_df.to_pandas()
styled_df = df.style
for c in nominal_cols:
    color_map = generate_nominal_color_map(df[c])
    # Styler.map only records the function and calls it when the table is
    # rendered. Without binding color_map as a default argument, every lambda
    # would look up the loop variable at render time and use the palette of
    # the last column for all columns.
    styled_df = styled_df.map(
        lambda val, color_map=color_map: style_nominal_values(color_map, val),
        subset=c,
    )
styled_df

In [None]:
# Columns to sort by, in priority order.
sorting_opts = ["runtime_minutes", "age"]

# Fresh pandas copy of the polars table.
df = pl_df.to_pandas()

def sort_df(df: pd.DataFrame, sort_by: list[str]) -> pd.DataFrame:
    """Return ``df`` sorted by the given columns with missing values on top.

    Args:
        df: Frame to sort; it is not modified.
        sort_by: Column names to sort by, in priority order.

    Returns:
        A new, sorted DataFrame.
    """
    ordered = df.sort_values(sort_by, na_position="first")
    return ordered

# Display the frame ordered by the columns listed in sorting_opts.
sort_df(df, sorting_opts)

In [None]:
# Inspect the distinct labels present in the "age" column.
df['age'].unique()

In [58]:
def sort_age(ages: pd.Series) -> pd.Series:
    """Replace age-bracket labels with their rank.

    The ranks run from 0 for the ``None`` entry up to 7 for ``">56"``.
    The lookup is done with ``Series.map``.

    Args:
        ages: Series of age-bracket labels.

    Returns:
        The result of mapping ``ages`` through the label -> rank table.
    """
    ordered_labels = [
        None,
        "<18",
        "18-24",
        "25-34",
        "35-44",
        "45-49",
        "50-55",
        ">56",
    ]
    rank_of = {label: rank for rank, label in enumerate(ordered_labels)}
    return ages.map(rank_of)


def sort_runtime(runtimes: pd.Series) -> pd.Series:
    """Replace runtime labels with their rank.

    The table maps ``None`` to 0, ``"Short"`` to 1 and ``"Long"`` to 2.

    Args:
        runtimes: Series of runtime labels.

    Returns:
        The result of mapping ``runtimes`` through the rank table.
    """
    labels_in_order = (None, "Short", "Long")
    ranks = dict(zip(labels_in_order, range(len(labels_in_order))))
    return runtimes.map(ranks)


def sort_year(years: pd.Series) -> pd.Series:
    """Convert decade labels into integer sort keys.

    Missing or empty entries map to 0 and ``"2000s"`` maps to 10. Any other
    label has its ``"s"`` removed, is parsed as an integer and divided by
    ten (for example ``"90s"`` -> 9).

    Args:
        years: Series of decade labels.

    Returns:
        Series of integer keys, one per input element.

    Raises:
        ValueError: If a non-missing label is not an integer after removing
            ``"s"``.
    """
    def map_elements(x):
        # pd.isna is checked first: NaN is truthy and pd.NA cannot be used
        # in a boolean test, so neither is caught by `not x`. The `not x`
        # test still covers the empty string.
        if pd.isna(x) or not x:
            return 0
        if x == "2000s":
            return 10
        decade = int(x.replace("s", ""))
        return decade // 10
    return years.map(map_elements)

In [None]:
# Remove the limit on displayed rows so the whole frame is shown.
pd.set_option("display.max_rows", None)
# Order the rows by the "year" column, using sort_year as the key function.
df.sort_values(by="year", na_position="first", key=sort_year)

In [65]:
def sort_year_of_publication(pub_years: pd.Series) -> pd.Series:
    """Convert publication-period labels into integer sort keys.

    Missing or empty entries map to 0. Every other label has its ``"s"``
    removed and is parsed as an integer (for example ``"1990s"`` -> 1990).

    Args:
        pub_years: Series of publication-period labels.

    Returns:
        Series of integer keys, one per input element.

    Raises:
        ValueError: If a non-missing label is not an integer after removing
            ``"s"``.
    """
    def convert_elements(el) -> int:
        # pd.isna is checked first: NaN is truthy and pd.NA cannot be used
        # in a boolean test, so neither is caught by `not el`.
        if pd.isna(el) or not el:
            return 0
        return int(el.replace("s", ""))
    return pub_years.map(convert_elements)

In [None]:
# Rebuild the table from the Book-Crossing CSV files. This overwrites the
# notebook-level df, user_df and hs.
df = pl.read_csv("../data/book_crossing.csv")
user_df = pl.read_csv("../data/user_counts/book_crossing.csv")
hs = HypothesisRow.df_to_list(df, user_df)
df = pipeline_dataframe(hs).to_pandas()

sorting_opts = ["language", "age"]
#sorted = df.sort_values(by="year_of_publication", na_position="first", key=sort_year_of_publication)
# Display the columns named in sorting_opts first, followed by all other columns.
diff = df.columns.difference(sorting_opts)
df[sorting_opts + diff.to_list()]

In [68]:
def sort_fans(fans: pd.Series) -> pd.Series:
    """Replace popularity labels with their rank.

    The table maps ``None`` to 0, ``"unpopular"`` to 1, ``"semipopular"``
    to 2 and ``"popular"`` to 3.

    Args:
        fans: Series of popularity labels.

    Returns:
        The result of mapping ``fans`` through the rank table.
    """
    rank_by_label = {}
    for rank, label in enumerate([None, "unpopular", "semipopular", "popular"]):
        rank_by_label[label] = rank
    return fans.map(rank_by_label)

In [None]:
# Rebuild the table from the Yelp CSV files. This overwrites the
# notebook-level df, user_df and hs.
df = pl.read_csv("../data/yelp.csv")
user_df = pl.read_csv("../data/user_counts/yelp.csv")
hs = HypothesisRow.df_to_list(df, user_df)
df = pipeline_dataframe(hs).to_pandas()
# Display the rows ordered by the "fans" column, using sort_fans as the key function.
df.sort_values(by="fans", key=sort_fans)