The goal of this notebook is to find a way to count the users that belong to a given group
in the original datasets, so that the areas in the treemaps can reflect those user counts.

In [31]:
from pathlib import Path

import polars as pl
from utils import mount_src

mount_src()

import hypothesis_row as hr

# Relative path of the directory holding the original dataset CSVs.
DATA_DIR = "../data/original_datasets/"
BOOK_CROSSING_DATA = f"{DATA_DIR}BookCrossing.csv"
MOVIE_LENS_DATA = f"{DATA_DIR}MovieLens.csv"
YELP_DATA = f"{DATA_DIR}Yelp.csv"

# Columns selected from each dataset to describe a user (the id plus its attributes).
BOOK_CROSSING_USER_ATTRIBUTES = ["user_id", "age"]
MOVIE_LENS_USER_ATTRIBUTES = [
    "user_id",
    "gender",
    "age",
    "occupation",
    "location",
]
YELP_USER_ATTRIBUTES = ["user_id", "gender"]

In [2]:
# Load the Book Crossing data and keep one row per distinct (user_id, age) pair.
# The column list comes from the shared constant so it cannot drift from the
# one used when the unique users are saved.
df = pl.read_csv(BOOK_CROSSING_DATA)
unique_users_df = df.select(BOOK_CROSSING_USER_ATTRIBUTES).unique()
unique_users_df

user_id,age
i64,str
26740,"""25-34"""
43677,"""25-34"""
40986,"""25-34"""
47135,"""35-44"""
37693,"""25-34"""
…,…
16547,"""50-55"""
71539,"""45-49"""
33741,"""25-34"""
65437,"""18-24"""


In [3]:
# Count the unique users per age value. Only the age column is read, which
# also avoids binding a global named `id` that would shadow the builtin.
groups = {}
for age in unique_users_df.get_column("age").to_list():
    groups[age] = groups.get(age, 0) + 1
groups

{'25-34': 37855,
 '35-44': 8305,
 '18-24': 5914,
 '>56': 3988,
 '<18': 2500,
 '50-55': 3247,
 '45-49': 2994}

So this works for the Book Crossing dataset.
That is encouraging but not conclusive, since this dataset has a single
user attribute (age).
Let's try it with the MovieLens dataset.

In [21]:
# Load the MovieLens data and keep one row per distinct combination of the
# user columns listed in MOVIE_LENS_USER_ATTRIBUTES.
df = pl.read_csv(MOVIE_LENS_DATA)
unique_users = df.select(MOVIE_LENS_USER_ATTRIBUTES).unique()
unique_users

user_id,gender,age,occupation,location
i64,str,str,str,str
5786,"""M""","""25-34""","""doctor-health care""","""WI"""
34,"""F""","""18-24""","""other""","""MA"""
5468,"""M""","""25-34""","""programmer""","""CA"""
1182,"""M""","""<18""","""K-12 student""","""CA"""
4652,"""F""","""25-34""","""executive-managerial""","""CA"""
…,…,…,…,…
5971,"""M""","""35-44""","""executive-managerial""","""MI"""
1172,"""F""","""25-34""","""other""","""IL"""
1448,"""F""","""25-34""","""clerical-admin""","""PA"""
917,"""M""","""50-55""","""doctor-health care""","""CA"""


In [22]:
def extract_unique_features(df: pl.DataFrame, col: str) -> list:
    """Return the distinct values found in column ``col`` of ``df`` as a list."""
    column_values = df.get_column(col)
    return column_values.unique().to_list()

# Distinct values of each MovieLens user attribute, in the same order as the names below.
occupations, genders, ages, locations = (
    extract_unique_features(unique_users, attribute)
    for attribute in ("occupation", "gender", "age", "location")
)

In [29]:
# Number of possible attribute combinations: the product of the number of
# distinct values observed for each user attribute.
len(occupations) * len(genders) * len(ages) * len(locations)

16464

Since there are so many possible combinations,
I'll try to get the user count for a group *ad hoc*.

In [42]:
# Load the MovieLens hypothesis rows and display the `parent` of one of them
# to see what a group description looks like.
# NOTE(review): HypothesisRow.df_to_list is defined outside this notebook;
# presumably it builds one HypothesisRow per row of h_df. Confirm.
# NOTE(review): index 13 looks like an arbitrary example row. Confirm.
h_df = pl.read_csv("../data/movie_lens.csv")
hs = hr.HypothesisRow.df_to_list(h_df)
hs[13].parent

{'age': '18-24',
 'gender': 'M',
 'genre': 'Drama',
 'occupation': 'college-grad student',
 'runtime_minutes': 'Long',
 'year': '90s'}

In [40]:
def get_count_for_attribute(df: pl.DataFrame, col: str, val: str) -> int:
    """Count the rows of ``df`` whose column ``col`` equals ``val``."""
    matches = pl.col(col) == val
    return df.filter(matches).height

get_count_for_attribute(unique_users, "gender", "F")

1709

In [54]:
from polars.exceptions import ColumnNotFoundError


def get_user_count(df: pl.DataFrame, group: dict) -> int:
    """
    Gets user count for a certain group.

    Keys of ``group`` that are not columns of ``df`` are ignored, so a group
    description may mix user attributes with keys that have no user column
    (the hypothesis row shown earlier also carries e.g. ``genre`` and ``year``).

    Args:
        df (pl.DataFrame): DataFrame containing unique users.

        group (dict): attribute dictionary for a group, mapping a column
            name to the value a user must have in that column.

    Returns:
        int: user count for that group.
    """
    user_columns = set(df.columns)
    filtered_df = df
    for k, v in group.items():
        # Skip keys without a matching column explicitly, rather than
        # waiting for polars to raise ColumnNotFoundError.
        if k not in user_columns:
            continue
        filtered_df = filtered_df.filter(pl.col(k) == v)
    return len(filtered_df)

get_user_count(unique_users, hs[13].parent)

371

In [56]:
# Manual cross-check: apply the user-attribute conditions of the group above by hand.
tmp_df = unique_users.filter(
    (pl.col("gender") == "M")
    & (pl.col("age") == "18-24")
    & (pl.col("occupation") == "college-grad student")
)
tmp_df

user_id,gender,age,occupation,location
i64,str,str,str,str
787,"""M""","""18-24""","""college-grad student""","""TX"""
2083,"""M""","""18-24""","""college-grad student""","""N/A"""
4949,"""M""","""18-24""","""college-grad student""","""MN"""
3069,"""M""","""18-24""","""college-grad student""","""OR"""
3844,"""M""","""18-24""","""college-grad student""","""OK"""
…,…,…,…,…
1081,"""M""","""18-24""","""college-grad student""","""N/A"""
3659,"""M""","""18-24""","""college-grad student""","""SC"""
3033,"""M""","""18-24""","""college-grad student""","""RI"""
5891,"""M""","""18-24""","""college-grad student""","""MN"""


In [64]:
# Check get_user_count on Book Crossing: the result should match the "35-44"
# entry of the per-age counts computed earlier.
# The unique-user frame is built once; the original cell also computed it in a
# standalone statement whose result was discarded.
book_df = pl.read_csv(BOOK_CROSSING_DATA)
get_user_count(
    book_df.select(BOOK_CROSSING_USER_ATTRIBUTES).unique(),
    {"age": "35-44"},
)

8305

In [63]:
# Display the distinct (user_id, age) pairs of the Book Crossing data.
book_df.select("user_id", "age").unique()

user_id,age
i64,str
65550,"""25-34"""
35687,"""25-34"""
54052,"""25-34"""
21056,"""25-34"""
16924,"""35-44"""
…,…
34470,"""25-34"""
59375,"""25-34"""
3569,"""18-24"""
48192,"""25-34"""


Since everything seems to be working, we'll now save the CSVs with unique users.

In [70]:
# Reload every dataset so this cell does not depend on frames left over from
# the exploratory cells above.
movie_df = pl.read_csv(MOVIE_LENS_DATA)
yelp_df = pl.read_csv(YELP_DATA)
book_df = pl.read_csv(BOOK_CROSSING_DATA)

# One row per distinct combination of the user columns of each dataset.
# NOTE(review): a user_id whose attributes differ between rows would appear
# more than once here and be counted twice. Confirm user_id is unique.
movie_users = movie_df.select(MOVIE_LENS_USER_ATTRIBUTES).unique()
yelp_users = yelp_df.select(YELP_USER_ATTRIBUTES).unique()
book_users = book_df.select(BOOK_CROSSING_USER_ATTRIBUTES).unique()

USER_COUNT_PATH = "../data/user_counts/"

# Make sure the output directory exists so the writes below do not fail on a
# fresh checkout where it has not been created yet.
Path(USER_COUNT_PATH).mkdir(parents=True, exist_ok=True)

movie_users.write_csv(USER_COUNT_PATH + "movie_lens.csv")
yelp_users.write_csv(USER_COUNT_PATH + "yelp.csv")
book_users.write_csv(USER_COUNT_PATH + "book_crossing.csv")