In [None]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import os
# Step up from the notebook's folder to its parent (presumably the project
# root) so the relative "resources/..." and "reports/..." paths used below
# resolve. The guard makes the cell safe to re-run: once a "resources"
# directory is visible from the working directory we are already in place and
# must not climb one level higher again.
if not os.path.isdir("resources"):
    os.chdir(os.path.dirname(os.getcwd()))

import pandas as pd
from resources.constants import *

def density_of_transactions(num_transactions, num_customers, num_articles):
    """Return the ratio of transactions to all possible customer/article pairs.

    Args:
        num_transactions: Number of observed interactions (rows).
        num_customers: Number of distinct customers.
        num_articles: Number of distinct articles.

    Returns:
        ``num_transactions / (num_customers * num_articles)``.

    Raises:
        ZeroDivisionError: If either count of customers or articles is zero.
    """
    possible_pairs = num_customers * num_articles
    return num_transactions / possible_pairs

# Locations of the external benchmark datasets, relative to the working
# directory. They are assembled with os.path.join so the separator matches the
# host OS; hard-coded "\" separators only resolve on Windows.
HNM_DATASET_PATH = os.path.join(
    "resources", "other_datasets", "HnM_dataset", "transactions_train.csv"
)
NETFLIX_PRIZE_COMPETITION_PATH = os.path.join(
    "resources", "other_datasets", "Netflix_prize_competition"
)
RENT_THE_RUNWAY_DATASET_PATH = os.path.join(
    "resources", "other_datasets", "renttherunway_final_dataset", "renttherunway_final_data.json"
)

In [None]:
# Load the Rent the Runway dataset; lines=True parses the file as one JSON
# record per line.
rtr_df = pd.read_json(RENT_THE_RUNWAY_DATASET_PATH, lines=True)
# Show the first two rows as the cell output.
rtr_df.head(2)

In [None]:
# Summary statistics of the number of rows per user_id, reported at a custom
# set of percentiles.
rtr_df["user_id"].value_counts().describe(percentiles=[0.25, 0.5, 0.6, 0.68, 0.7, 0.75, 0.9, 0.95, 0.99])

In [None]:
from decimal import Decimal

# Size of the Rent the Runway data: total rows, distinct users, distinct items.
# NOTE(review): rows are counted as-is; unlike the H&M cell, repeated
# (user_id, item_id) pairs are not dropped first — confirm this is intended.
num_transactions = len(rtr_df)
num_customers = rtr_df["user_id"].nunique()
num_articles = rtr_df["item_id"].nunique()
print(num_articles, num_customers, num_transactions)

# Density of the user x item grid, reported in scientific notation.
rtr_density = density_of_transactions(
    num_transactions=num_transactions,
    num_customers=num_customers,
    num_articles=num_articles,
)
print(f"Density of transactions: {Decimal(rtr_density):.2E}, Number of users: {Decimal(num_customers)}, Number of outfits: {Decimal(num_articles)}, Number of orders: {Decimal(num_transactions)}")

In [None]:
# Load the H&M transactions CSV with pandas' default parsing options.
hnm_df = pd.read_csv(HNM_DATASET_PATH)

In [None]:
# Keep a single row per (article_id, customer_id) pair so repeated rows for the
# same pair count once. Rebinds hnm_df, so re-running the load cell is needed
# to get the unreduced frame back.
hnm_df = hnm_df.drop_duplicates(subset=["article_id", "customer_id"])

In [None]:
# Size of the (de-duplicated) H&M data: rows, distinct customers, distinct articles.
num_transactions = len(hnm_df)
num_customers = hnm_df["customer_id"].nunique()
num_articles = hnm_df["article_id"].nunique()
print(num_articles, num_customers, num_transactions)

# Density of the customer x article grid; shown as the cell output.
hnm_density = density_of_transactions(
    num_transactions=num_transactions,
    num_customers=num_customers,
    num_articles=num_articles,
)
hnm_density

In [None]:
from tqdm.notebook import tqdm

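# Netflix Prize competition: one-off preprocessing, kept commented out for reference.
# It concatenates the raw files, derives a movie_id column from the "<id>:" header
# rows and caches the result as netflix_df.gz, which is what the next cell loads.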

# data_paths = [os.path.join(NETFLIX_PRIZE_COMPETITION_PATH, file) for file in os.listdir(NETFLIX_PRIZE_COMPETITION_PATH)]
# netflix_df = pd.read_csv(data_paths[0], header=None, names=["customer_id", "rating", "date"])
# for path in tqdm(data_paths[1:]):
#     netflix_df = pd.concat([netflix_df, pd.read_csv(path, header=None, names=["customer_id", "rating", "date"])])
# tqdm.pandas()
# current_movie_id = -1

# def parse_dataset_movies(customer_id, rating, data):
#     global current_movie_id

#     if customer_id.endswith(":"):
#         current_movie_id = int(customer_id[:-1])
#         return None
#     else:
#         return current_movie_id

# netflix_df["movie_id"] = netflix_df.progress_apply(lambda row: parse_dataset_movies(row["customer_id"], row["rating"], row["date"]), axis=1)
# netflix_df.to_pickle(os.path.join(NETFLIX_PRIZE_COMPETITION_PATH, "netflix_df.gz"), compression="gzip")

In [None]:
# Load the cached Netflix frame and drop every row that contains a missing value.
# NOTE(review): read_pickle can execute arbitrary code while loading; only point
# it at a cache file you generated yourself.
netflix_df = pd.read_pickle(
    os.path.join(NETFLIX_PRIZE_COMPETITION_PATH, "netflix_df.gz"),
    compression="gzip",
).dropna()

In [None]:
# Number of rows per customer_id, most frequent first; shown as the cell output.
netflix_value_counts = netflix_df["customer_id"].value_counts()
netflix_value_counts

In [None]:
# Summary statistics of the per-customer row counts.
netflix_value_counts.describe()

In [None]:
# Size of the Netflix data: total rows, distinct customers, distinct movies.
num_ratings = len(netflix_df)
num_customers = netflix_df["customer_id"].nunique()
num_movies = netflix_df["movie_id"].nunique()
(num_movies, num_customers, num_ratings)

In [None]:
from decimal import Decimal
# Density of the customer x movie grid, reported in scientific notation.
netflix_density = density_of_transactions(
    num_transactions=num_ratings,
    num_customers=num_customers,
    num_articles=num_movies,
)
print(f"Density of transactions: {Decimal(netflix_density):.2E}, Number of movies: {Decimal(num_movies)}, Number of customers: {Decimal(num_customers)}, Number of ratings: {Decimal(num_ratings)}")

In [None]:
# Show the unrounded Netflix density value.
netflix_density

In [None]:

# Outfit metadata table.
outfits_df = pd.read_csv(OUTFITS_CSV_PATH, sep=CSV_SEPARATOR)

# Evaluated user-activity triplets with the orders from before 2020 appended,
# stacked into one frame with a fresh 0..n-1 index.
original_orders_df = pd.read_csv(ORIGINAL_ORDERS_CSV_PATH, sep=CSV_SEPARATOR)
triplets_df = pd.concat(
    [pd.read_csv(USER_ACTIVITY_TRIPLETS_CSV_PATH, sep=CSV_SEPARATOR), original_orders_df],
    ignore_index=True,
)

In [None]:
# Keep one row per (customer.id, outfit.id) pair; .copy() gives an independent
# frame so columns can be added to it later.
triplets_df = triplets_df.drop_duplicates(subset=["customer.id", "outfit.id"]).copy()

In [None]:
# Size of the rental data at individual-outfit level: rows, distinct
# customers, distinct outfits.
num_orders = len(triplets_df)
num_users = triplets_df["customer.id"].nunique()
num_outfits = triplets_df["outfit.id"].nunique()
(num_users, num_outfits, num_orders)

In [None]:
from decimal import Decimal

# Density of the customer x outfit grid, printed both in scientific notation
# and as the raw float.
vibrent_density = density_of_transactions(
    num_transactions=num_orders,
    num_customers=num_users,
    num_articles=num_outfits,
)
print(f"Density of transactions: {Decimal(vibrent_density):.2E} ({vibrent_density}), Number of users: {Decimal(num_users)}, Number of outfits: {Decimal(num_outfits)}, Number of orders: {Decimal(num_orders)}")

In [None]:
# Lookup table from outfit id to its group, taken from the outfits table.
outfit_group_dict = outfits_df.set_index("id")["group"].to_dict()

In [None]:
# Add each row's outfit group by looking up its outfit.id in outfit_group_dict;
# ids that are not keys of the dict map to NaN.
triplets_df["outfit_group"] = triplets_df["outfit.id"].map(outfit_group_dict)

In [None]:
# Collapse further to one row per (customer.id, outfit_group) pair, so the
# following cells measure density at group level instead of individual outfits.
triplets_df = triplets_df.drop_duplicates(subset=["customer.id", "outfit_group"]).copy()

In [None]:
# Size of the rental data at outfit-group level: rows, distinct customers,
# distinct groups. num_outfits now holds the number of groups.
num_orders = len(triplets_df)
num_users = triplets_df["customer.id"].nunique()
num_outfits = triplets_df["outfit_group"].nunique()
(num_users, num_outfits, num_orders)

In [None]:
# Density of the customer x outfit-group grid; overwrites the individual-level
# vibrent_density computed earlier.
vibrent_density = density_of_transactions(
    num_transactions=num_orders,
    num_customers=num_users,
    num_articles=num_outfits,
)
print(f"Density of transactions: {Decimal(vibrent_density):.2E} ({vibrent_density}), Number of users: {Decimal(num_users)}, Number of outfits: {Decimal(num_outfits)}, Number of orders: {Decimal(num_orders)}")

In [None]:
# Show the unrounded Rent the Runway density value.
rtr_density

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Density per dataset, hard-coded here (presumably copied from the outputs of
# the density cells in this notebook — update by hand if a dataset changes).
DENSITIES = {
    "H&M Fashion": 0.00019172855246088317,
    "Netflix Prize": 0.01177557662406687,
    "Clothing Rental Groups": 0.0011599473470511846,
    "Clothing Rental Individual": 0.0007687068402113402,
    "Rent The Runway": 0.0003117665293831097,
    "Goodreads": 1.6260964205629808e-05,
    "Amazon Fashion": 6.3e-06,
    "Book Rental": 3.4083453369004934e-05
}

# One row per dataset, densest first, with the dataset name as a regular
# "dataset" column and a fresh 0..n-1 index.
densities_df = (
    pd.DataFrame.from_dict(DENSITIES, orient="index", columns=["density"])
    .sort_values(by="density", ascending=False)
    .rename_axis("dataset")
    .reset_index()
)

# Bar chart of the dataset densities, in the order of densities_df.
ax = densities_df.plot(kind="bar", x="dataset", y="density", title="Density of datasets", color="skyblue", legend=False)
ax.set_ylabel("Density")
ax.set_xlabel("")
# Log-scaled y axis with fixed limits.
ax.set_yscale("log")
ax.set_ylim(1e-6, 1e-1)
ax.set_xticklabels(densities_df["dataset"], rotation=45)

# Write each bar's value (6 decimals) 10 points above the top of the bar.
for bar in ax.patches:
    ax.annotate(
        format(bar.get_height(), '.6f'),
        (bar.get_x() + bar.get_width() / 2., bar.get_height()),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

# Finalise the layout BEFORE saving: tight_layout only affects what is drawn
# after it runs, so calling it after savefig left the PDF without the layout
# that plt.show() displays.
plt.tight_layout()
plt.savefig("reports/figures/density_of_datasets.pdf", format="pdf", bbox_inches="tight")
plt.show()



In [None]:
# Render DENSITIES as a two-column LaTeX table (densest dataset first) and
# print it.

# Start of the LaTeX table
latex_table = r"""
\begin{table}[h]
\centering
\begin{tabular}{|l|c|}
\hline
\textbf{Dataset} & \textbf{Density} \\
\hline
"""

# Add each row to the table
for dataset, density in sorted(DENSITIES.items(), key=lambda item: item[1], reverse=True):
    # "&" separates columns in LaTeX, so it must be escaped inside a name.
    # Raw string: "\&" in a normal literal is an invalid escape sequence, which
    # Python warns about and plans to reject.
    dataset = dataset.replace("&", r"\&")
    latex_table += f"{dataset} & {density:.2e} \\\\ \\hline \n"

# End of the LaTeX table
latex_table += r"""\hline
\end{tabular}
\caption{Comparison between the density of various datasets.}
\label{tab:density-comparison}
\end{table}
"""

print(latex_table)

In [None]:
# Amazon Fashion reviews file. Built with os.path.join so the separator matches
# the host OS; hard-coded "\" separators only resolve on Windows.
AMAZON_FASHION_DATASET = os.path.join(
    "resources", "other_datasets", "Amazon_fashion", "AMAZON_FASHION.json"
)

# lines=True parses the file as one JSON record per line.
amazon_fashion_df = pd.read_json(AMAZON_FASHION_DATASET, lines=True)

In [None]:
# Size of the Amazon Fashion data: rows, distinct reviewers, distinct products.
# NOTE(review): rows are counted as-is; repeated (reviewerID, asin) pairs are
# not dropped before computing the density — confirm this is intended.
num_reviews = len(amazon_fashion_df)
num_customers = amazon_fashion_df["reviewerID"].nunique()
num_articles = amazon_fashion_df["asin"].nunique()
print(num_articles, num_customers, num_reviews)

# Density of the reviewer x product grid; shown as the cell output.
amazon_fashion_density = density_of_transactions(
    num_transactions=num_reviews,
    num_customers=num_customers,
    num_articles=num_articles,
)
amazon_fashion_density

In [None]:
# Amazon Fashion density rounded to 7 decimal places.
round(amazon_fashion_density, 7)

In [None]:
# Inspect the loaded Amazon Fashion frame.
amazon_fashion_df

In [None]:
# Goodreads reviews file. Built with os.path.join so the separator matches the
# host OS; hard-coded "\" separators only resolve on Windows.
GOODREADS_DATASET = os.path.join(
    "resources", "other_datasets", "Goodreads", "goodreads_reviews_dedup.json"
)

# lines=True parses the file as one JSON record per line.
goodreads_df = pd.read_json(GOODREADS_DATASET, lines=True)

In [None]:
# Size of the Goodreads data: rows, distinct users, distinct books.
num_reviews = len(goodreads_df)
num_users = goodreads_df["user_id"].nunique()
num_books = goodreads_df["book_id"].nunique()
print(num_users, num_books, num_reviews)

# Density of the user x book grid; shown as the cell output.
goodreads_density = density_of_transactions(
    num_transactions=num_reviews,
    num_customers=num_users,
    num_articles=num_books,
)
goodreads_density

In [None]:
# Goodreads density as a fixed-point string with 7 decimals.
f"{goodreads_density:.7f}"

In [None]:
# Book ratings CSV. Built with os.path.join so the separator matches the host
# OS; hard-coded "\" separators only resolve on Windows.
BOOK_RENTAL_PATH = os.path.join(
    "resources", "other_datasets", "book_rental_dataset", "BX-Book-Ratings.csv"
)

# The file is read as comma-separated and decoded as latin1.
book_rental_df = pd.read_csv(BOOK_RENTAL_PATH, sep=",", encoding="latin1")

In [None]:
# Size of the book ratings data before any de-duplication: rows, distinct
# users, distinct ISBNs.
num_reviews = len(book_rental_df)
num_users = book_rental_df["user_id"].nunique()
num_books = book_rental_df["isbn"].nunique()

# Density of the user x book grid.
book_rental_density = density_of_transactions(
    num_transactions=num_reviews,
    num_customers=num_users,
    num_articles=num_books,
)
print(num_users, num_books, num_reviews)
print(book_rental_density)

In [None]:
# Same measurement after keeping one row per (user_id, isbn) pair. Rebinds
# book_rental_df to the reduced frame.
book_rental_df = book_rental_df.drop_duplicates(subset=["user_id", "isbn"])

num_reviews = len(book_rental_df)
num_users = book_rental_df["user_id"].nunique()
num_books = book_rental_df["isbn"].nunique()

# Density of the de-duplicated user x book grid.
book_rental_density = density_of_transactions(
    num_transactions=num_reviews,
    num_customers=num_users,
    num_articles=num_books,
)
print(num_users, num_books, num_reviews)
print(book_rental_density)