In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
def generate_anime_catalog_partitions(no_partitions: int = 7):
    """Write cumulative daily snapshots of the anime catalog and its synopsis table.

    Reads ``anime.csv`` and ``anime_with_synopsis.csv`` from the relative
    directory ``s3/anime-ds-project/raw_data/anime_catalog`` and, for each
    partition ``i`` (0-based), writes into that same directory:

    * ``anime_2024-05-0{i+1}.csv`` -- the first rows of ``anime.csv`` up to and
      including partition ``i`` (each snapshot contains all previous ones);
    * ``anime_with_synopsis_2024-05-0{i+1}.csv`` -- the synopsis rows whose
      ``MAL_ID`` appears in the ``anime_id`` column of that snapshot.

    Rows are divided the way ``np.array_split`` divides them: the first
    ``len(df) % no_partitions`` partitions receive one extra row.

    Args:
        no_partitions: Number of snapshots to generate. The file-name suffix is
            built as ``0{i+1}``, so only values up to 9 produce well-formed
            dates; the ratings generator reads the same names, so the pattern
            is intentionally left unchanged here.

    Raises:
        ValueError: If ``no_partitions`` is not positive.
    """
    if no_partitions <= 0:
        # Same error np.array_split raised for a non-positive section count.
        raise ValueError("number sections must be larger than 0.")

    data_path = os.path.join("s3", "anime-ds-project", "raw_data", "anime_catalog")

    df_anime = pd.read_csv(os.path.join(data_path, "anime.csv"))
    df_synopsis = pd.read_csv(os.path.join(data_path, "anime_with_synopsis.csv"))

    # Partition sizes follow np.array_split: `remainder` leading partitions get
    # one extra row. Only the running end offset is needed because every
    # snapshot is a prefix of the full catalog.
    base_size, remainder = divmod(len(df_anime), no_partitions)
    rows_so_far = 0
    for i in range(no_partitions):
        rows_so_far += base_size + (1 if i < remainder else 0)

        # Prefix slice instead of concatenating partitions one by one: same
        # rows, no repeated copying and no concat with an empty frame.
        df_anime_acum = df_anime.iloc[:rows_so_far]
        df_anime_acum.to_csv(
            os.path.join(data_path, f"anime_2024-05-0{i+1}.csv"),
            index=False,
        )

        # Vectorised membership test (hash based) rather than a per-row
        # `x in list` scan.
        in_snapshot = df_synopsis["MAL_ID"].isin(df_anime_acum["anime_id"])
        df_synopsis_filtered = df_synopsis[in_snapshot]
        df_synopsis_filtered.to_csv(
            os.path.join(data_path, f"anime_with_synopsis_2024-05-0{i+1}.csv"),
            index=False,
        )

In [3]:
def generate_anime_ratings_partitions(no_partitions: int = 7, random_state=None):
    """Split ``rating.csv`` into disjoint daily rating files.

    Reads ``s3/anime-ds-project/raw_data/anime_ratings/rating.csv`` and, for
    each partition ``i`` (0-based), writes ``ratings_2024-05-0{i+1}.csv`` into
    the same directory with the columns ``user_id``, ``anime_id`` and
    ``rating``.

    For partition ``i`` the candidate ratings are those that
    (a) have not been written to an earlier partition, and
    (b) refer to an ``anime_id`` listed in the catalog snapshot
    ``anime_catalog/anime_2024-05-0{i+1}.csv``.
    If there are more candidates than ``round(total_ratings / no_partitions)``,
    that many are sampled at random; otherwise all candidates are written.
    Ratings whose anime never shows up in a snapshot are never written.

    The catalog snapshot files must already exist on disk. Four progress
    values are printed per partition: the partition number, the ratings still
    unassigned, the candidates, and the rows written.

    Args:
        no_partitions: Number of daily files to generate. The file-name suffix
            is built as ``0{i+1}``, so only values up to 9 produce well-formed
            dates; the pattern matches the catalog snapshot names it reads.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            split can be reproduced. ``None`` (default) keeps the sampling
            unseeded.

    Raises:
        ZeroDivisionError: If ``no_partitions`` is 0.
    """
    catalog_data_path = os.path.join("s3", "anime-ds-project", "raw_data", "anime_catalog")
    ratings_data_path = os.path.join("s3", "anime-ds-project", "raw_data", "anime_ratings")

    # `rating_id` is the row position in rating.csv; it gives every rating a
    # unique key so rows already written can be removed from the pool.
    df_ratings = pd.read_csv(
        os.path.join(ratings_data_path, "rating.csv")
    ).reset_index(names="rating_id")

    partition_length = round(len(df_ratings) / no_partitions)

    for i in range(no_partitions):
        print(i+1)
        print(len(df_ratings))

        catalog_ids = pd.read_csv(
            os.path.join(catalog_data_path, f"anime_2024-05-0{i+1}.csv")
        )["anime_id"]

        # Membership filter instead of a left merge on a flag column: a
        # repeated anime_id in the snapshot can no longer duplicate ratings.
        df_ratings_filtered = df_ratings[df_ratings["anime_id"].isin(catalog_ids)]

        if len(df_ratings_filtered) > partition_length:
            df_ratings_sample = df_ratings_filtered.sample(
                partition_length, random_state=random_state
            )
        else:
            df_ratings_sample = df_ratings_filtered

        print(len(df_ratings_filtered))
        print(len(df_ratings_sample))

        df_ratings_sample[["user_id", "anime_id", "rating"]].to_csv(
            os.path.join(ratings_data_path, f"ratings_2024-05-0{i+1}.csv"),
            index=False,
        )

        # Anti-join on rating_id: drop what was just written so later
        # partitions only draw from ratings that are still unassigned.
        already_written = df_ratings["rating_id"].isin(df_ratings_sample["rating_id"])
        df_ratings = df_ratings[~already_written]

In [4]:
# Order matters: the ratings step reads the catalog snapshot files that the
# catalog step writes, so the catalog partitions have to be generated first.
no_partitions = 7
generate_anime_catalog_partitions(no_partitions=no_partitions)
generate_anime_ratings_partitions(no_partitions=no_partitions)

1
7813737
5018996
1116248
2
6697489
5552940
1116248
3
5581241
5116102
1116248
4
4464993
4288913
1116248
5
3348745
3221986
1116248
6
2232497
2108135
1116248
7
1116249
1116239
1116239
