In [1]:
# Uncomment to download

# import kagglehub
# path = kagglehub.dataset_download("marlesson/myanimelist-dataset-animes-profiles-reviews")
# print("Path to dataset files:", path)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
# NOTE(review): called with only the "ignore" action (no category or module),
# this silences every warning for the rest of the session, including any
# pandas warnings the cells below may raise. Consider scoping it to a specific
# category once those cells are clean.
warnings.filterwarnings("ignore")

In [3]:

# Read the three raw CSV tables (animes, profiles, reviews) in that order.
animes_df, profiles_df, reviews_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "original_data/animes.csv",
        "original_data/profiles.csv",
        "original_data/reviews.csv",
    )
)

In [4]:
# Anime lookup: keep only the id and the title, exposing the id as a_id.
# rename() returns a new frame, so nothing is mutated in place on a column
# selection of animes_df (the pattern pandas flags with SettingWithCopyWarning).
cleared_anime_df = animes_df[["uid", "title"]].rename(columns={"uid": "a_id"})

# User lookup: one row per distinct profile name (first occurrence kept),
# numbered 0..n-1 in that order; the number is stored in the u_id column.
cleared_profile_df = profiles_df[["profile"]].drop_duplicates(subset="profile")
cleared_profile_df = cleared_profile_df.assign(u_id=range(len(cleared_profile_df)))


# Reviews: keep reviewer, reviewed anime and score; anime_uid becomes a_id so
# it shares its column name with cleared_anime_df.
cleared_reviews_df = reviews_df[["profile", "anime_uid", "score"]].rename(
    columns={"anime_uid": "a_id"}
)

In [5]:
# Attach the numeric user id to every review by joining on the profile name
# (default inner join: reviews without a matching profile row are dropped).
merged_review_profile_df = pd.merge(
    cleared_reviews_df, cleared_profile_df, on="profile"
)

# Attach the anime title by joining on a_id, then discard rows that are exact
# duplicates across all columns (the joins can produce repeated rows).
merged_review_profile_anime_df = pd.merge(
    merged_review_profile_df, cleared_anime_df, on="a_id"
).drop_duplicates()

In [6]:
# Export an anime-id -> genre lookup for the recommender.
# rename() is chained so the column selection is never renamed in place
# (in-place edits on a selection are what pandas reports as SettingWithCopyWarning).
anime_id_to_genre = animes_df[["uid", "genre"]].rename(columns={"uid": "a_id"})
anime_id_to_genre.to_csv("data/anime_id_to_genre.csv")

In [7]:
# Put the columns in their final order and give every row a fresh, unnamed
# 0..n-1 row id as index. reset_index(drop=True) replaces the previous
# add-"id"-column / set_index(inplace=True) / clear-name sequence, which
# assigned into a column selection (SettingWithCopyWarning territory).
final_df = merged_review_profile_anime_df[
    ["u_id", "a_id", "profile", "title", "score"]
].reset_index(drop=True)

# save the final data (the row id is written as the first, unnamed CSV column)
final_df.to_csv("data/final_data.csv")

In [8]:
def reduceMatrixSize(df, u_review_count, a_review_count):
    """Shrink a review table to frequently-reviewing users and frequently-reviewed animes.

    The two filters run one after the other:

    1. keep rows whose ``u_id`` occurs at least ``u_review_count`` times in ``df``;
    2. of what is left, keep rows whose ``a_id`` occurs at least
       ``a_review_count`` times.

    Because step 2 removes rows after step 1 has been evaluated, a user in the
    returned frame can end up with fewer than ``u_review_count`` rows.

    Args:
        df: DataFrame with at least the columns ``u_id`` and ``a_id``.
        u_review_count: minimum number of rows per ``u_id`` (checked on ``df``).
        a_review_count: minimum number of rows per ``a_id`` (checked after the
            user filter).

    Returns:
        A filtered DataFrame; the input frame is not modified and the
        surviving rows keep their original index labels.

    Side effects:
        Prints the number of distinct users and animes that remain.
    """
    # Row-aligned count of how many rows share each row's u_id.
    rows_per_user = df.groupby('u_id')['u_id'].transform('count')
    df = df[rows_per_user >= u_review_count]

    # Same idea for animes, computed on the already user-filtered rows.
    rows_per_anime = df.groupby('a_id')['a_id'].transform('count')
    df = df[rows_per_anime >= a_review_count]

    # Single quotes inside the replacement fields: reusing the enclosing double
    # quote there is only valid from Python 3.12 on and a SyntaxError before.
    print(f"Reduced matrix size to {df['u_id'].nunique()} users and {df['a_id'].nunique()} animes")
    return df

In [9]:
# First subset: users with at least 50 reviews, then animes with at least 14
# reviews among those users (presumably aiming at roughly 100x100, going by
# the file name).
new_df = reduceMatrixSize(
    final_df,
    u_review_count=50,
    a_review_count=14,
)
new_df.to_csv("data/100x100.csv")

Reduced matrix size to 144 users and 131 animes


In [10]:
# Smaller subset: users with at least 100 reviews, then animes with at least 8
# reviews among those users (saved under the 10x10 name).
new_df = reduceMatrixSize(
    final_df,
    u_review_count=100,
    a_review_count=8,
)
new_df.to_csv("data/10x10.csv")

Reduced matrix size to 39 users and 87 animes


In [11]:
# Reload the saved tables. Both files were written with their row id as the
# first, unnamed CSV column; index_col=0 turns it back into the index. Without
# it pandas loads that column as data named "Unnamed: 0", which then ends up as
# an extra column in 100x100_2.csv and leaves final_df with a different schema
# than the one the earlier cells built.
final_df = pd.read_csv("data/final_data.csv", index_col=0)
hundred_df = pd.read_csv("data/100x100.csv", index_col=0)

# Anti-join on the row id: keep only the reviews that are not already part of
# the first 100x100 subset (whose rows are a filtered subset of final_df and
# therefore carry the same ids).
result = final_df[~final_df.index.isin(hundred_df.index)]

second_hundred_df = reduceMatrixSize(result, 44, 10) # 100x100_2
second_hundred_df.to_csv("data/100x100_2.csv")

Reduced matrix size to 145 users and 138 animes
