In [1]:
import json
import pandas as pd

# Load the two raw JSON dumps. Context managers close the file handles as soon
# as parsing finishes instead of leaving them open for the kernel's lifetime,
# and the explicit encoding avoids depending on the platform default.
with open("./raw_data/animethemes_data.json", "r", encoding="utf-8") as themes_file:
    anime_themes_json = json.load(themes_file)
with open("./raw_data/myanimelist_data.json", "r", encoding="utf-8") as mal_file:
    my_anime_list = json.load(mal_file)
df_my_anime_list = pd.DataFrame(my_anime_list)


In [2]:
# Flatten the nested AnimeThemes payload into two flat record lists:
#   animes_parsed       -> one dict per anime
#   anime_themes_parsed -> one dict per theme that has at least one entry
animes_parsed = []
anime_themes_parsed = []
for source_anime in anime_themes_json:
    # Shallow copy so top-level keys can be removed without touching the raw payload.
    anime = source_anime.copy()

    if "animethemes" in anime:
        for theme in anime.pop("animethemes"):
            entries = theme["animethemeentries"]
            if len(entries) == 0:
                # Themes without any entry are skipped entirely.
                continue
            # Only the first entry of a theme is considered.
            first_entry = entries[0]

            record = {
                "type": theme["type"],
                "id_theme": theme["id"],
                "id_anime": anime["id"],
            }
            if "song" in theme and theme["song"] is not None:
                record["song"] = theme["song"]["title"]
            record["spoiler"] = first_entry["spoiler"]
            record["nsfw"] = first_entry["nsfw"]

            if "videos" in first_entry and len(first_entry["videos"]) > 0:
                # Only the first video of that entry is considered.
                first_video = first_entry["videos"][0]
                record["video_link"] = first_video["link"]
                record["video_resolution"] = first_video["resolution"]

            anime_themes_parsed.append(record)

    if "animesynonyms" in anime:
        # Keep just the synonym text, dropping the wrapper objects.
        anime["synonyms"] = [synonym["text"] for synonym in anime["animesynonyms"]]
        del anime["animesynonyms"]

    # Keep the link of the first image only, and expose the id as "id_anime".
    anime["image"] = anime["images"][0]["link"]
    del anime["images"]
    anime["id_anime"] = anime.pop("id")

    animes_parsed.append(anime)

In [3]:
from fuzzywuzzy import fuzz

# Keep the 500 rows with the highest popularity_score.
my_anime_list_df_test = pd.DataFrame(my_anime_list).sort_values(
    by="popularity_score", ascending=False
).iloc[:500]


def max_ration_index(x, col):
    """Return the position in ``col`` of the item most similar to ``x``.

    Similarity is scored with ``fuzz.ratio``. When several items share the
    best score, the position of the first one is returned.

    Args:
        x: The string to look up.
        col: Iterable of candidate strings.

    Returns:
        Zero-based position of the best-scoring candidate.

    Raises:
        ValueError: If ``col`` yields no items.
    """
    scores = [fuzz.ratio(x, candidate) for candidate in col]
    # max() returns the first maximal element, so ties resolve to the lowest position.
    return max(range(len(scores)), key=scores.__getitem__)


def fuzzy_merge(row, df2):
    """Join ``row`` with the row of ``df2`` whose parsed title is closest.

    The closest row is chosen by ``max_ration_index`` on the ``title_parsed``
    values. Labels are renamed so both sides stay distinguishable: the input
    row's ``title`` becomes ``title_mal``, while the matched row's ``title``
    and ``title_parsed`` become ``title_at`` and ``title_matched``.

    Args:
        row: Series with at least ``title`` and ``title_parsed`` labels.
        df2: DataFrame with at least ``title`` and ``title_parsed`` columns.

    Returns:
        A single Series holding the renamed input row followed by the renamed
        matched row.
    """
    best_position = max_ration_index(row["title_parsed"], df2["title_parsed"])
    matched = df2.iloc[best_position].rename(
        {"title_parsed": "title_matched", "title": "title_at"}
    )
    return pd.concat([row.rename({"title": "title_mal"}), matched], axis=0)


# Lower-cased titles are the join key between the two catalogues.
my_anime_list_df_test["title_parsed"] = my_anime_list_df_test["title"].str.lower()

anime_theme_df_test = pd.DataFrame(animes_parsed).rename(columns={"name": "title"})
anime_theme_df_test["title_parsed"] = anime_theme_df_test["title"].str.lower()

# Pass 1: exact join on the lower-cased title.
df_merged = pd.merge(
    my_anime_list_df_test,
    anime_theme_df_test,
    on="title_parsed",
    suffixes=("_mal", "_at"),
)
df_merged["title_matched"] = df_merged["title_parsed"]

# Pass 2: every row without an exact match is paired with its closest title.
not_merged = my_anime_list_df_test.loc[
    ~my_anime_list_df_test["title_parsed"].isin(df_merged["title_parsed"])
]
fix_merge = not_merged.apply(fuzzy_merge, axis=1, args=(anime_theme_df_test,))

# Exact matches are stacked first, so they win when an id_anime shows up twice.
df_merged = pd.concat([df_merged, fix_merge])
df_anime_merged = df_merged.drop_duplicates(subset="id_anime")

# Keep only the themes that belong to a merged anime.
df_anime_themes = pd.DataFrame(anime_themes_parsed)
df_anime_themes = df_anime_themes.loc[
    df_anime_themes["id_anime"].isin(df_anime_merged["id_anime"])
]

# Number of opening / ending themes per anime.
op_counts = (
    df_anime_themes.loc[df_anime_themes["type"].eq("OP")]
    .value_counts(subset=["id_anime"])
    .rename("op_counts")
)
ed_counts = (
    df_anime_themes.loc[df_anime_themes["type"].eq("ED")]
    .value_counts(subset=["id_anime"])
    .rename("ed_counts")
)

# Default (inner) joins: an anime lacking an OP count or an ED count is dropped.
df_anime_merged = df_anime_merged.merge(op_counts, on="id_anime").merge(
    ed_counts, on="id_anime"
)
df_anime_merged["total_counts"] = df_anime_merged["op_counts"].add(
    df_anime_merged["ed_counts"]
)


In [5]:
def select_animethemes(
    df_anime_themes,
    theme_type="OP",
    min_days_diff=5,
    max_days_diff=240,
    maximum_days=1080,
    random_state=None,
):
    """Draw one theme per day under anime and theme cooldown constraints.

    For each day a theme of ``theme_type`` is sampled uniformly from the
    themes that are currently eligible:

    * its anime was not picked on any of the previous ``min_days_diff`` days;
    * the theme itself was not picked during the previous
      ``max_days_diff - 1`` days, i.e. two uses of the same theme are at
      least ``max_days_diff`` days apart.

    Args:
        df_anime_themes: DataFrame with ``id_theme``, ``id_anime`` and
            ``type`` columns. It is not modified.
        theme_type: Value of the ``type`` column to sample from.
        min_days_diff: Number of most recent days whose animes are excluded.
            ``0`` disables the anime cooldown.
        max_days_diff: Minimum number of days between two uses of the same
            theme. ``0`` or less means a theme is never reused.
        maximum_days: Number of days to schedule.
        random_state: Optional seed (or ``numpy.random.Generator``) for a
            reproducible schedule. ``None`` keeps pandas' default random
            behaviour.

    Returns:
        List of ``id_theme`` values, one per day, in day order.

    Raises:
        ValueError: If no eligible theme is left for some day.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    # A single generator is shared by all draws so that a seed yields one
    # reproducible sequence rather than re-seeding on every day.
    rng = None if random_state is None else np.random.default_rng(random_state)

    # Filter first and copy afterwards, so the "used" column is written to an
    # independent frame and the caller's data is never touched.
    anime_themes = df_anime_themes[df_anime_themes["type"] == theme_type].copy()
    anime_themes["used"] = False

    days = []  # id_theme picked on each day
    recent_animes = []  # id_anime picked on each day, parallel to `days`
    for day in range(maximum_days):
        # The anime cooldown must be checked against anime ids; `days` holds
        # theme ids and cannot be used for this comparison.
        blocked_animes = recent_animes[-min_days_diff:] if min_days_diff > 0 else []

        candidates = anime_themes[
            ~anime_themes["id_anime"].isin(blocked_animes) & ~anime_themes["used"]
        ]
        if candidates.empty:
            raise ValueError(
                f"No eligible '{theme_type}' theme left on day {day}: the pool is "
                "too small for the requested min_days_diff/max_days_diff."
            )

        sampled_row = candidates.sample(1, random_state=rng)

        anime_themes.loc[sampled_row.index, "used"] = True
        days.append(sampled_row["id_theme"].values[0])
        recent_animes.append(sampled_row["id_anime"].values[0])

        # Release the theme picked max_days_diff - 1 days ago so that it is
        # eligible again from tomorrow. Keying the condition on len(days)
        # ensures the themes of the very first days are released as well.
        if 0 < max_days_diff <= len(days):
            released_theme = days[-max_days_diff]
            anime_themes.loc[anime_themes["id_theme"] == released_theme, "used"] = False

    return days


# Animes with rank >= 200 are flagged "hardcore"; all others count as "easy".
df_anime_merged["hardcore"] = df_anime_merged["rank"].ge(200)

df_anime_easy_merged = df_anime_merged.loc[~df_anime_merged["hardcore"]]
df_anime_hardcore_merged = df_anime_merged.loc[df_anime_merged["hardcore"]]

# Split the themes the same way, by the pool their anime belongs to.
df_anime_themes_easy = df_anime_themes.loc[
    df_anime_themes["id_anime"].isin(df_anime_easy_merged["id_anime"])
]
df_anime_themes_hardcore = df_anime_themes.loc[
    df_anime_themes["id_anime"].isin(df_anime_hardcore_merged["id_anime"])
]

# One schedule per pool and theme type; openings are drawn before endings.
sampled_easy_openings, sampled_easy_endings = (
    select_animethemes(df_anime_themes_easy, theme_type=kind, min_days_diff=10)
    for kind in ("OP", "ED")
)
sampled_hardcore_openings, sampled_hardcore_endings = (
    select_animethemes(df_anime_themes_hardcore, theme_type=kind, min_days_diff=10)
    for kind in ("OP", "ED")
)

# Report how many days each schedule covers.
for label, schedule in (
    ("Easy Openings: ", sampled_easy_openings),
    ("Hardcore Openings: ", sampled_hardcore_openings),
    ("Easy Endings: ", sampled_easy_endings),
    ("Hardcore Endings: ", sampled_hardcore_endings),
):
    print(label, len(schedule))


Easy Openings:  1080
Hardcore Openings:  1080
Easy Endings:  1080
Hardcore Endings:  1080


In [6]:
# One column per schedule; the row position is the day number.
dict_days = dict(
    easy_openings=sampled_easy_openings,
    hardcore_openings=sampled_hardcore_openings,
    easy_endings=sampled_easy_endings,
    hardcore_endings=sampled_hardcore_endings,
)

df_days = pd.DataFrame(data=dict_days)

In [14]:
# Index the merged anime table by id_anime. The guard keeps the cell safe to
# re-run: once "id_anime" has become the index it is no longer a column, and
# a second set_index call would raise KeyError.
if "id_anime" in df_anime_merged.columns:
    df_anime_merged = df_anime_merged.set_index("id_anime")
df_anime_merged

Unnamed: 0_level_0,rank,title_mal,popularity_score,quality_score,title_parsed,title_at,slug,year,season,synopsis,created_at,updated_at,deleted_at,synonyms,image,title_matched,op_counts,ed_counts,total_counts,hardcore
id_anime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2611,1,Shingeki no Kyojin,3791790.0,8.54,shingeki no kyojin,Shingeki no Kyojin,shingeki_no_kyojin,2013,Spring,"Several hundred years ago, humans were nearly ...",2021-03-27T00:44:34.670875Z,2021-03-28T07:49:03.610544Z,,[Attack on Titan],https://pub-92474f7785774e91a790e086dfa6b2ef.r...,shingeki no kyojin,2,2,4,False
579,2,Death Note,3759762.0,8.62,death note,Death Note,death_note,2006,Fall,Yagami Light is a 17-year-old genius from Japa...,2021-03-27T00:43:40.010882Z,2021-03-28T01:23:37.354349Z,,[],https://pub-92474f7785774e91a790e086dfa6b2ef.r...,death note,2,3,5,False
797,3,Fullmetal Alchemist: Brotherhood,3216314.0,9.10,fullmetal alchemist: brotherhood,Fullmetal Alchemist: Brotherhood,fullmetal_alchemist_brotherhood,2009,Spring,"""In order for something to be obtained, someth...",2021-03-27T00:43:45.566253Z,2021-03-28T02:02:58.096573Z,,"[Hagane no Renkinjutsushi (2009), Fullmetal Al...",https://pub-92474f7785774e91a790e086dfa6b2ef.r...,fullmetal alchemist: brotherhood,5,5,10,False
2146,4,One Punch Man,3097547.0,8.50,one punch man,One Punch Man,one_punch_man,2015,Fall,"Saitama has a rather peculiar hobby, being a s...",2021-03-27T00:44:22.689167Z,2021-03-28T06:08:45.122855Z,,"[One Punch-Man, One-Punch Man, OPM]",https://pub-92474f7785774e91a790e086dfa6b2ef.r...,one punch man,1,2,3,False
2801,5,Sword Art Online,2981265.0,7.20,sword art online,Sword Art Online,sword_art_online,2012,Summer,"In the near future, a Virtual Reality Massive ...",2021-03-27T00:44:39.801318Z,2021-03-28T08:23:43.886796Z,,"[S.A.O, SAO]",https://pub-92474f7785774e91a790e086dfa6b2ef.r...,sword art online,2,2,4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1169,423,Hinamatsuri (TV),493909.0,8.14,hinamatsuri (tv),Hinamatsuri,hinamatsuri,2018,Spring,The comedy manga centers around a super-powere...,2021-03-27T00:43:54.938024Z,2021-03-28T03:10:45.462498Z,,[Hina Festival],https://pub-92474f7785774e91a790e086dfa6b2ef.r...,hinamatsuri,1,3,4,True
3774,458,Summertime Render,464937.0,8.50,summertime render,Summer Time Render,summer_time_render,2022,Spring,A tragic incident calls a young man named Shin...,2022-04-14T16:35:52.930749Z,2022-04-14T16:35:52.930749Z,,[Summertime Rendering],https://pub-92474f7785774e91a790e086dfa6b2ef.r...,summer time render,2,3,5,True
827,488,Gakkougurashi!,440517.0,7.62,gakkougurashi!,Gakkou Gurashi!,gakkou_gurashi,2015,Summer,Why would anyone form a School Living Club? Co...,2021-03-27T00:43:46.295320Z,2021-03-28T02:07:57.534986Z,,[School-Live!],https://pub-92474f7785774e91a790e086dfa6b2ef.r...,gakkou gurashi!,1,4,5,True
1741,498,Made in Abyss Movie 3: Fukaki Tamashii no Reimei,433425.0,8.62,made in abyss movie 3: fukaki tamashii no reimei,Made in Abyss Movie: Fukaki Tamashii no Reimei,made_in_abyss_movie_fukaki_tamashii_no_reimei,2020,Winter,<i>Dawn of the Deep Soul</i> continues the epi...,2021-03-27T00:44:11.092340Z,2021-03-28T04:54:40.424685Z,,[Gekijouban Made in Abyss: Fukaki Tamashii no ...,https://pub-92474f7785774e91a790e086dfa6b2ef.r...,made in abyss movie: fukaki tamashii no reimei,1,1,2,True


In [15]:
# Index the theme table by id_theme. The guard keeps the cell safe to re-run:
# once "id_theme" has become the index it is no longer a column, and a second
# set_index call would raise KeyError.
if "id_theme" in df_anime_themes.columns:
    df_anime_themes = df_anime_themes.set_index("id_theme")
df_anime_themes


Unnamed: 0_level_0,type,id_anime,song,spoiler,nsfw,video_link,video_resolution
id_theme,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6549,OP,16,Answer,False,False,https://v.animethemes.moe/SangatsuNoLion-OP1.webm,1080.0
6550,OP,16,Sayonara Bystander,False,False,https://v.animethemes.moe/SangatsuNoLion-OP2.webm,1080.0
6551,ED,16,Fighter,False,False,https://v.animethemes.moe/SangatsuNoLion-ED1.webm,1080.0
6552,ED,16,Nyaa Shougi Ondo,False,False,https://v.animethemes.moe/SangatsuNoLion-ED2.webm,1080.0
6553,ED,16,orion,False,False,https://v.animethemes.moe/SangatsuNoLion-ED3.webm,1080.0
...,...,...,...,...,...,...,...
12293,OP,4065,Idol,False,False,https://v.animethemes.moe/OshiNoKo-OP1.webm,720.0
12294,ED,4065,Mephisto,False,False,https://v.animethemes.moe/OshiNoKo-ED1.webm,720.0
12284,OP,4067,Dokimeki Diary,False,False,https://v.animethemes.moe/Pokemon2023-OP1.webm,720.0
12372,OP,4102,Ao no Sumika,False,False,https://v.animethemes.moe/JujutsuKaisenS2-OP1....,720.0


In [16]:
# Label the row index so it reads as the day identifier.
df_days.index = df_days.index.rename("id_day")
df_days


Unnamed: 0_level_0,easy_openings,hardcore_openings,easy_endings,hardcore_endings
id_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2880,7103,7235,2801
1,1928,3235,1830,6187
2,5493,727,9580,3978
3,994,685,1817,8151
4,5293,4393,9791,8150
...,...,...,...,...
1075,7647,1153,1811,5365
1076,3887,10763,8084,3342
1077,5695,5507,2016,7078
1078,9787,5114,6540,3346


In [17]:
# Preview only: shows the index as a regular column; df_days itself is not modified.
df_days.reset_index(drop=False)

Unnamed: 0,id_day,easy_openings,hardcore_openings,easy_endings,hardcore_endings
0,0,2880,7103,7235,2801
1,1,1928,3235,1830,6187
2,2,5493,727,9580,3978
3,3,994,685,1817,8151
4,4,5293,4393,9791,8150
...,...,...,...,...,...
1075,1075,7647,1153,1811,5365
1076,1076,3887,10763,8084,3342
1077,1077,5695,5507,2016,7078
1078,1078,9787,5114,6540,3346


In [20]:
import pandas as pd


# Sanity check: row count and distinct id_anime count are equal when ids are unique.
df_anime = pd.read_csv("./parsed_data/animes.csv")
print(df_anime.shape[0], df_anime["id_anime"].nunique())

495 495


In [23]:
# Sanity check: id_theme uniqueness, then how many themes point at an anime in df_anime.
df_theme = pd.read_csv("./parsed_data/themes.csv")
print(df_theme.shape[0], df_theme["id_theme"].nunique())
theme_has_known_anime = df_theme["id_anime"].isin(df_anime["id_anime"])
print(theme_has_known_anime.sum())

2090 2090
2090


In [22]:
# Sanity check: id_day uniqueness, then per schedule column how many days
# reference a theme id present in df_theme.
df_day = pd.read_csv("./parsed_data/days.csv")
print(df_day.shape[0], df_day["id_day"].nunique())
print(
    *(
        df_day[schedule].isin(df_theme["id_theme"]).sum()
        for schedule in (
            "easy_openings",
            "hardcore_openings",
            "easy_endings",
            "hardcore_endings",
        )
    )
)

1080 1080
1080 1080 1080 1080
