### Import packages


In [1]:
import pandas as pd
import json

### Handle Anime Data


In [2]:
# Read the anime dataset JSON file into a DataFrame.
anime_df = pd.read_json(path_or_buf="../dataset/anime.json")

In [3]:
# Display the loaded frame (pandas truncates long frames in the rendered output).
anime_df

Unnamed: 0,_id,malId,kitsuId,title,en,enjp,jp,slug,poster,type,...,endDate,rating,episodeCount,studio,score,popularity,likes,recommended,runtime,source
0,1,1.0,1.0,Cowboy Bebop,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,Cowboy_Bebop,https://media.kitsu.io/anime/poster_images/1/o...,TV,...,1999-04-24,R - 17+ Violence & Profanity,26.0,[Sunrise],87.5,1832322.0,81086.0,"[{'_id': 874, 'title': 'Uchuu Kaizoku Captain ...",,Original
1,2,5.0,2.0,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,Cowboy Bebop: Tengoku no Tobira,カウボーイビバップ 天国の扉,Cowboy_Bebop__Tengoku_no_Tobira,https://media.kitsu.io/anime/poster_images/2/o...,Movie,...,2001-09-01,R - 17+ Violence & Profanity,1.0,[Bones],83.8,371915.0,1545.0,"[{'_id': 4726, 'title': 'Code Geass: Boukoku n...",1 Hour 55 Mins,Original
2,3,6.0,3.0,Trigun,Trigun,Trigun,トライガン,Trigun,https://media.kitsu.io/anime/3/poster_image/d4...,TV,...,1998-09-30,PG-13 - Teens 13 or older,26.0,[Madhouse],82.2,750793.0,15706.0,"[{'_id': 8299, 'title': 'God Eater', 'slug': '...",,Manga
3,4,7.0,4.0,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),Witch_Hunter_Robin,https://media.kitsu.io/anime/poster_images/4/o...,TV,...,2002-12-25,PG-13 - Teens 13 or older,26.0,[Sunrise],72.4,115033.0,638.0,"[{'_id': 13141, 'title': 'Mahouka Koukou no Re...",,Original
4,5,8.0,5.0,Bouken Ou Beet,Beet the Vandel Buster,Bouken Ou Beet,冒険王ビィト,Bouken_Ou_Beet,https://media.kitsu.io/anime/poster_images/5/o...,TV,...,2005-09-29,PG-13 - Teens 13 or older,52.0,[Toei Animation],69.3,15300.0,15.0,"[{'_id': 13947, 'title': 'Kimetsu no Yaiba: Yu...",,Manga
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24071,24087,57799.0,,Bai Bian Xiao Ba 12th Season,,Bai Bian Xiao Ba 12th Season,百变校巴 第12季,Bai_Bian_Xiao_Ba_12th_Season,https://cdn.myanimelist.net/images/anime/1226/...,TV,...,,,26.0,,,15.0,,[],,Original
24072,24088,57801.0,,Bai She 3: Fusheng,,Bai She 3: Fusheng,白蛇3：浮生,Bai_She_3__Fusheng,https://cdn.myanimelist.net/images/anime/1448/...,Movie,...,,,1.0,,,346.0,,[],,Other
24073,24089,57810.0,,Shoushimin Series,,Shoushimin Series,小市民シリーズ,Shoushimin_Series,https://cdn.myanimelist.net/images/anime/1697/...,TV,...,,,,[Lapin Track],,1937.0,,[],,Novel
24074,24090,57820.0,,Kamitsubaki-shi Kensetsuchuu.,,Kamitsubaki-shi Kensetsuchuu.,神椿市建設中.,Kamitsubaki-shi_Kensetsuchuu,https://cdn.myanimelist.net/images/anime/1653/...,TV,...,,,,,,1.0,,[],,Other


Remove rows without a malId and drop rows with a duplicate malId, for consistency.


In [4]:
# Keep only rows that have a malId, and only the first row seen for each malId.
# The result is re-bound instead of mutating with `inplace=True`, and `subset`
# is passed as a list because older pandas releases do not accept a bare
# string label for `dropna`.
anime_df = anime_df.dropna(subset=["malId"]).drop_duplicates(subset=["malId"])

Remove unnecessary columns.


In [5]:
# Drop the columns that are not needed further on in the notebook.
anime_df = anime_df.drop(["slug", "recommended", "runtime"], axis="columns")

### Handle Anime Tags


In [6]:
# Parse the tag dataset and keep only what is stored under its "data" key.
with open("../dataset/animetags.json", mode="r", encoding="utf-8") as file:
    animetags = json.load(file)["data"]

Load the list of valid tags. Adult tags, tags with typos, unnecessary tags, etc. have already been removed from this list.


In [7]:
# Load the comma-separated list of allowed tags into a set.
# - The context manager closes the file handle, which a bare open() never does.
# - The encoding is explicit (utf-8, the same as for the tag dataset) so the
#   result does not depend on the platform default.
# - Each entry is stripped so a trailing newline or padding space cannot keep
#   a tag from ever matching; empty entries are skipped.
with open("validTags.txt", "r", encoding="utf-8") as file:
    valid_tags = {tag.strip() for tag in file.read().split(",") if tag.strip()}

A function that gets the mal id from a list of source links.


In [8]:
def get_mal_id(links):
    """Return the MyAnimeList id found in a list of source links.

    The id is the last path segment of the first link that contains
    "myanimelist" and yields a non-empty segment. A trailing slash is
    ignored, so ".../anime/5/" gives "5" rather than an empty string.

    Args:
        links: Iterable of URL strings; ``None`` is treated as empty.

    Returns:
        The id as a string, or ``None`` when no link provides one.
    """
    for link in links or ():
        if "myanimelist" not in link:
            continue
        # Strip trailing slashes first; otherwise the last segment is "".
        mal_id = link.rstrip("/").rsplit("/", 1)[-1]
        if mal_id:
            return mal_id
    return None

A function that creates a map from each anime's MAL id to its valid tags.


In [9]:
def extract_tags(data):
    """Map each anime's MAL id to a comma-separated string of its valid tags.

    Entries for which ``get_mal_id`` returns a falsy value are skipped.
    Tags keep their original order; any tag not in ``valid_tags`` is dropped.
    """
    tags_by_id = {}
    for entry in data:
        anime_id = get_mal_id(entry["sources"])
        if not anime_id:
            continue
        kept = (tag for tag in entry["tags"] if tag in valid_tags)
        tags_by_id[anime_id] = ", ".join(kept)
    return tags_by_id

In [10]:
# Build the MAL id -> tags mapping from the loaded tag entries.
tags = extract_tags(data=animetags)

Create a new dataframe for the tags.


In [11]:
# Two-column frame of (malId, tags). malId is cast to float, which is how
# anime_df displays its malId values, so the two frames can be joined on it.
tags_df = pd.DataFrame(list(tags.items()), columns=["malId", "tags"]).astype(
    {"malId": float}
)

### Prepare Final Dataset


Merge the anime data with tags.


In [12]:
# Inner join on malId: only anime present in both frames are kept.
# Both inputs carry a `tags` column, so the result holds `tags_x` (from
# anime_df) and `tags_y` (from tags_df).
df = anime_df.merge(tags_df, how="inner", on="malId")

In [13]:
# Display the merged frame.
df

Unnamed: 0,_id,malId,kitsuId,title,en,enjp,jp,poster,type,synopsis,...,startDate,endDate,rating,episodeCount,studio,score,popularity,likes,source,tags_y
0,1,1.0,1.0,Cowboy Bebop,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,https://media.kitsu.io/anime/poster_images/1/o...,TV,"Crime is timeless. By the year 2071, humanity ...",...,1998-04-03,1999-04-24,R - 17+ Violence & Profanity,26.0,[Sunrise],87.5,1832322.0,81086.0,Original,"action, adult cast, adventure, amnesia, anti-h..."
1,2,5.0,2.0,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,Cowboy Bebop: Tengoku no Tobira,カウボーイビバップ 天国の扉,https://media.kitsu.io/anime/poster_images/2/o...,Movie,"Another day, another bounty—such is the life o...",...,2001-09-01,2001-09-01,R - 17+ Violence & Profanity,1.0,[Bones],83.8,371915.0,1545.0,Original,"action, adult cast, adventure, amnesia, anti-h..."
2,3,6.0,3.0,Trigun,Trigun,Trigun,トライガン,https://media.kitsu.io/anime/3/poster_image/d4...,TV,"Vash the Stampede is the man with a $$60,000,0...",...,1998-04-01,1998-09-30,PG-13 - Teens 13 or older,26.0,[Madhouse],82.2,750793.0,15706.0,Manga,"action, adult cast, adventure, alien, aliens, ..."
3,4,7.0,4.0,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),https://media.kitsu.io/anime/poster_images/4/o...,TV,Robin Sena is a powerful craft user drafted in...,...,2002-07-03,2002-12-25,PG-13 - Teens 13 or older,26.0,[Sunrise],72.4,115033.0,638.0,Original,"action, asia, bionic powers, coming of age, co..."
4,5,8.0,5.0,Bouken Ou Beet,Beet the Vandel Buster,Bouken Ou Beet,冒険王ビィト,https://media.kitsu.io/anime/poster_images/5/o...,TV,It is the dark century and the people are suff...,...,2004-09-30,2005-09-29,PG-13 - Teens 13 or older,52.0,[Toei Animation],69.3,15300.0,15.0,Manga,"action, adventure, alternative world, based on..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21418,24074,57478.0,,Kuramerukagari,,Kuramerukagari,くらめるかがり,https://cdn.myanimelist.net/images/anime/1716/...,Movie,This is a story that weaves together people an...,...,2024-04-12,,,1.0,,,22.0,,Original,"detective, fantasy, mystery, original work, sc..."
21419,24075,57538.0,,Sai-Kyo-Oh! Zukan: The Ultimate Battles,,Sai-Kyo-Oh! Zukan: The Ultimate Battles,最強王図鑑 ～The Ultimate Battles～,https://cdn.myanimelist.net/images/anime/1229/...,TV,,...,2024-01-06,,,,[OLM Digital],,71.0,,Book,action
21420,24076,57677.0,,Bibliomania,,Bibliomania,BIBLIOMANIA,https://cdn.myanimelist.net/images/anime/1019/...,Movie,,...,2013-04-06,,,1.0,,,31.0,1.0,Original,"alternate universe, avant garde, cosmic horror..."
21421,24078,57702.0,,Zaiyuki Pilot,,Zaiyuki Pilot,ザイユウキ,https://cdn.myanimelist.net/images/anime/1167/...,ONA,Pilot version of the upcoming Zaiyuki series r...,...,2023-12-30,,,1.0,[Village Studio],,36.0,,Original,"action, adventure, fantasy, supernatural"


#### Format the columns


Convert column types


In [14]:
# malId -> nullable integer. errors="ignore" leaves the column unchanged if
# the cast fails.
df["malId"] = df["malId"].astype(pd.Int64Dtype(), errors="ignore")

# Count columns: anything non-numeric becomes missing, then nullable integer.
for column in ("episodeCount", "popularity"):
    df[column] = pd.to_numeric(df[column], errors="coerce").astype(pd.Int64Dtype())

string_columns = [
    "title",
    "en",
    "enjp",
    "jp",
    "poster",
    "type",
    "synopsis",
    "season",
    "startDate",
    "endDate",
    "rating",
    "status",
    "source",
]
# Use the nullable "string" dtype rather than `str`: astype(str) converts
# missing values into the literal text "nan"/"None", which would then be
# exported as if it were real data. With "string" they stay missing (<NA>).
# NOTE(review): consumers of the exported JSON will see null for missing text
# fields instead of "nan"/"None" — confirm they handle null.
df[string_columns] = df[string_columns].astype("string")

Drop unnecessary columns


In [15]:
# Remove the Kitsu id and `tags_x`, the tags column the merge took from anime_df.
df = df.drop(columns=["kitsuId", "tags_x"])

Convert array values to string


In [16]:
def _join_values(values):
    """Join a non-empty list/tuple into one comma-separated string.

    Any other value is returned unchanged: missing values, empty lists, and
    strings. Leaving strings alone keeps the cell safe to re-run, since
    joining an already-joined string would split it into single characters.
    """
    if isinstance(values, (list, tuple)) and values:
        return ", ".join(values)
    return values


for column in ("genres", "studio"):
    df[column] = df[column].apply(_join_values)

Rename tags column


In [17]:
# Give the tags column that came from tags_df its final name.
df = df.rename(columns={"tags_y": "tags"})

In [18]:
# Display the formatted frame before it is saved.
df

Unnamed: 0,_id,malId,title,en,enjp,jp,poster,type,synopsis,status,...,startDate,endDate,rating,episodeCount,studio,score,popularity,likes,source,tags
0,1,1,Cowboy Bebop,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,https://media.kitsu.io/anime/poster_images/1/o...,TV,"Crime is timeless. By the year 2071, humanity ...",Finished,...,1998-04-03,1999-04-24,R - 17+ Violence & Profanity,26,Sunrise,87.5,1832322,81086.0,Original,"action, adult cast, adventure, amnesia, anti-h..."
1,2,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,Cowboy Bebop: Tengoku no Tobira,カウボーイビバップ 天国の扉,https://media.kitsu.io/anime/poster_images/2/o...,Movie,"Another day, another bounty—such is the life o...",Finished,...,2001-09-01,2001-09-01,R - 17+ Violence & Profanity,1,Bones,83.8,371915,1545.0,Original,"action, adult cast, adventure, amnesia, anti-h..."
2,3,6,Trigun,Trigun,Trigun,トライガン,https://media.kitsu.io/anime/3/poster_image/d4...,TV,"Vash the Stampede is the man with a $$60,000,0...",Finished,...,1998-04-01,1998-09-30,PG-13 - Teens 13 or older,26,Madhouse,82.2,750793,15706.0,Manga,"action, adult cast, adventure, alien, aliens, ..."
3,4,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),https://media.kitsu.io/anime/poster_images/4/o...,TV,Robin Sena is a powerful craft user drafted in...,Finished,...,2002-07-03,2002-12-25,PG-13 - Teens 13 or older,26,Sunrise,72.4,115033,638.0,Original,"action, asia, bionic powers, coming of age, co..."
4,5,8,Bouken Ou Beet,Beet the Vandel Buster,Bouken Ou Beet,冒険王ビィト,https://media.kitsu.io/anime/poster_images/5/o...,TV,It is the dark century and the people are suff...,Finished,...,2004-09-30,2005-09-29,PG-13 - Teens 13 or older,52,Toei Animation,69.3,15300,15.0,Manga,"action, adventure, alternative world, based on..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21418,24074,57478,Kuramerukagari,,Kuramerukagari,くらめるかがり,https://cdn.myanimelist.net/images/anime/1716/...,Movie,This is a story that weaves together people an...,Not Yet Aired,...,2024-04-12,,,1,,,22,,Original,"detective, fantasy, mystery, original work, sc..."
21419,24075,57538,Sai-Kyo-Oh! Zukan: The Ultimate Battles,,Sai-Kyo-Oh! Zukan: The Ultimate Battles,最強王図鑑 ～The Ultimate Battles～,https://cdn.myanimelist.net/images/anime/1229/...,TV,,Airing,...,2024-01-06,,,,OLM Digital,,71,,Book,action
21420,24076,57677,Bibliomania,,Bibliomania,BIBLIOMANIA,https://cdn.myanimelist.net/images/anime/1019/...,Movie,,Finished,...,2013-04-06,,,1,,,31,1.0,Original,"alternate universe, avant garde, cosmic horror..."
21421,24078,57702,Zaiyuki Pilot,,Zaiyuki Pilot,ザイユウキ,https://cdn.myanimelist.net/images/anime/1167/...,ONA,Pilot version of the upcoming Zaiyuki series r...,Finished,...,2023-12-30,,,1,Village Studio,,36,,Original,"action, adventure, fantasy, supernatural"


### Save dataset


In [19]:
# Write the final dataset as a JSON array with one object per row.
df.to_json(path_or_buf="data.json", orient="records")

### Extra Code


Get the list of unique tags from the tags dataset


In [20]:
# all_tags = [tag.strip() for tags in merged_df["tags_y"] for tag in tags.split(",")]

# # Get unique tags
# unique_tags = list(set(all_tags))

# # Write unique tags to a text file
# with open("unique_tags.txt", "w") as file:
#     file.write(",".join(unique_tags))