In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
from pylab import rcParams
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Notebook-wide presentation settings: default matplotlib figure size (width, height)
# and a wider notebook container so wide tables are easier to read.
plt.rcParams['figure.figsize'] = (10, 6)
display(HTML("<style>.container { width:95% !important; }</style>"))

In [25]:
# Read the raw anime table from disk; the trailing expression shows (rows, columns).
ANIME_CSV_PATH = "data/tidy_anime.csv"
df = pd.read_csv(ANIME_CSV_PATH)
df.shape

(77911, 28)

In [26]:
# Columns retained from the raw frame; every other column is dropped.
desired_cols = [
    'animeID', 'title_english', 'type', 'source', 'producers', 'genre', 'studio',
    'episodes', 'premiered', 'rating', 'score', 'scored_by', 'rank', 'popularity',
    'members', 'favorites', 'synopsis',
]
# Select all rows and only the desired columns, then preview the first rows.
truncated_df = df.loc[:, desired_cols]
truncated_df.head()

Unnamed: 0,animeID,title_english,type,source,producers,genre,studio,episodes,premiered,rating,score,scored_by,rank,popularity,members,favorites,synopsis,background
0,1,Cowboy Bebop,TV,Original,Bandai Visual,Action,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever...",When Cowboy Bebop first aired in spring of 199...
1,1,Cowboy Bebop,TV,Original,Bandai Visual,Adventure,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever...",When Cowboy Bebop first aired in spring of 199...
2,1,Cowboy Bebop,TV,Original,Bandai Visual,Comedy,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever...",When Cowboy Bebop first aired in spring of 199...
3,1,Cowboy Bebop,TV,Original,Bandai Visual,Drama,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever...",When Cowboy Bebop first aired in spring of 199...
4,1,Cowboy Bebop,TV,Original,Bandai Visual,Sci-Fi,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever...",When Cowboy Bebop first aired in spring of 199...


In [45]:
# Drop rows without an English title: only anime that have a `title_english` are kept.

has_english_title = truncated_df['title_english'].notna()
filtered_df = truncated_df.loc[has_english_title]

# Report how many rows the filter removed.
orig_len, new_len = len(truncated_df), len(filtered_df)
print("removed {} bad anime".format(orig_len - new_len))

removed 30430 bad anime


In [52]:
# Currently each anime is duplicated, one row per genre per studio. Collect, for every
# animeID, the list of genre values and the list of studio values found on its rows so
# the rows can later be flattened to one per anime.

all_ids = set(filtered_df['animeID'].unique()) # 5.6K anime IDs
print ("{} unique anime".format(len(all_ids)))

# Group once instead of re-scanning the frame for every ID. The previous version also
# indexed `filtered_df` with a boolean mask built from `truncated_df` (a different,
# longer frame), which forces pandas to reindex the mask on every iteration and emits
# a "Boolean Series key will be reindexed" UserWarning.
# sort=False keeps groups in first-appearance order; rows inside each group keep their
# original order, so each list matches what the per-ID filter produced.
# NOTE: the lists are not de-duplicated (a genre repeats once per studio row and vice
# versa), exactly as before.
anime_groups = filtered_df.groupby('animeID', sort=False)

# animeID -> list of genre values across that anime's rows
id_genre_mapping = anime_groups['genre'].apply(list).to_dict()

# animeID -> list of studio values across that anime's rows
id_studio_mapping = anime_groups['studio'].apply(list).to_dict()

5652 unique anime


  
  if sys.path[0] == '':


In [48]:
# Sanity check: show every row that belongs to animeID 1.
is_first_anime = filtered_df['animeID'].eq(1)
filtered_df.loc[is_first_anime]

Unnamed: 0,animeID,title_english,type,source,producers,genre,studio,episodes,premiered,rating,score,scored_by,rank,popularity,members,favorites,synopsis,background
0,1,Cowboy Bebop,TV,Original,Bandai Visual,Action,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever...",When Cowboy Bebop first aired in spring of 199...
1,1,Cowboy Bebop,TV,Original,Bandai Visual,Adventure,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever...",When Cowboy Bebop first aired in spring of 199...
2,1,Cowboy Bebop,TV,Original,Bandai Visual,Comedy,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever...",When Cowboy Bebop first aired in spring of 199...
3,1,Cowboy Bebop,TV,Original,Bandai Visual,Drama,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever...",When Cowboy Bebop first aired in spring of 199...
4,1,Cowboy Bebop,TV,Original,Bandai Visual,Sci-Fi,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever...",When Cowboy Bebop first aired in spring of 199...
5,1,Cowboy Bebop,TV,Original,Bandai Visual,Space,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever...",When Cowboy Bebop first aired in spring of 199...


In [49]:
# Get a distinct frame: keep only the first row seen for each animeID.
# The result is bound to a name so later cells can use it; previously the
# de-duplicated frame was only displayed and then discarded.
# Row order and the original index labels are preserved.
distinct_df = filtered_df.drop_duplicates(subset='animeID', keep='first')

# TODO: add aggregated genres as categorical data
distinct_df


Unnamed: 0,animeID,title_english,type,source,producers,genre,studio,episodes,premiered,rating,score,scored_by,rank,popularity,members,favorites,synopsis,background
0,1,Cowboy Bebop,TV,Original,Bandai Visual,Action,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever...",When Cowboy Bebop first aired in spring of 199...
6,5,Cowboy Bebop: The Movie,Movie,Original,Sunrise,Action,Bones,1.0,,R - 17+ (violence & profanity),8.41,120243,164,449,197791,776,"Another day, another bounty—such is the life o...",
16,6,Trigun,TV,Manga,Victor Entertainment,Action,Madhouse,26.0,Spring 1998,PG-13 - Teens 13 or older,8.30,212537,255,146,408548,10432,"Vash the Stampede is the man with a $$60,000,0...",The Japanese release by Victor Entertainment h...
22,7,Witch Hunter Robin,TV,Original,Bandai Visual,Action,Sunrise,26.0,Summer 2002,PG-13 - Teens 13 or older,7.33,32837,2371,1171,79397,537,Witches are individuals with special powers li...,
28,8,Beet the Vandel Buster,TV,Manga,TV Tokyo,Adventure,Toei Animation,52.0,Fall 2004,PG - Children,7.03,4894,3544,3704,11708,14,It is the dark century and the people are suff...,
36,16,Honey and Clover,TV,Manga,Genco,Comedy,J.C.Staff,24.0,Spring 2005,PG-13 - Teens 13 or older,8.12,57065,419,536,172274,3752,"Yuuta, Takumi, and Shinobu share a six-tatami ...",Hachimitsu to Clover was the first anime to ai...
65,19,Monster,TV,Manga,VAP,Drama,Madhouse,74.0,Spring 2004,R+ - Mild Nudity,8.69,131233,49,156,394387,19188,"Dr. Kenzo Tenma, an elite neurosurgeon recentl...",
86,20,Naruto,TV,Manga,TV Tokyo,Action,Studio Pierrot,220.0,Fall 2002,PG-13 - Teens 13 or older,7.90,716412,705,10,1091313,39356,"Moments prior to Naruto Uzumaki's birth, a hug...",Naruto received numerous awards during its air...
104,21,One Piece,TV,Manga,Fuji TV,Action,Toei Animation,,Fall 1999,PG-13 - Teens 13 or older,8.53,465454,94,36,803871,76869,"Gol D. Roger was known as the ""Pirate King,"" t...",Several anime-original arcs have been adapted ...
125,22,The Prince of Tennis,TV,Manga,Production I.G,Action,Trans Arts,178.0,Fall 2001,PG-13 - Teens 13 or older,8.00,50306,577,856,113662,2885,The world of tennis is harsh and highly compet...,"Since 2003, there has been a bi-annual musical..."


In [61]:
# Union of every genre value that appears in any anime's genre list.
all_genres = {
    genre
    for genre_list in id_genre_mapping.values()
    for genre in genre_list
}
