In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import our dependencies
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
import tensorflow as tf
from path import Path
import numpy as np

In [3]:
# import anime csv and create df
file_path = Path("./data/anime.csv")
anime_df = pd.read_csv(file_path)
anime_df.head()

Unnamed: 0,anime_id,title,type,score,scored_by,status,episodes,start_date,end_date,source,...,producers,licensors,synopsis,background,main_picture,url,trailer_url,title_english,title_japanese,title_synonyms
0,5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1871705,finished_airing,64.0,2009-04-05,2010-07-04,manga,...,"['Aniplex', 'Square Enix', 'Mainichi Broadcast...","['Funimation', 'Aniplex of America']",After a horrific alchemy experiment goes wrong...,,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...,https://www.youtube.com/watch?v=--IcmZkvL0Q,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,['Hagane no Renkinjutsushi: Fullmetal Alchemis...
1,11061,Hunter x Hunter (2011),tv,9.04,1509622,finished_airing,148.0,2011-10-02,2014-09-24,manga,...,"['VAP', 'Nippon Television Network', 'Shueisha']",['VIZ Media'],Hunters devote themselves to accomplishing haz...,,https://cdn.myanimelist.net/images/anime/1337/...,https://myanimelist.net/anime/11061/Hunter_x_H...,https://www.youtube.com/watch?v=D9iTQRB4XRk,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,['HxH (2011)']
2,38524,Shingeki no Kyojin Season 3 Part 2,tv,9.07,1329500,finished_airing,10.0,2019-04-29,2019-07-01,manga,...,"['Production I.G', 'Dentsu', 'Mainichi Broadca...",['Funimation'],Seeking to restore humanity's diminishing hope...,Shingeki no Kyojin adapts content from volumes...,https://cdn.myanimelist.net/images/anime/1517/...,https://myanimelist.net/anime/38524/Shingeki_n...,https://www.youtube.com/watch?v=hKHepjfj5Tw,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,[]
3,9253,Steins;Gate,tv,9.08,1252286,finished_airing,24.0,2011-04-06,2011-09-14,visual_novel,...,"['Frontier Works', 'Media Factory', 'Movic', '...",['Funimation'],Eccentric scientist Rintarou Okabe has a never...,Steins;Gate is based on 5pb. and Nitroplus' re...,https://cdn.myanimelist.net/images/anime/5/731...,https://myanimelist.net/anime/9253/Steins_Gate,https://www.youtube.com/watch?v=27OZc-ku6is,Steins;Gate,STEINS;GATE,[]
4,28851,Koe no Katachi,movie,8.95,1398608,finished_airing,1.0,2016-09-17,2016-09-17,manga,...,"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","['Eleven Arts', 'NYAV Post']","As a wild youth, elementary school student Sho...",Winner of the Excellence Award on the 20th Jap...,https://cdn.myanimelist.net/images/anime/1122/...,https://myanimelist.net/anime/28851/Koe_no_Kat...,https://www.youtube.com/watch?v=XBNWo25izJ8,A Silent Voice,聲の形,['The Shape of Voice']


In [4]:
# inspect columns and dtypes
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24165 entries, 0 to 24164
Data columns (total 39 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   anime_id          24165 non-null  int64  
 1   title             24165 non-null  object 
 2   type              24092 non-null  object 
 3   score             14272 non-null  float64
 4   scored_by         24165 non-null  int64  
 5   status            24165 non-null  object 
 6   episodes          23397 non-null  float64
 7   start_date        21391 non-null  object 
 8   end_date          19912 non-null  object 
 9   source            20404 non-null  object 
 10  members           24165 non-null  int64  
 11  favorites         24165 non-null  int64  
 12  episode_duration  23302 non-null  object 
 13  total_duration    23002 non-null  object 
 14  rating            23148 non-null  object 
 15  sfw               24165 non-null  bool   
 16  approved          24165 non-null  bool  

In [5]:
anime_df.shape

(24165, 39)

## Initial Thoughts

In [6]:
# supervised ML analysis: score by type, status, len (# episodes), source, rating/sfw, year, season
# broadcast_day, broadcast_time, genres/themes, demographics

# analysis: most popular genres/themes, most successful studios/producers/licensors

# keep: 
# anime_id, type, score, scored_by, status, episodes, start_date, source, rating, sfw, start_season,
#  broadcast_day, broadcast_time, genres, themes, demographics, studios, producers, licensors

# get_dummies: 
# type, status, source, rating, sfw, start_season, broadcast_day, genres, themes, demographics, studios,
# producers, licensors

# to_datetime: 
# start_date, broadcast_time

# drop: 
# title, end_date, members, favorites, episode_duration, total_duration, approved, created_at, 
# updated_at, start_year, real_start_date, real_end_date, synopsis, background, main_picture, url, trailer_url, 
# title_english, title_japanese, title_synonyms

## Preprocessing - Deal with Null Values

In [7]:
# set index to anime_id
anime_df.set_index("anime_id", inplace=True)
anime_df.head()

Unnamed: 0_level_0,title,type,score,scored_by,status,episodes,start_date,end_date,source,members,...,producers,licensors,synopsis,background,main_picture,url,trailer_url,title_english,title_japanese,title_synonyms
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1871705,finished_airing,64.0,2009-04-05,2010-07-04,manga,2932347,...,"['Aniplex', 'Square Enix', 'Mainichi Broadcast...","['Funimation', 'Aniplex of America']",After a horrific alchemy experiment goes wrong...,,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...,https://www.youtube.com/watch?v=--IcmZkvL0Q,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,['Hagane no Renkinjutsushi: Fullmetal Alchemis...
11061,Hunter x Hunter (2011),tv,9.04,1509622,finished_airing,148.0,2011-10-02,2014-09-24,manga,2418883,...,"['VAP', 'Nippon Television Network', 'Shueisha']",['VIZ Media'],Hunters devote themselves to accomplishing haz...,,https://cdn.myanimelist.net/images/anime/1337/...,https://myanimelist.net/anime/11061/Hunter_x_H...,https://www.youtube.com/watch?v=D9iTQRB4XRk,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,['HxH (2011)']
38524,Shingeki no Kyojin Season 3 Part 2,tv,9.07,1329500,finished_airing,10.0,2019-04-29,2019-07-01,manga,1881734,...,"['Production I.G', 'Dentsu', 'Mainichi Broadca...",['Funimation'],Seeking to restore humanity's diminishing hope...,Shingeki no Kyojin adapts content from volumes...,https://cdn.myanimelist.net/images/anime/1517/...,https://myanimelist.net/anime/38524/Shingeki_n...,https://www.youtube.com/watch?v=hKHepjfj5Tw,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,[]
9253,Steins;Gate,tv,9.08,1252286,finished_airing,24.0,2011-04-06,2011-09-14,visual_novel,2269121,...,"['Frontier Works', 'Media Factory', 'Movic', '...",['Funimation'],Eccentric scientist Rintarou Okabe has a never...,Steins;Gate is based on 5pb. and Nitroplus' re...,https://cdn.myanimelist.net/images/anime/5/731...,https://myanimelist.net/anime/9253/Steins_Gate,https://www.youtube.com/watch?v=27OZc-ku6is,Steins;Gate,STEINS;GATE,[]
28851,Koe no Katachi,movie,8.95,1398608,finished_airing,1.0,2016-09-17,2016-09-17,manga,2001335,...,"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","['Eleven Arts', 'NYAV Post']","As a wild youth, elementary school student Sho...",Winner of the Excellence Award on the 20th Jap...,https://cdn.myanimelist.net/images/anime/1122/...,https://myanimelist.net/anime/28851/Koe_no_Kat...,https://www.youtube.com/watch?v=XBNWo25izJ8,A Silent Voice,聲の形,['The Shape of Voice']


In [8]:
# drop useless columns
anime_df.drop(columns = ['end_date', 'members', 'favorites', 'episode_duration', 'total_duration', 'approved',
                        'created_at', 'updated_at', 'start_year', 'real_start_date', 'real_end_date', 'synopsis', 'background', 'main_picture', 'url', 'trailer_url',
                        'title_english', 'title_japanese', 'title_synonyms'], inplace=True)
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24165 entries, 5114 to 52477
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           24165 non-null  object 
 1   type            24092 non-null  object 
 2   score           14272 non-null  float64
 3   scored_by       24165 non-null  int64  
 4   status          24165 non-null  object 
 5   episodes        23397 non-null  float64
 6   start_date      21391 non-null  object 
 7   source          20404 non-null  object 
 8   rating          23148 non-null  object 
 9   sfw             24165 non-null  bool   
 10  start_season    17749 non-null  object 
 11  broadcast_day   3065 non-null   object 
 12  broadcast_time  2928 non-null   object 
 13  genres          24165 non-null  object 
 14  themes          24165 non-null  object 
 15  demographics    24165 non-null  object 
 16  studios         24165 non-null  object 
 17  producers       24165 non-nu

In [9]:
# find null values
for column in anime_df:
    print(f"{column} has {anime_df[column].isnull().sum()} null values")

title has 0 null values
type has 73 null values
score has 9893 null values
scored_by has 0 null values
status has 0 null values
episodes has 768 null values
start_date has 2774 null values
source has 3761 null values
rating has 1017 null values
sfw has 0 null values
start_season has 6416 null values
broadcast_day has 21100 null values
broadcast_time has 21237 null values
genres has 0 null values
themes has 0 null values
demographics has 0 null values
studios has 0 null values
producers has 0 null values
licensors has 0 null values


In [10]:
# drop null values in score column
anime_df.dropna(subset=['score'], inplace=True)
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14272 entries, 5114 to 30408
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           14272 non-null  object 
 1   type            14271 non-null  object 
 2   score           14272 non-null  float64
 3   scored_by       14272 non-null  int64  
 4   status          14272 non-null  object 
 5   episodes        14178 non-null  float64
 6   start_date      14261 non-null  object 
 7   source          12516 non-null  object 
 8   rating          14176 non-null  object 
 9   sfw             14272 non-null  bool   
 10  start_season    13561 non-null  object 
 11  broadcast_day   2742 non-null   object 
 12  broadcast_time  2672 non-null   object 
 13  genres          14272 non-null  object 
 14  themes          14272 non-null  object 
 15  demographics    14272 non-null  object 
 16  studios         14272 non-null  object 
 17  producers       14272 non-nu

In [11]:
# find null values
for column in anime_df:
    print(f"{column} has {anime_df[column].isnull().sum()} null values")

title has 0 null values
type has 1 null values
score has 0 null values
scored_by has 0 null values
status has 0 null values
episodes has 94 null values
start_date has 11 null values
source has 1756 null values
rating has 96 null values
sfw has 0 null values
start_season has 711 null values
broadcast_day has 11530 null values
broadcast_time has 11600 null values
genres has 0 null values
themes has 0 null values
demographics has 0 null values
studios has 0 null values
producers has 0 null values
licensors has 0 null values


In [12]:
# broadcast_day and broadcast_time have a ton of null values, which means they won't work for analysis - remove
# drop broadcast_day and broadcast_time columns and view null values again
anime_df.drop(columns = ['broadcast_day', 'broadcast_time'], inplace=True)
for column in anime_df:
    print(f"{column} has {anime_df[column].isnull().sum()} null values")

title has 0 null values
type has 1 null values
score has 0 null values
scored_by has 0 null values
status has 0 null values
episodes has 94 null values
start_date has 11 null values
source has 1756 null values
rating has 96 null values
sfw has 0 null values
start_season has 711 null values
genres has 0 null values
themes has 0 null values
demographics has 0 null values
studios has 0 null values
producers has 0 null values
licensors has 0 null values


In [13]:
# still have a fair number of null values under source, check total number of rows to see if they can be removed while still having a good number
anime_df.shape

(14272, 17)

In [14]:
# drop remaining null values and confirm removed
anime_df.dropna(inplace=True)
for column in anime_df:
    print(f"{column} has {anime_df[column].isnull().sum()} null values")

title has 0 null values
type has 0 null values
score has 0 null values
scored_by has 0 null values
status has 0 null values
episodes has 0 null values
start_date has 0 null values
source has 0 null values
rating has 0 null values
sfw has 0 null values
start_season has 0 null values
genres has 0 null values
themes has 0 null values
demographics has 0 null values
studios has 0 null values
producers has 0 null values
licensors has 0 null values


### Conclusion to Section

In [15]:
# supervised ML analysis: score by type, status, len (# episodes), source, rating/sfw, start_date, season, genres/themes, demographics

# analysis: most popular genres/themes, most successful studios/producers/licensors

# keep: 
# anime_id, type, score, scored_by, status, episodes, start_date, source, rating, sfw, start_season,
# genres, themes, demographics, studios, producers, licensors

# get_dummies: 
# type, status, source, rating, sfw, start_season, broadcast_day, genres, themes, demographics, studios,
# producers, licensors

# to_datetime: 
# start_date

# drop: 
# title, end_date, members, favorites, episode_duration, total_duration, approved, created_at, 
# updated_at, start_year, real_start_date, real_end_date, synopsis, background, main_picture, url, trailer_url, 
# title_english, title_japanese, title_synonyms, broadcast_day, broadcast_time

## Preprocessing - Deal with Categories

### Type

In [16]:
# type
anime_df['type'].value_counts()

tv         3798
ova        2658
special    1518
movie      1511
ona        1387
music       951
Name: type, dtype: int64

#### Remove music videos

In [17]:
anime_df['type'] != 'music'

anime_id
5114      True
11061     True
38524     True
9253      True
28851     True
         ...  
38066     True
35514    False
36113    False
37336     True
40361     True
Name: type, Length: 11823, dtype: bool

In [18]:
types_anime_df = anime_df.loc[(anime_df['type'] != 'music')]
types_anime_df['type'] != 'music'

anime_id
5114     True
11061    True
38524    True
9253     True
28851    True
         ... 
42234    True
38771    True
38066    True
37336    True
40361    True
Name: type, Length: 10872, dtype: bool

In [19]:
types_anime_df.shape

(10872, 17)

### Source

In [20]:
# source
types_anime_df['source'].value_counts()

manga           3763
original        2892
visual_novel    1059
light_novel      804
game             756
novel            468
other            356
4_koma_manga     263
web_manga        245
book              73
card_game         59
mixed_media       48
picture_book      42
music             26
web_novel         13
radio              5
Name: source, dtype: int64

#### Remove sources with counts under 100

In [21]:
# code from stackoverflow https://stackoverflow.com/questions/49735683/python-removing-rows-on-count-condition
source_anime_df = types_anime_df[types_anime_df.groupby('source').
                                source.transform('count')>100]
source_anime_df['source'].value_counts()

manga           3763
original        2892
visual_novel    1059
light_novel      804
game             756
novel            468
other            356
4_koma_manga     263
web_manga        245
Name: source, dtype: int64

In [22]:
source_anime_df.head()

Unnamed: 0_level_0,title,type,score,scored_by,status,episodes,start_date,source,rating,sfw,start_season,genres,themes,demographics,studios,producers,licensors
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1871705,finished_airing,64.0,2009-04-05,manga,r,True,spring,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Military'],['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...","['Funimation', 'Aniplex of America']"
11061,Hunter x Hunter (2011),tv,9.04,1509622,finished_airing,148.0,2011-10-02,manga,pg_13,True,fall,"['Action', 'Adventure', 'Fantasy']",[],['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",['VIZ Media']
38524,Shingeki no Kyojin Season 3 Part 2,tv,9.07,1329500,finished_airing,10.0,2019-04-29,manga,r,True,spring,"['Action', 'Drama']","['Gore', 'Military', 'Survival']",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",['Funimation']
9253,Steins;Gate,tv,9.08,1252286,finished_airing,24.0,2011-04-06,visual_novel,pg_13,True,spring,"['Drama', 'Sci-Fi', 'Suspense']","['Psychological', 'Time Travel']",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Movic', '...",['Funimation']
28851,Koe no Katachi,movie,8.95,1398608,finished_airing,1.0,2016-09-17,manga,pg_13,True,summer,['Drama'],['Romantic Subtext'],['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","['Eleven Arts', 'NYAV Post']"


### Status

In [23]:
source_anime_df['status'].value_counts()

finished_airing     10554
currently_airing       52
Name: status, dtype: int64

In [24]:
# most are finished airing, so this is not statistically relevant

In [28]:
source_anime_df.drop(columns = ['status'], inplace=True)
source_anime_df.head()

Unnamed: 0_level_0,title,type,score,scored_by,episodes,start_date,source,rating,sfw,start_season,genres,themes,demographics,studios,producers,licensors
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1871705,64.0,2009-04-05,manga,r,True,spring,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Military'],['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...","['Funimation', 'Aniplex of America']"
11061,Hunter x Hunter (2011),tv,9.04,1509622,148.0,2011-10-02,manga,pg_13,True,fall,"['Action', 'Adventure', 'Fantasy']",[],['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",['VIZ Media']
38524,Shingeki no Kyojin Season 3 Part 2,tv,9.07,1329500,10.0,2019-04-29,manga,r,True,spring,"['Action', 'Drama']","['Gore', 'Military', 'Survival']",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",['Funimation']
9253,Steins;Gate,tv,9.08,1252286,24.0,2011-04-06,visual_novel,pg_13,True,spring,"['Drama', 'Sci-Fi', 'Suspense']","['Psychological', 'Time Travel']",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Movic', '...",['Funimation']
28851,Koe no Katachi,movie,8.95,1398608,1.0,2016-09-17,manga,pg_13,True,summer,['Drama'],['Romantic Subtext'],['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","['Eleven Arts', 'NYAV Post']"


### SFW

In [29]:
source_anime_df['sfw'].value_counts()

True     9323
False    1283
Name: sfw, dtype: int64

### Themes, Genres, and other lists

In [30]:
# genres, themes, demographics, studios, producers, licensors are all lists

In [31]:
# Convert from strings into lists
# https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173

In [32]:
# Genres
source_anime_df['genres'] = source_anime_df['genres'].apply(eval)
source_anime_df.head()

Unnamed: 0_level_0,title,type,score,scored_by,episodes,start_date,source,rating,sfw,start_season,genres,themes,demographics,studios,producers,licensors
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1871705,64.0,2009-04-05,manga,r,True,spring,"[Action, Adventure, Drama, Fantasy]",['Military'],['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...","['Funimation', 'Aniplex of America']"
11061,Hunter x Hunter (2011),tv,9.04,1509622,148.0,2011-10-02,manga,pg_13,True,fall,"[Action, Adventure, Fantasy]",[],['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",['VIZ Media']
38524,Shingeki no Kyojin Season 3 Part 2,tv,9.07,1329500,10.0,2019-04-29,manga,r,True,spring,"[Action, Drama]","['Gore', 'Military', 'Survival']",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",['Funimation']
9253,Steins;Gate,tv,9.08,1252286,24.0,2011-04-06,visual_novel,pg_13,True,spring,"[Drama, Sci-Fi, Suspense]","['Psychological', 'Time Travel']",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Movic', '...",['Funimation']
28851,Koe no Katachi,movie,8.95,1398608,1.0,2016-09-17,manga,pg_13,True,summer,[Drama],['Romantic Subtext'],['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","['Eleven Arts', 'NYAV Post']"


In [33]:
# Themes
source_anime_df['themes'] = source_anime_df['themes'].apply(eval)
source_anime_df.head()

Unnamed: 0_level_0,title,type,score,scored_by,episodes,start_date,source,rating,sfw,start_season,genres,themes,demographics,studios,producers,licensors
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1871705,64.0,2009-04-05,manga,r,True,spring,"[Action, Adventure, Drama, Fantasy]",[Military],['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...","['Funimation', 'Aniplex of America']"
11061,Hunter x Hunter (2011),tv,9.04,1509622,148.0,2011-10-02,manga,pg_13,True,fall,"[Action, Adventure, Fantasy]",[],['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",['VIZ Media']
38524,Shingeki no Kyojin Season 3 Part 2,tv,9.07,1329500,10.0,2019-04-29,manga,r,True,spring,"[Action, Drama]","[Gore, Military, Survival]",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",['Funimation']
9253,Steins;Gate,tv,9.08,1252286,24.0,2011-04-06,visual_novel,pg_13,True,spring,"[Drama, Sci-Fi, Suspense]","[Psychological, Time Travel]",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Movic', '...",['Funimation']
28851,Koe no Katachi,movie,8.95,1398608,1.0,2016-09-17,manga,pg_13,True,summer,[Drama],[Romantic Subtext],['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","['Eleven Arts', 'NYAV Post']"


In [34]:
# Demographics
source_anime_df['demographics'] = source_anime_df['demographics'].apply(eval)
source_anime_df.head()

Unnamed: 0_level_0,title,type,score,scored_by,episodes,start_date,source,rating,sfw,start_season,genres,themes,demographics,studios,producers,licensors
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1871705,64.0,2009-04-05,manga,r,True,spring,"[Action, Adventure, Drama, Fantasy]",[Military],[Shounen],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...","['Funimation', 'Aniplex of America']"
11061,Hunter x Hunter (2011),tv,9.04,1509622,148.0,2011-10-02,manga,pg_13,True,fall,"[Action, Adventure, Fantasy]",[],[Shounen],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",['VIZ Media']
38524,Shingeki no Kyojin Season 3 Part 2,tv,9.07,1329500,10.0,2019-04-29,manga,r,True,spring,"[Action, Drama]","[Gore, Military, Survival]",[Shounen],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",['Funimation']
9253,Steins;Gate,tv,9.08,1252286,24.0,2011-04-06,visual_novel,pg_13,True,spring,"[Drama, Sci-Fi, Suspense]","[Psychological, Time Travel]",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Movic', '...",['Funimation']
28851,Koe no Katachi,movie,8.95,1398608,1.0,2016-09-17,manga,pg_13,True,summer,[Drama],[Romantic Subtext],[Shounen],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","['Eleven Arts', 'NYAV Post']"


In [35]:
# Studios
source_anime_df['studios'] = source_anime_df['studios'].apply(eval)
source_anime_df.head()

Unnamed: 0_level_0,title,type,score,scored_by,episodes,start_date,source,rating,sfw,start_season,genres,themes,demographics,studios,producers,licensors
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1871705,64.0,2009-04-05,manga,r,True,spring,"[Action, Adventure, Drama, Fantasy]",[Military],[Shounen],[Bones],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...","['Funimation', 'Aniplex of America']"
11061,Hunter x Hunter (2011),tv,9.04,1509622,148.0,2011-10-02,manga,pg_13,True,fall,"[Action, Adventure, Fantasy]",[],[Shounen],[Madhouse],"['VAP', 'Nippon Television Network', 'Shueisha']",['VIZ Media']
38524,Shingeki no Kyojin Season 3 Part 2,tv,9.07,1329500,10.0,2019-04-29,manga,r,True,spring,"[Action, Drama]","[Gore, Military, Survival]",[Shounen],[Wit Studio],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",['Funimation']
9253,Steins;Gate,tv,9.08,1252286,24.0,2011-04-06,visual_novel,pg_13,True,spring,"[Drama, Sci-Fi, Suspense]","[Psychological, Time Travel]",[],[White Fox],"['Frontier Works', 'Media Factory', 'Movic', '...",['Funimation']
28851,Koe no Katachi,movie,8.95,1398608,1.0,2016-09-17,manga,pg_13,True,summer,[Drama],[Romantic Subtext],[Shounen],[Kyoto Animation],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","['Eleven Arts', 'NYAV Post']"


In [36]:
# Producers
source_anime_df['producers'] = source_anime_df['producers'].apply(eval)
source_anime_df.head()

Unnamed: 0_level_0,title,type,score,scored_by,episodes,start_date,source,rating,sfw,start_season,genres,themes,demographics,studios,producers,licensors
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1871705,64.0,2009-04-05,manga,r,True,spring,"[Action, Adventure, Drama, Fantasy]",[Military],[Shounen],[Bones],"[Aniplex, Square Enix, Mainichi Broadcasting S...","['Funimation', 'Aniplex of America']"
11061,Hunter x Hunter (2011),tv,9.04,1509622,148.0,2011-10-02,manga,pg_13,True,fall,"[Action, Adventure, Fantasy]",[],[Shounen],[Madhouse],"[VAP, Nippon Television Network, Shueisha]",['VIZ Media']
38524,Shingeki no Kyojin Season 3 Part 2,tv,9.07,1329500,10.0,2019-04-29,manga,r,True,spring,"[Action, Drama]","[Gore, Military, Survival]",[Shounen],[Wit Studio],"[Production I.G, Dentsu, Mainichi Broadcasting...",['Funimation']
9253,Steins;Gate,tv,9.08,1252286,24.0,2011-04-06,visual_novel,pg_13,True,spring,"[Drama, Sci-Fi, Suspense]","[Psychological, Time Travel]",[],[White Fox],"[Frontier Works, Media Factory, Movic, AT-X, K...",['Funimation']
28851,Koe no Katachi,movie,8.95,1398608,1.0,2016-09-17,manga,pg_13,True,summer,[Drama],[Romantic Subtext],[Shounen],[Kyoto Animation],"[Shochiku, Pony Canyon, Kodansha, ABC Animatio...","['Eleven Arts', 'NYAV Post']"


In [37]:
# Licensors
source_anime_df['licensors'] = source_anime_df['licensors'].apply(eval)
source_anime_df.head()

Unnamed: 0_level_0,title,type,score,scored_by,episodes,start_date,source,rating,sfw,start_season,genres,themes,demographics,studios,producers,licensors
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1871705,64.0,2009-04-05,manga,r,True,spring,"[Action, Adventure, Drama, Fantasy]",[Military],[Shounen],[Bones],"[Aniplex, Square Enix, Mainichi Broadcasting S...","[Funimation, Aniplex of America]"
11061,Hunter x Hunter (2011),tv,9.04,1509622,148.0,2011-10-02,manga,pg_13,True,fall,"[Action, Adventure, Fantasy]",[],[Shounen],[Madhouse],"[VAP, Nippon Television Network, Shueisha]",[VIZ Media]
38524,Shingeki no Kyojin Season 3 Part 2,tv,9.07,1329500,10.0,2019-04-29,manga,r,True,spring,"[Action, Drama]","[Gore, Military, Survival]",[Shounen],[Wit Studio],"[Production I.G, Dentsu, Mainichi Broadcasting...",[Funimation]
9253,Steins;Gate,tv,9.08,1252286,24.0,2011-04-06,visual_novel,pg_13,True,spring,"[Drama, Sci-Fi, Suspense]","[Psychological, Time Travel]",[],[White Fox],"[Frontier Works, Media Factory, Movic, AT-X, K...",[Funimation]
28851,Koe no Katachi,movie,8.95,1398608,1.0,2016-09-17,manga,pg_13,True,summer,[Drama],[Romantic Subtext],[Shounen],[Kyoto Animation],"[Shochiku, Pony Canyon, Kodansha, ABC Animatio...","[Eleven Arts, NYAV Post]"


### Combine Genres and Themes into Keywords Column

In [38]:
source_anime_df['keywords'] = (source_anime_df['genres'] + source_anime_df['themes'])
source_anime_df.head()

Unnamed: 0_level_0,title,type,score,scored_by,episodes,start_date,source,rating,sfw,start_season,genres,themes,demographics,studios,producers,licensors,keywords
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1871705,64.0,2009-04-05,manga,r,True,spring,"[Action, Adventure, Drama, Fantasy]",[Military],[Shounen],[Bones],"[Aniplex, Square Enix, Mainichi Broadcasting S...","[Funimation, Aniplex of America]","[Action, Adventure, Drama, Fantasy, Military]"
11061,Hunter x Hunter (2011),tv,9.04,1509622,148.0,2011-10-02,manga,pg_13,True,fall,"[Action, Adventure, Fantasy]",[],[Shounen],[Madhouse],"[VAP, Nippon Television Network, Shueisha]",[VIZ Media],"[Action, Adventure, Fantasy]"
38524,Shingeki no Kyojin Season 3 Part 2,tv,9.07,1329500,10.0,2019-04-29,manga,r,True,spring,"[Action, Drama]","[Gore, Military, Survival]",[Shounen],[Wit Studio],"[Production I.G, Dentsu, Mainichi Broadcasting...",[Funimation],"[Action, Drama, Gore, Military, Survival]"
9253,Steins;Gate,tv,9.08,1252286,24.0,2011-04-06,visual_novel,pg_13,True,spring,"[Drama, Sci-Fi, Suspense]","[Psychological, Time Travel]",[],[White Fox],"[Frontier Works, Media Factory, Movic, AT-X, K...",[Funimation],"[Drama, Sci-Fi, Suspense, Psychological, Time ..."
28851,Koe no Katachi,movie,8.95,1398608,1.0,2016-09-17,manga,pg_13,True,summer,[Drama],[Romantic Subtext],[Shounen],[Kyoto Animation],"[Shochiku, Pony Canyon, Kodansha, ABC Animatio...","[Eleven Arts, NYAV Post]","[Drama, Romantic Subtext]"


In [39]:
keywords_anime_df = source_anime_df.drop(columns = ['genres', 'themes'])
keywords_anime_df.head()

Unnamed: 0_level_0,title,type,score,scored_by,episodes,start_date,source,rating,sfw,start_season,demographics,studios,producers,licensors,keywords
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1871705,64.0,2009-04-05,manga,r,True,spring,[Shounen],[Bones],"[Aniplex, Square Enix, Mainichi Broadcasting S...","[Funimation, Aniplex of America]","[Action, Adventure, Drama, Fantasy, Military]"
11061,Hunter x Hunter (2011),tv,9.04,1509622,148.0,2011-10-02,manga,pg_13,True,fall,[Shounen],[Madhouse],"[VAP, Nippon Television Network, Shueisha]",[VIZ Media],"[Action, Adventure, Fantasy]"
38524,Shingeki no Kyojin Season 3 Part 2,tv,9.07,1329500,10.0,2019-04-29,manga,r,True,spring,[Shounen],[Wit Studio],"[Production I.G, Dentsu, Mainichi Broadcasting...",[Funimation],"[Action, Drama, Gore, Military, Survival]"
9253,Steins;Gate,tv,9.08,1252286,24.0,2011-04-06,visual_novel,pg_13,True,spring,[],[White Fox],"[Frontier Works, Media Factory, Movic, AT-X, K...",[Funimation],"[Drama, Sci-Fi, Suspense, Psychological, Time ..."
28851,Koe no Katachi,movie,8.95,1398608,1.0,2016-09-17,manga,pg_13,True,summer,[Shounen],[Kyoto Animation],"[Shochiku, Pony Canyon, Kodansha, ABC Animatio...","[Eleven Arts, NYAV Post]","[Drama, Romantic Subtext]"


In [40]:
keywords_anime_df['keywords'].head()

anime_id
5114         [Action, Adventure, Drama, Fantasy, Military]
11061                         [Action, Adventure, Fantasy]
38524            [Action, Drama, Gore, Military, Survival]
9253     [Drama, Sci-Fi, Suspense, Psychological, Time ...
28851                            [Drama, Romantic Subtext]
Name: keywords, dtype: object

### Use to_datetime on start_date

In [41]:
keywords_anime_df['start_date'] = pd.to_datetime(keywords_anime_df['start_date'])
print(keywords_anime_df.dtypes)
keywords_anime_df.head()

title                   object
type                    object
score                  float64
scored_by                int64
episodes               float64
start_date      datetime64[ns]
source                  object
rating                  object
sfw                       bool
start_season            object
demographics            object
studios                 object
producers               object
licensors               object
keywords                object
dtype: object


Unnamed: 0_level_0,title,type,score,scored_by,episodes,start_date,source,rating,sfw,start_season,demographics,studios,producers,licensors,keywords
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1871705,64.0,2009-04-05,manga,r,True,spring,[Shounen],[Bones],"[Aniplex, Square Enix, Mainichi Broadcasting S...","[Funimation, Aniplex of America]","[Action, Adventure, Drama, Fantasy, Military]"
11061,Hunter x Hunter (2011),tv,9.04,1509622,148.0,2011-10-02,manga,pg_13,True,fall,[Shounen],[Madhouse],"[VAP, Nippon Television Network, Shueisha]",[VIZ Media],"[Action, Adventure, Fantasy]"
38524,Shingeki no Kyojin Season 3 Part 2,tv,9.07,1329500,10.0,2019-04-29,manga,r,True,spring,[Shounen],[Wit Studio],"[Production I.G, Dentsu, Mainichi Broadcasting...",[Funimation],"[Action, Drama, Gore, Military, Survival]"
9253,Steins;Gate,tv,9.08,1252286,24.0,2011-04-06,visual_novel,pg_13,True,spring,[],[White Fox],"[Frontier Works, Media Factory, Movic, AT-X, K...",[Funimation],"[Drama, Sci-Fi, Suspense, Psychological, Time ..."
28851,Koe no Katachi,movie,8.95,1398608,1.0,2016-09-17,manga,pg_13,True,summer,[Shounen],[Kyoto Animation],"[Shochiku, Pony Canyon, Kodansha, ABC Animatio...","[Eleven Arts, NYAV Post]","[Drama, Romantic Subtext]"


### Examine demographics further

In [42]:
keywords_anime_df['demographics'].value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[]                 7221
[Shounen]          1585
[Seinen]            728
[Shoujo]            551
[Kids]              380
[Josei]              93
[Kids, Shounen]      25
[Kids, Shoujo]       21
[Kids, Seinen]        1
[Josei, Shoujo]       1
Name: demographics, dtype: int64

In [43]:
# 66% do not have demographics attributed to them. Delete this column? Alternative is to pull anime data from Wikipedia and compare, but I would expect MAL to be better data than Wikipedia.
# There may be some additional data to pull from Wikipedia like directors and networks - is that relevant? 

## Clean CSVs - Anime Titles

In [44]:
# create df that holds anime_id and title
anime_titles_df = keywords_anime_df.copy()
anime_titles_df = pd.DataFrame(anime_titles_df['title'], index=keywords_anime_df.index)
anime_titles_df.head()

Unnamed: 0_level_0,title
anime_id,Unnamed: 1_level_1
5114,Fullmetal Alchemist: Brotherhood
11061,Hunter x Hunter (2011)
38524,Shingeki no Kyojin Season 3 Part 2
9253,Steins;Gate
28851,Koe no Katachi


In [45]:
# save anime_titles_df as csv
output_file_path = "./data/anime_titles.csv"
anime_titles_df.to_csv(output_file_path, index=True)

In [46]:
# drop anime_titles from df
clean_anime_df = keywords_anime_df.drop(columns = ["title"])
clean_anime_df.head()

Unnamed: 0_level_0,type,score,scored_by,episodes,start_date,source,rating,sfw,start_season,demographics,studios,producers,licensors,keywords
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5114,tv,9.13,1871705,64.0,2009-04-05,manga,r,True,spring,[Shounen],[Bones],"[Aniplex, Square Enix, Mainichi Broadcasting S...","[Funimation, Aniplex of America]","[Action, Adventure, Drama, Fantasy, Military]"
11061,tv,9.04,1509622,148.0,2011-10-02,manga,pg_13,True,fall,[Shounen],[Madhouse],"[VAP, Nippon Television Network, Shueisha]",[VIZ Media],"[Action, Adventure, Fantasy]"
38524,tv,9.07,1329500,10.0,2019-04-29,manga,r,True,spring,[Shounen],[Wit Studio],"[Production I.G, Dentsu, Mainichi Broadcasting...",[Funimation],"[Action, Drama, Gore, Military, Survival]"
9253,tv,9.08,1252286,24.0,2011-04-06,visual_novel,pg_13,True,spring,[],[White Fox],"[Frontier Works, Media Factory, Movic, AT-X, K...",[Funimation],"[Drama, Sci-Fi, Suspense, Psychological, Time ..."
28851,movie,8.95,1398608,1.0,2016-09-17,manga,pg_13,True,summer,[Shounen],[Kyoto Animation],"[Shochiku, Pony Canyon, Kodansha, ABC Animatio...","[Eleven Arts, NYAV Post]","[Drama, Romantic Subtext]"


In [47]:
# save clean_anime_df as csv
output_file_path = "./data/clean_anime.csv"
clean_anime_df.to_csv(output_file_path, index=True)

## Next Steps

In [48]:
# get_dummies: 
# type, status, source, rating, sfw, start_season, demographics, studios,
# producers, licensors, keywords

In [49]:
clean_anime_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10606 entries, 5114 to 40361
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   type          10606 non-null  object        
 1   score         10606 non-null  float64       
 2   scored_by     10606 non-null  int64         
 3   episodes      10606 non-null  float64       
 4   start_date    10606 non-null  datetime64[ns]
 5   source        10606 non-null  object        
 6   rating        10606 non-null  object        
 7   sfw           10606 non-null  bool          
 8   start_season  10606 non-null  object        
 9   demographics  10606 non-null  object        
 10  studios       10606 non-null  object        
 11  producers     10606 non-null  object        
 12  licensors     10606 non-null  object        
 13  keywords      10606 non-null  object        
dtypes: bool(1), datetime64[ns](1), float64(2), int64(1), object(9)
memory usage: 1.1+ M