In [1]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download(['punkt', 'stopwords', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger'])
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_columns', None)

[nltk_data] Downloading package punkt to /home/jakeli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jakeli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jakeli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jakeli/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jakeli/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Import Data

In [2]:
df = pd.read_csv('data/anime_data.csv', parse_dates=['start_date', 'created_at', 'updated_at', 'end_date'])
df.head()

Unnamed: 0,ranking,id,title,start_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,source,average_episode_duration,rating,studios,main_picture.medium,main_picture.large,alternative_titles.synonyms,alternative_titles.en,alternative_titles.ja,start_season.year,start_season.season,broadcast.day_of_the_week,broadcast.start_time,end_date
0,{'rank': 1},52034,"""Oshi no Ko""",2023-04-12,Sixteen-year-old Ai Hoshino is a talented and ...,9.32,1,1067,206347,47027,white,2022-06-09 13:01:38+00:00,2023-04-14 04:49:00+00:00,tv,currently_airing,"[{'id': 8, 'name': 'Drama'}, {'id': 72, 'name'...",11,manga,0,pg_13,"[{'id': 95, 'name': 'Doga Kobo'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['My Star'],[Oshi No Ko],【推しの子】,2023.0,spring,wednesday,23:00,
1,{'rank': 2},5114,Fullmetal Alchemist: Brotherhood,2009-04-05,After a horrific alchemy experiment goes wrong...,9.11,2,3,3129768,1989539,white,2008-08-21 03:35:22+00:00,2023-04-02 18:07:03+00:00,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",64,manga,1460,r,"[{'id': 4, 'name': 'Bones'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Hagane no Renkinjutsushi: Fullmetal Alchemis...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009.0,spring,sunday,17:00,2010-07-04
2,{'rank': 3},51535,Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,In the wake of Eren Yeager's cataclysmic actio...,9.1,3,535,387230,125358,white,2022-04-03 15:34:50+00:00,2023-04-12 14:02:06+00:00,special,currently_airing,"[{'id': 1, 'name': 'Action'}, {'id': 8, 'name'...",2,manga,3690,r,"[{'id': 569, 'name': 'MAPPA'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Shingeki no Kyojin: The Final Season Part 3'...,Attack on Titan: Final Season - The Final Chap...,進撃の巨人 The Final Season完結編,2023.0,winter,,,2023
3,{'rank': 4},9253,Steins;Gate,2011-04-06,Eccentric scientist Rintarou Okabe has a never...,9.08,4,13,2407610,1319395,white,2010-07-26 09:23:40+00:00,2023-04-02 18:08:42+00:00,tv,finished_airing,"[{'id': 8, 'name': 'Drama'}, {'id': 40, 'name'...",24,visual_novel,1460,pg_13,"[{'id': 314, 'name': 'White Fox'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,[],Steins;Gate,STEINS;GATE,2011.0,spring,wednesday,02:05,2011-09-14
4,{'rank': 5},41467,Bleach: Sennen Kessen-hen,2022-10-11,Substitute Soul Reaper Ichigo Kurosaki spends ...,9.08,5,494,417393,194063,white,2020-03-18 09:10:15+00:00,2023-04-02 18:07:55+00:00,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",13,manga,1471,r,"[{'id': 1, 'name': 'Pierrot'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Bleach: Thousand-Year Blood War Arc'],Bleach: Thousand-Year Blood War,BLEACH 千年血戦篇,2022.0,fall,tuesday,00:00,2022-12-27


In [3]:
df.shape

(20000, 31)

# Data Cleaning and EDA

In [4]:
# drop the ranking column since we got rank already
df.drop('ranking', axis=1, inplace=True)

df.head(2)

Unnamed: 0,id,title,start_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,source,average_episode_duration,rating,studios,main_picture.medium,main_picture.large,alternative_titles.synonyms,alternative_titles.en,alternative_titles.ja,start_season.year,start_season.season,broadcast.day_of_the_week,broadcast.start_time,end_date
0,52034,"""Oshi no Ko""",2023-04-12,Sixteen-year-old Ai Hoshino is a talented and ...,9.32,1,1067,206347,47027,white,2022-06-09 13:01:38+00:00,2023-04-14 04:49:00+00:00,tv,currently_airing,"[{'id': 8, 'name': 'Drama'}, {'id': 72, 'name'...",11,manga,0,pg_13,"[{'id': 95, 'name': 'Doga Kobo'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['My Star'],[Oshi No Ko],【推しの子】,2023.0,spring,wednesday,23:00,
1,5114,Fullmetal Alchemist: Brotherhood,2009-04-05,After a horrific alchemy experiment goes wrong...,9.11,2,3,3129768,1989539,white,2008-08-21 03:35:22+00:00,2023-04-02 18:07:03+00:00,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",64,manga,1460,r,"[{'id': 4, 'name': 'Bones'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Hagane no Renkinjutsushi: Fullmetal Alchemis...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009.0,spring,sunday,17:00,2010-07-04


In [5]:
# remove brackets and quotation marks from titles
df['title'] = df['title'].apply(lambda x: re.sub('[\[\]\'"]', '', x))
df['alternative_titles.synonyms'] = df['alternative_titles.synonyms'].apply(lambda x: re.sub('[\[\]\'"]', '', x))
df['alternative_titles.en'] = df['alternative_titles.en'].apply(lambda x: re.sub('[\[\]\'"]', '', str(x)))
df['alternative_titles.ja'] = df['alternative_titles.ja'].apply(lambda x: re.sub('[\[\]\'"]', '', str(x)))

df.head(3)

Unnamed: 0,id,title,start_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,source,average_episode_duration,rating,studios,main_picture.medium,main_picture.large,alternative_titles.synonyms,alternative_titles.en,alternative_titles.ja,start_season.year,start_season.season,broadcast.day_of_the_week,broadcast.start_time,end_date
0,52034,Oshi no Ko,2023-04-12,Sixteen-year-old Ai Hoshino is a talented and ...,9.32,1,1067,206347,47027,white,2022-06-09 13:01:38+00:00,2023-04-14 04:49:00+00:00,tv,currently_airing,"[{'id': 8, 'name': 'Drama'}, {'id': 72, 'name'...",11,manga,0,pg_13,"[{'id': 95, 'name': 'Doga Kobo'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,My Star,Oshi No Ko,【推しの子】,2023.0,spring,wednesday,23:00,
1,5114,Fullmetal Alchemist: Brotherhood,2009-04-05,After a horrific alchemy experiment goes wrong...,9.11,2,3,3129768,1989539,white,2008-08-21 03:35:22+00:00,2023-04-02 18:07:03+00:00,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",64,manga,1460,r,"[{'id': 4, 'name': 'Bones'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,"Hagane no Renkinjutsushi: Fullmetal Alchemist,...",Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009.0,spring,sunday,17:00,2010-07-04
2,51535,Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,In the wake of Eren Yeager's cataclysmic actio...,9.1,3,535,387230,125358,white,2022-04-03 15:34:50+00:00,2023-04-12 14:02:06+00:00,special,currently_airing,"[{'id': 1, 'name': 'Action'}, {'id': 8, 'name'...",2,manga,3690,r,"[{'id': 569, 'name': 'MAPPA'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,"Shingeki no Kyojin: The Final Season Part 3, S...",Attack on Titan: Final Season - The Final Chap...,進撃の巨人 The Final Season完結編,2023.0,winter,,,2023


In [6]:
# get rid of unnecessary characters in the genres and studios column
df['genres'] = df['genres'].apply(lambda x: re.sub('[\[\]{}\'0-9:,]', '', str(x)))
df['genres'] = df['genres'].apply(lambda x: str(x).replace('id', '').replace('name', ''))

df['studios'] = df['studios'].apply(lambda x: re.sub('[\[\]{}\'0-9:,]', '', str(x)))
df['studios'] = df['studios'].apply(lambda x: str(x).replace('id', '').replace('name', ''))

df.head(3)

Unnamed: 0,id,title,start_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,source,average_episode_duration,rating,studios,main_picture.medium,main_picture.large,alternative_titles.synonyms,alternative_titles.en,alternative_titles.ja,start_season.year,start_season.season,broadcast.day_of_the_week,broadcast.start_time,end_date
0,52034,Oshi no Ko,2023-04-12,Sixteen-year-old Ai Hoshino is a talented and ...,9.32,1,1067,206347,47027,white,2022-06-09 13:01:38+00:00,2023-04-14 04:49:00+00:00,tv,currently_airing,Drama Reincarnation Seinen Showbiz...,11,manga,0,pg_13,Doga Kobo,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,My Star,Oshi No Ko,【推しの子】,2023.0,spring,wednesday,23:00,
1,5114,Fullmetal Alchemist: Brotherhood,2009-04-05,After a horrific alchemy experiment goes wrong...,9.11,2,3,3129768,1989539,white,2008-08-21 03:35:22+00:00,2023-04-02 18:07:03+00:00,tv,finished_airing,Action Adventure Drama Fantasy ...,64,manga,1460,r,Bones,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,"Hagane no Renkinjutsushi: Fullmetal Alchemist,...",Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009.0,spring,sunday,17:00,2010-07-04
2,51535,Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,In the wake of Eren Yeager's cataclysmic actio...,9.1,3,535,387230,125358,white,2022-04-03 15:34:50+00:00,2023-04-12 14:02:06+00:00,special,currently_airing,Action Drama Gore Military Shou...,2,manga,3690,r,MAPPA,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,"Shingeki no Kyojin: The Final Season Part 3, S...",Attack on Titan: Final Season - The Final Chap...,進撃の巨人 The Final Season完結編,2023.0,winter,,,2023


In [7]:
df.dtypes

id                                           int64
title                                       object
start_date                                  object
synopsis                                    object
mean                                       float64
rank                                         int64
popularity                                   int64
num_list_users                               int64
num_scoring_users                            int64
nsfw                                        object
created_at                     datetime64[ns, UTC]
updated_at                     datetime64[ns, UTC]
media_type                                  object
status                                      object
genres                                      object
num_episodes                                 int64
source                                      object
average_episode_duration                     int64
rating                                      object
studios                        

In [8]:
# convert columns to the best possible dtypes using convert_dtypes function
df = df.convert_dtypes()
df.dtypes

id                                           Int64
title                               string[python]
start_date                          string[python]
synopsis                            string[python]
mean                                       Float64
rank                                         Int64
popularity                                   Int64
num_list_users                               Int64
num_scoring_users                            Int64
nsfw                                string[python]
created_at                     datetime64[ns, UTC]
updated_at                     datetime64[ns, UTC]
media_type                          string[python]
status                              string[python]
genres                              string[python]
num_episodes                                 Int64
source                              string[python]
average_episode_duration                     Int64
rating                              string[python]
studios                        

In [9]:
df.head(3)

Unnamed: 0,id,title,start_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,source,average_episode_duration,rating,studios,main_picture.medium,main_picture.large,alternative_titles.synonyms,alternative_titles.en,alternative_titles.ja,start_season.year,start_season.season,broadcast.day_of_the_week,broadcast.start_time,end_date
0,52034,Oshi no Ko,2023-04-12,Sixteen-year-old Ai Hoshino is a talented and ...,9.32,1,1067,206347,47027,white,2022-06-09 13:01:38+00:00,2023-04-14 04:49:00+00:00,tv,currently_airing,Drama Reincarnation Seinen Showbiz...,11,manga,0,pg_13,Doga Kobo,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,My Star,Oshi No Ko,【推しの子】,2023,spring,wednesday,23:00,
1,5114,Fullmetal Alchemist: Brotherhood,2009-04-05,After a horrific alchemy experiment goes wrong...,9.11,2,3,3129768,1989539,white,2008-08-21 03:35:22+00:00,2023-04-02 18:07:03+00:00,tv,finished_airing,Action Adventure Drama Fantasy ...,64,manga,1460,r,Bones,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,"Hagane no Renkinjutsushi: Fullmetal Alchemist,...",Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009,spring,sunday,17:00,2010-07-04
2,51535,Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,In the wake of Eren Yeager's cataclysmic actio...,9.1,3,535,387230,125358,white,2022-04-03 15:34:50+00:00,2023-04-12 14:02:06+00:00,special,currently_airing,Action Drama Gore Military Shou...,2,manga,3690,r,MAPPA,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,"Shingeki no Kyojin: The Final Season Part 3, S...",Attack on Titan: Final Season - The Final Chap...,進撃の巨人 The Final Season完結編,2023,winter,,,2023


In [10]:
# check duplicates
df.duplicated().sum()

0

In [11]:
# check nulls
df.isnull().sum()

id                                 0
title                              0
start_date                       466
synopsis                        2897
mean                            6045
rank                               0
popularity                         0
num_list_users                     0
num_scoring_users                  0
nsfw                               0
created_at                         0
updated_at                         0
media_type                         0
status                             0
genres                             0
num_episodes                       0
source                          3055
average_episode_duration           0
rating                           290
studios                            0
main_picture.medium               98
main_picture.large                98
alternative_titles.synonyms        0
alternative_titles.en              0
alternative_titles.ja              0
start_season.year               3169
start_season.season             3169
b

In [12]:
df.shape

(20000, 30)

In [13]:
# drop rows where the synopsis/plot is null because it's an important feature for content-based-filtering
df.dropna(subset=['synopsis'], inplace=True)
# drop the broadcast.day_of_the_week and broadcast.start_time columns since a lot of data are missing
df.drop(['broadcast.day_of_the_week', 'broadcast.start_time'], axis=1, inplace=True)

In [14]:
df.shape

(17103, 28)

In [15]:
# check the range of year
print('Minimum year:', df['start_season.year'].min())
print('Maximum year:', df['start_season.year'].max())

Minimum year: 1917
Maximum year: 2023


In [16]:
# filter out anime that aired before 2000 because I don't want to recommend animes that's too old.
df = df[df['start_season.year'] >= 2000]
df.shape

(12338, 28)

In [17]:
df.index

Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
       ...
       19945, 19946, 19951, 19962, 19963, 19964, 19965, 19966, 19971, 19975],
      dtype='int64', length=12338)

In [18]:
# fix index
df.index = pd.RangeIndex(start=0, stop=0+len(df), step=1)
df.index

RangeIndex(start=0, stop=12338, step=1)

In [19]:
df.columns

Index(['id', 'title', 'start_date', 'synopsis', 'mean', 'rank', 'popularity',
       'num_list_users', 'num_scoring_users', 'nsfw', 'created_at',
       'updated_at', 'media_type', 'status', 'genres', 'num_episodes',
       'source', 'average_episode_duration', 'rating', 'studios',
       'main_picture.medium', 'main_picture.large',
       'alternative_titles.synonyms', 'alternative_titles.en',
       'alternative_titles.ja', 'start_season.year', 'start_season.season',
       'end_date'],
      dtype='object')

In [20]:
# rename columns
df = df.rename(columns={'main_picture.medium':'medium_picture_url', 'main_picture.large':'large_picture_url',
                        'alternative_titles.synonyms':'alternative_titles','alternative_titles.en':'english_title', 
                        'alternative_titles.ja':'japanese_title', 'start_season.year':'year', 'start_season.season':'season'})

df.head(3)

Unnamed: 0,id,title,start_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,source,average_episode_duration,rating,studios,medium_picture_url,large_picture_url,alternative_titles,english_title,japanese_title,year,season,end_date
0,52034,Oshi no Ko,2023-04-12,Sixteen-year-old Ai Hoshino is a talented and ...,9.32,1,1067,206347,47027,white,2022-06-09 13:01:38+00:00,2023-04-14 04:49:00+00:00,tv,currently_airing,Drama Reincarnation Seinen Showbiz...,11,manga,0,pg_13,Doga Kobo,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,My Star,Oshi No Ko,【推しの子】,2023,spring,
1,5114,Fullmetal Alchemist: Brotherhood,2009-04-05,After a horrific alchemy experiment goes wrong...,9.11,2,3,3129768,1989539,white,2008-08-21 03:35:22+00:00,2023-04-02 18:07:03+00:00,tv,finished_airing,Action Adventure Drama Fantasy ...,64,manga,1460,r,Bones,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,"Hagane no Renkinjutsushi: Fullmetal Alchemist,...",Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009,spring,2010-07-04
2,51535,Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,In the wake of Eren Yeager's cataclysmic actio...,9.1,3,535,387230,125358,white,2022-04-03 15:34:50+00:00,2023-04-12 14:02:06+00:00,special,currently_airing,Action Drama Gore Military Shou...,2,manga,3690,r,MAPPA,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,"Shingeki no Kyojin: The Final Season Part 3, S...",Attack on Titan: Final Season - The Final Chap...,進撃の巨人 The Final Season完結編,2023,winter,2023


# Text Preprocessing

In [21]:
df.columns

Index(['id', 'title', 'start_date', 'synopsis', 'mean', 'rank', 'popularity',
       'num_list_users', 'num_scoring_users', 'nsfw', 'created_at',
       'updated_at', 'media_type', 'status', 'genres', 'num_episodes',
       'source', 'average_episode_duration', 'rating', 'studios',
       'medium_picture_url', 'large_picture_url', 'alternative_titles',
       'english_title', 'japanese_title', 'year', 'season', 'end_date'],
      dtype='object')

In [22]:
# select and combine features that I think it's going to be useful for the recommendation engine
features = ['synopsis', 'genres']

df['summary'] = df[features].apply(lambda x: ', '.join(x.astype(str)), axis=1)

print(df['summary'][0])

Sixteen-year-old Ai Hoshino is a talented and beautiful idol who is adored by her fans. She is the personification of a pure, young maiden. But all that glitters is not gold.

Gorou Amemiya is a countryside gynecologist and a big fan of Ai. So when the pregnant idol shows up at his hospital, he is beyond bewildered. Gorou promises her a safe delivery. Little does he know, an encounter with a mysterious figure would result in his untimely death—or so he thought.

Opening his eyes in the lap of his beloved idol, Gorou finds that he has been reborn as Aquamarine Hoshino—Ai's newborn son! With his world turned upside down, Gorou soon learns that the world of showbiz is paved with thorns, where talent does not always beget success. Will he manage to protect Ai's smile that he loves so much with the help of an eccentric and unexpected ally? 

[Written by MAL Rewrite],    Drama    Reincarnation    Seinen    Showbiz    Supernatural


In [23]:
# apply lower case and tokenize
df['tokens'] = df['summary'].apply(str.lower).apply(word_tokenize)

print(df['tokens'][0])

['sixteen-year-old', 'ai', 'hoshino', 'is', 'a', 'talented', 'and', 'beautiful', 'idol', 'who', 'is', 'adored', 'by', 'her', 'fans', '.', 'she', 'is', 'the', 'personification', 'of', 'a', 'pure', ',', 'young', 'maiden', '.', 'but', 'all', 'that', 'glitters', 'is', 'not', 'gold', '.', 'gorou', 'amemiya', 'is', 'a', 'countryside', 'gynecologist', 'and', 'a', 'big', 'fan', 'of', 'ai', '.', 'so', 'when', 'the', 'pregnant', 'idol', 'shows', 'up', 'at', 'his', 'hospital', ',', 'he', 'is', 'beyond', 'bewildered', '.', 'gorou', 'promises', 'her', 'a', 'safe', 'delivery', '.', 'little', 'does', 'he', 'know', ',', 'an', 'encounter', 'with', 'a', 'mysterious', 'figure', 'would', 'result', 'in', 'his', 'untimely', 'death—or', 'so', 'he', 'thought', '.', 'opening', 'his', 'eyes', 'in', 'the', 'lap', 'of', 'his', 'beloved', 'idol', ',', 'gorou', 'finds', 'that', 'he', 'has', 'been', 'reborn', 'as', 'aquamarine', 'hoshino—ai', "'s", 'newborn', 'son', '!', 'with', 'his', 'world', 'turned', 'upside', '

In [24]:
# remove stop words and keep letters only
stop_words = stopwords.words('english')
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in stop_words and word.isalpha()])

print(df['tokens'][0])

['ai', 'hoshino', 'talented', 'beautiful', 'idol', 'adored', 'fans', 'personification', 'pure', 'young', 'maiden', 'glitters', 'gold', 'gorou', 'amemiya', 'countryside', 'gynecologist', 'big', 'fan', 'ai', 'pregnant', 'idol', 'shows', 'hospital', 'beyond', 'bewildered', 'gorou', 'promises', 'safe', 'delivery', 'little', 'know', 'encounter', 'mysterious', 'figure', 'would', 'result', 'untimely', 'thought', 'opening', 'eyes', 'lap', 'beloved', 'idol', 'gorou', 'finds', 'reborn', 'aquamarine', 'newborn', 'son', 'world', 'turned', 'upside', 'gorou', 'soon', 'learns', 'world', 'showbiz', 'paved', 'thorns', 'talent', 'always', 'beget', 'success', 'manage', 'protect', 'ai', 'smile', 'loves', 'much', 'help', 'eccentric', 'unexpected', 'ally', 'written', 'mal', 'rewrite', 'drama', 'reincarnation', 'seinen', 'showbiz', 'supernatural']


In [25]:
#  apply tag to each token
df['tokens'] = df['tokens'].apply(lambda x: nltk.pos_tag(x))
print(df['tokens'][0])

[('ai', 'NN'), ('hoshino', 'NN'), ('talented', 'VBD'), ('beautiful', 'JJ'), ('idol', 'NN'), ('adored', 'VBD'), ('fans', 'NNS'), ('personification', 'NN'), ('pure', 'NN'), ('young', 'JJ'), ('maiden', 'NN'), ('glitters', 'NNS'), ('gold', 'VBP'), ('gorou', 'JJ'), ('amemiya', 'JJ'), ('countryside', 'NN'), ('gynecologist', 'NN'), ('big', 'JJ'), ('fan', 'NN'), ('ai', 'NN'), ('pregnant', 'JJ'), ('idol', 'NN'), ('shows', 'NNS'), ('hospital', 'VBP'), ('beyond', 'IN'), ('bewildered', 'VBN'), ('gorou', 'NN'), ('promises', 'NNS'), ('safe', 'JJ'), ('delivery', 'NN'), ('little', 'RB'), ('know', 'JJ'), ('encounter', 'RB'), ('mysterious', 'JJ'), ('figure', 'NN'), ('would', 'MD'), ('result', 'VB'), ('untimely', 'RB'), ('thought', 'VBN'), ('opening', 'VBG'), ('eyes', 'NNS'), ('lap', 'RB'), ('beloved', 'VBD'), ('idol', 'JJ'), ('gorou', 'NN'), ('finds', 'VBZ'), ('reborn', 'JJ'), ('aquamarine', 'JJ'), ('newborn', 'VBN'), ('son', 'NN'), ('world', 'NN'), ('turned', 'VBD'), ('upside', 'JJ'), ('gorou', 'NN'), 

In [26]:
# convert tags to the tags that WordNetLemmatizer uses
def convert_pos(pos):
    if pos.startswith('V'):
        return wordnet.VERB
    elif pos.startswith('J'):
        return wordnet.ADJ
    elif pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# lemmatize words with pos-tags
df['cleaned_tokens'] = df['tokens'].apply(lambda x: [WordNetLemmatizer().lemmatize(w[0], pos=convert_pos(w[1])) for w in x])
print(df['cleaned_tokens'][0])

['ai', 'hoshino', 'talented', 'beautiful', 'idol', 'adore', 'fan', 'personification', 'pure', 'young', 'maiden', 'glitter', 'gold', 'gorou', 'amemiya', 'countryside', 'gynecologist', 'big', 'fan', 'ai', 'pregnant', 'idol', 'show', 'hospital', 'beyond', 'bewilder', 'gorou', 'promise', 'safe', 'delivery', 'little', 'know', 'encounter', 'mysterious', 'figure', 'would', 'result', 'untimely', 'think', 'open', 'eye', 'lap', 'beloved', 'idol', 'gorou', 'find', 'reborn', 'aquamarine', 'newborn', 'son', 'world', 'turn', 'upside', 'gorou', 'soon', 'learn', 'world', 'showbiz', 'pave', 'thorn', 'talent', 'always', 'beget', 'success', 'manage', 'protect', 'ai', 'smile', 'love', 'much', 'help', 'eccentric', 'unexpected', 'ally', 'write', 'mal', 'rewrite', 'drama', 'reincarnation', 'seinen', 'showbiz', 'supernatural']


In [27]:
# combine all tokens into one string
df['cleaned_string'] = df['cleaned_tokens'].apply(lambda x: ' '.join(x))
print(df['cleaned_string'][0])

ai hoshino talented beautiful idol adore fan personification pure young maiden glitter gold gorou amemiya countryside gynecologist big fan ai pregnant idol show hospital beyond bewilder gorou promise safe delivery little know encounter mysterious figure would result untimely think open eye lap beloved idol gorou find reborn aquamarine newborn son world turn upside gorou soon learn world showbiz pave thorn talent always beget success manage protect ai smile love much help eccentric unexpected ally write mal rewrite drama reincarnation seinen showbiz supernatural


In [47]:
df.columns.to_list()

['id',
 'title',
 'start_date',
 'synopsis',
 'mean',
 'rank',
 'popularity',
 'num_list_users',
 'num_scoring_users',
 'nsfw',
 'created_at',
 'updated_at',
 'media_type',
 'status',
 'genres',
 'num_episodes',
 'source',
 'average_episode_duration',
 'rating',
 'studios',
 'medium_picture_url',
 'large_picture_url',
 'alternative_titles',
 'english_title',
 'japanese_title',
 'year',
 'season',
 'end_date',
 'summary',
 'tokens',
 'cleaned_tokens',
 'cleaned_string']

In [48]:
# export the cleaned data
df = df[['id', 'title', 'start_date', 'synopsis', 'mean', 'nsfw', 'created_at', 'updated_at', 'media_type', 'status', 'genres',
         'num_episodes', 'source', 'average_episode_duration', 'rating', 'studios', 'medium_picture_url', 'alternative_titles',
        'year', 'season', 'end_date', 'cleaned_string']]
# df.to_csv('data/cleaned_anime_data.csv', index=False)

# Generate Vectors using TF-IDF

In [49]:
cleaned_df = pd.read_csv('data/cleaned_anime_data.csv')
cleaned_df.head()

Unnamed: 0,id,title,start_date,synopsis,mean,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,source,average_episode_duration,rating,studios,medium_picture_url,alternative_titles,year,season,end_date,cleaned_string
0,52034,Oshi no Ko,2023-04-12,Sixteen-year-old Ai Hoshino is a talented and ...,9.32,white,2022-06-09 13:01:38+00:00,2023-04-14 04:49:00+00:00,tv,currently_airing,Drama Reincarnation Seinen Showbiz...,11,manga,0,pg_13,Doga Kobo,https://api-cdn.myanimelist.net/images/anime/1...,My Star,2023,spring,,ai hoshino talented beautiful idol adore fan p...
1,5114,Fullmetal Alchemist: Brotherhood,2009-04-05,After a horrific alchemy experiment goes wrong...,9.11,white,2008-08-21 03:35:22+00:00,2023-04-02 18:07:03+00:00,tv,finished_airing,Action Adventure Drama Fantasy ...,64,manga,1460,r,Bones,https://api-cdn.myanimelist.net/images/anime/1...,"Hagane no Renkinjutsushi: Fullmetal Alchemist,...",2009,spring,2010-07-04,horrific alchemy experiment go wrong elric hou...
2,51535,Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,In the wake of Eren Yeager's cataclysmic actio...,9.1,white,2022-04-03 15:34:50+00:00,2023-04-12 14:02:06+00:00,special,currently_airing,Action Drama Gore Military Shou...,2,manga,3690,r,MAPPA,https://api-cdn.myanimelist.net/images/anime/1...,"Shingeki no Kyojin: The Final Season Part 3, S...",2023,winter,2023,wake eren yeager cataclysmic action friend for...
3,9253,Steins;Gate,2011-04-06,Eccentric scientist Rintarou Okabe has a never...,9.08,white,2010-07-26 09:23:40+00:00,2023-04-02 18:08:42+00:00,tv,finished_airing,Drama Psychological Sci-Fi Suspens...,24,visual_novel,1460,pg_13,White Fox,https://api-cdn.myanimelist.net/images/anime/1...,,2011,spring,2011-09-14,eccentric scientist rintarou okabe thirst scie...
4,41467,Bleach: Sennen Kessen-hen,2022-10-11,Substitute Soul Reaper Ichigo Kurosaki spends ...,9.08,white,2020-03-18 09:10:15+00:00,2023-04-02 18:07:55+00:00,tv,finished_airing,Action Adventure Fantasy Shounen,13,manga,1471,r,Pierrot,https://api-cdn.myanimelist.net/images/anime/1...,Bleach: Thousand-Year Blood War Arc,2022,fall,2022-12-27,substitute soul reaper ichigo kurosaki spends ...


In [52]:
# transform string to TD-IDF vector and ignore terms that have a document frequency higher than 80% and lower than 10%
tf_vec = TfidfVectorizer(ngram_range=(1,2), max_df=0.8, min_df=0.1)
tfidf_matrix = tf_vec.fit_transform(cleaned_df['cleaned_string'])
tfidf_array = tfidf_matrix.toarray()

# calculate cos similarity
cos_sim = cosine_similarity(tfidf_array, tfidf_array)
print(cos_sim)

[[1.         0.57633923 0.39488513 ... 0.19747097 0.         0.2088168 ]
 [0.57633923 1.         0.57785455 ... 0.28581169 0.         0.33409505]
 [0.39488513 0.57785455 1.         ... 0.16250386 0.         0.06668619]
 ...
 [0.19747097 0.28581169 0.16250386 ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.20221114]
 [0.2088168  0.33409505 0.06668619 ... 0.         0.20221114 1.        ]]


In [53]:
cos_sim.shape

(12338, 12338)

# Build Recommendation Engine Based on Similarity Score

In [54]:
# create indices for movies
indices = pd.Series(df.index, index=df['title'])
print(indices)

title
Oshi no Ko                                               0
Fullmetal Alchemist: Brotherhood                         1
Shingeki no Kyojin: The Final Season - Kankets...        2
Steins;Gate                                              3
Bleach: Sennen Kessen-hen                                4
                                                     ...  
Hua Jianghu: Bu Liang Ren VI                         12333
Hua Jianghu: Gui Yexing                              12334
Hua Jianghu: Huan Shi Men Sheng 2nd Season           12335
Hua Kaizhang Meili                                   12336
Huan Bian Jingling: Dangao Tianxin                   12337
Length: 12338, dtype: int64


In [55]:
df.columns

Index(['id', 'title', 'start_date', 'synopsis', 'mean', 'nsfw', 'created_at',
       'updated_at', 'media_type', 'status', 'genres', 'num_episodes',
       'source', 'average_episode_duration', 'rating', 'studios',
       'medium_picture_url', 'alternative_titles', 'year', 'season',
       'end_date', 'cleaned_string'],
      dtype='object')

In [58]:
# recommendation engine based on cosine similarity
def get_recommendations(title, cosine_sim, indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    top_20 = sim_scores[1:21]
    movie_indices = [i[0] for i in top_20]
    
    # Return the top 20 most similar animes
    results = df[['title', 'alternative_titles', 'medium_picture_url', 'synopsis', 'mean', 'year', 'status']].iloc[movie_indices]
    
    return results

In [61]:
# example
get_recommendations('Steins;Gate', cos_sim, indices)

Unnamed: 0,title,alternative_titles,medium_picture_url,synopsis,mean,year,status
343,Higurashi no Naku Koro ni Kai,"Higurashi no Naku Koro ni 2, Higurashi no Naku...",https://api-cdn.myanimelist.net/images/anime/1...,"In the small village of Hinamizawa, Rika Furud...",8.18,2007,finished_airing
971,Shiguang Dailiren Fan Wai Pian: Biwu Zhaoqin,Shiguang Dailiren Episode 5.5,https://api-cdn.myanimelist.net/images/anime/1...,"At the Time Photo Studio, partners Lu Guang an...",7.73,2021,finished_airing
272,Steins;Gate: Kyoukaimenjou no Missing Link - D...,"Steins Gate: Episode 23 (β), Open the Missing ...",https://api-cdn.myanimelist.net/images/anime/7...,"Having reached his emotional breaking point, R...",8.25,2015,finished_airing
567,Eve no Jikan (Movie),"Eves Time, Eve no Jikan 1st Season Complete Ed...",https://api-cdn.myanimelist.net/images/anime/9...,"In the Japan of the future, employing androids...",7.98,2010,finished_airing
2785,Star Driver: Kagayaki no Takuto,STAR DRIVER: Shining Takuto,https://api-cdn.myanimelist.net/images/anime/4...,Deep beneath the surface of Southern Cross Isl...,7.19,2010,finished_airing
2886,ACCA: 13-ku Kansatsu-ka - Regards,"ACCA: 13-Territory Inspection Dept. Regards, A...",https://api-cdn.myanimelist.net/images/anime/1...,The new order of ACCA will soon hold its first...,7.16,2020,finished_airing
103,Steins;Gate 0,"Steins, Gate Zero",https://api-cdn.myanimelist.net/images/anime/1...,"The eccentric, self-proclaimed mad scientist R...",8.53,2018,finished_airing
2970,Koi to Yobu ni wa Kimochi Warui,Its Too Sick to Call this Love,https://api-cdn.myanimelist.net/images/anime/1...,People fall in love in the most mysterious of ...,7.14,2021,finished_airing
2829,Kujira no Kora wa Sajou ni Utau,"Whale Calves Sing on the Sand, Tales of the Wa...",https://api-cdn.myanimelist.net/images/anime/4...,"In a world covered by an endless sea of sand, ...",7.17,2017,finished_airing
7503,Hangyakusei Million Arthur,,https://api-cdn.myanimelist.net/images/anime/1...,Tales of old speak of the legends of Excalibur...,6.01,2018,finished_airing
