In [1]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download(['punkt', 'stopwords', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger'])
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.util import bigrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_columns', None)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lijhu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lijhu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lijhu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lijhu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lijhu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Import Data

In [2]:
df = pd.read_csv('data/anime_data.csv', parse_dates=['start_date', 'created_at', 'updated_at', 'end_date'])
df.head()

Unnamed: 0,ranking,title,start_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,source,average_episode_duration,rating,studios,main_picture.medium,main_picture.large,alternative_titles.synonyms,alternative_titles.en,alternative_titles.ja,start_season.year,start_season.season,broadcast.day_of_the_week,broadcast.start_time,end_date
0,{'rank': 1},"""Oshi no Ko""",2023-04-12,Sixteen-year-old Ai Hoshino is a talented and ...,9.33,1,1211,179740,33046,white,2022-06-09 13:01:38+00:00,2023-04-12 22:10:51+00:00,tv,currently_airing,"[{'id': 8, 'name': 'Drama'}, {'id': 72, 'name'...",11,manga,0,r,"[{'id': 95, 'name': 'Doga Kobo'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['My Star'],[Oshi No Ko],【推しの子】,2023.0,spring,wednesday,23:00,NaT
1,{'rank': 2},Fullmetal Alchemist: Brotherhood,2009-04-05,After a horrific alchemy experiment goes wrong...,9.11,2,3,3127618,1987906,white,2008-08-21 03:35:22+00:00,2023-04-02 18:07:03+00:00,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",64,manga,1460,r,"[{'id': 4, 'name': 'Bones'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Hagane no Renkinjutsushi: Fullmetal Alchemis...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009.0,spring,sunday,17:00,2010-07-04
2,{'rank': 3},Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,In the wake of Eren Yeager's cataclysmic actio...,9.1,3,537,385728,124269,white,2022-04-03 15:34:50+00:00,2023-04-12 14:02:06+00:00,special,currently_airing,"[{'id': 1, 'name': 'Action'}, {'id': 8, 'name'...",2,manga,3690,r,"[{'id': 569, 'name': 'MAPPA'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Shingeki no Kyojin: The Final Season Part 3'...,Attack on Titan: Final Season - The Final Chap...,進撃の巨人 The Final Season完結編,2023.0,winter,,,2023-01-01
3,{'rank': 4},Steins;Gate,2011-04-06,Eccentric scientist Rintarou Okabe has a never...,9.08,4,13,2406893,1319006,white,2010-07-26 09:23:40+00:00,2023-04-02 18:08:42+00:00,tv,finished_airing,"[{'id': 8, 'name': 'Drama'}, {'id': 40, 'name'...",24,visual_novel,1460,pg_13,"[{'id': 314, 'name': 'White Fox'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,[],Steins;Gate,STEINS;GATE,2011.0,spring,wednesday,02:05,2011-09-14
4,{'rank': 5},Bleach: Sennen Kessen-hen,2022-10-11,Substitute Soul Reaper Ichigo Kurosaki spends ...,9.08,5,494,416835,193664,white,2020-03-18 09:10:15+00:00,2023-04-02 18:07:55+00:00,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",13,manga,1471,r,"[{'id': 1, 'name': 'Pierrot'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Bleach: Thousand-Year Blood War Arc'],Bleach: Thousand-Year Blood War,BLEACH 千年血戦篇,2022.0,fall,tuesday,00:00,2022-12-27


In [3]:
df.shape

(20000, 30)

# Data Cleaning and EDA

In [4]:
# get rid of unnecessary characters in the ranking column
df['ranking'] = df['ranking'].apply(lambda x: re.sub('[{}\':]', '', str(x)))
df['ranking'] = df['ranking'].apply(lambda x: str(x).replace('rank', ''))

df.head(3)

Unnamed: 0,ranking,title,start_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,source,average_episode_duration,rating,studios,main_picture.medium,main_picture.large,alternative_titles.synonyms,alternative_titles.en,alternative_titles.ja,start_season.year,start_season.season,broadcast.day_of_the_week,broadcast.start_time,end_date
0,1,"""Oshi no Ko""",2023-04-12,Sixteen-year-old Ai Hoshino is a talented and ...,9.33,1,1211,179740,33046,white,2022-06-09 13:01:38+00:00,2023-04-12 22:10:51+00:00,tv,currently_airing,"[{'id': 8, 'name': 'Drama'}, {'id': 72, 'name'...",11,manga,0,r,"[{'id': 95, 'name': 'Doga Kobo'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['My Star'],[Oshi No Ko],【推しの子】,2023.0,spring,wednesday,23:00,NaT
1,2,Fullmetal Alchemist: Brotherhood,2009-04-05,After a horrific alchemy experiment goes wrong...,9.11,2,3,3127618,1987906,white,2008-08-21 03:35:22+00:00,2023-04-02 18:07:03+00:00,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",64,manga,1460,r,"[{'id': 4, 'name': 'Bones'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Hagane no Renkinjutsushi: Fullmetal Alchemis...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009.0,spring,sunday,17:00,2010-07-04
2,3,Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,In the wake of Eren Yeager's cataclysmic actio...,9.1,3,537,385728,124269,white,2022-04-03 15:34:50+00:00,2023-04-12 14:02:06+00:00,special,currently_airing,"[{'id': 1, 'name': 'Action'}, {'id': 8, 'name'...",2,manga,3690,r,"[{'id': 569, 'name': 'MAPPA'}]",https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Shingeki no Kyojin: The Final Season Part 3'...,Attack on Titan: Final Season - The Final Chap...,進撃の巨人 The Final Season完結編,2023.0,winter,,,2023-01-01


In [5]:
# get rid of unnecessary characters in the genres and studios column
df['genres'] = df['genres'].apply(lambda x: re.sub('[\[\]{}\'0-9:,]', '', str(x)))
df['genres'] = df['genres'].apply(lambda x: str(x).replace('id', '').replace('name', ''))

df['studios'] = df['studios'].apply(lambda x: re.sub('[\[\]{}\'0-9:,]', '', str(x)))
df['studios'] = df['studios'].apply(lambda x: str(x).replace('id', '').replace('name', ''))

df.head(3)

Unnamed: 0,ranking,title,start_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,source,average_episode_duration,rating,studios,main_picture.medium,main_picture.large,alternative_titles.synonyms,alternative_titles.en,alternative_titles.ja,start_season.year,start_season.season,broadcast.day_of_the_week,broadcast.start_time,end_date
0,1,"""Oshi no Ko""",2023-04-12,Sixteen-year-old Ai Hoshino is a talented and ...,9.33,1,1211,179740,33046,white,2022-06-09 13:01:38+00:00,2023-04-12 22:10:51+00:00,tv,currently_airing,Drama Reincarnation Seinen Showbiz...,11,manga,0,r,Doga Kobo,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['My Star'],[Oshi No Ko],【推しの子】,2023.0,spring,wednesday,23:00,NaT
1,2,Fullmetal Alchemist: Brotherhood,2009-04-05,After a horrific alchemy experiment goes wrong...,9.11,2,3,3127618,1987906,white,2008-08-21 03:35:22+00:00,2023-04-02 18:07:03+00:00,tv,finished_airing,Action Adventure Drama Fantasy ...,64,manga,1460,r,Bones,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Hagane no Renkinjutsushi: Fullmetal Alchemis...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009.0,spring,sunday,17:00,2010-07-04
2,3,Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,In the wake of Eren Yeager's cataclysmic actio...,9.1,3,537,385728,124269,white,2022-04-03 15:34:50+00:00,2023-04-12 14:02:06+00:00,special,currently_airing,Action Drama Gore Military Shou...,2,manga,3690,r,MAPPA,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Shingeki no Kyojin: The Final Season Part 3'...,Attack on Titan: Final Season - The Final Chap...,進撃の巨人 The Final Season完結編,2023.0,winter,,,2023-01-01


In [6]:
df.dtypes

ranking                                     object
title                                       object
start_date                          datetime64[ns]
synopsis                                    object
mean                                       float64
rank                                         int64
popularity                                   int64
num_list_users                               int64
num_scoring_users                            int64
nsfw                                        object
created_at                     datetime64[ns, UTC]
updated_at                     datetime64[ns, UTC]
media_type                                  object
status                                      object
genres                                      object
num_episodes                                 int64
source                                      object
average_episode_duration                     int64
rating                                      object
studios                        

In [7]:
# convert start_season.year data type from float to int
df['start_season.year'] = df['start_season.year'].astype('Int64')

df.head(3)

Unnamed: 0,ranking,title,start_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,source,average_episode_duration,rating,studios,main_picture.medium,main_picture.large,alternative_titles.synonyms,alternative_titles.en,alternative_titles.ja,start_season.year,start_season.season,broadcast.day_of_the_week,broadcast.start_time,end_date
0,1,"""Oshi no Ko""",2023-04-12,Sixteen-year-old Ai Hoshino is a talented and ...,9.33,1,1211,179740,33046,white,2022-06-09 13:01:38+00:00,2023-04-12 22:10:51+00:00,tv,currently_airing,Drama Reincarnation Seinen Showbiz...,11,manga,0,r,Doga Kobo,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['My Star'],[Oshi No Ko],【推しの子】,2023,spring,wednesday,23:00,NaT
1,2,Fullmetal Alchemist: Brotherhood,2009-04-05,After a horrific alchemy experiment goes wrong...,9.11,2,3,3127618,1987906,white,2008-08-21 03:35:22+00:00,2023-04-02 18:07:03+00:00,tv,finished_airing,Action Adventure Drama Fantasy ...,64,manga,1460,r,Bones,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Hagane no Renkinjutsushi: Fullmetal Alchemis...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009,spring,sunday,17:00,2010-07-04
2,3,Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,In the wake of Eren Yeager's cataclysmic actio...,9.1,3,537,385728,124269,white,2022-04-03 15:34:50+00:00,2023-04-12 14:02:06+00:00,special,currently_airing,Action Drama Gore Military Shou...,2,manga,3690,r,MAPPA,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Shingeki no Kyojin: The Final Season Part 3'...,Attack on Titan: Final Season - The Final Chap...,進撃の巨人 The Final Season完結編,2023,winter,,,2023-01-01


In [8]:
# check duplicates
df.duplicated().sum()

0

In [9]:
# check nulls
df.isnull().sum()

ranking                            0
title                              0
start_date                       466
synopsis                        2901
mean                            6048
rank                               0
popularity                         0
num_list_users                     0
num_scoring_users                  0
nsfw                               0
created_at                         0
updated_at                         0
media_type                         0
status                             0
genres                             0
num_episodes                       0
source                          3056
average_episode_duration           0
rating                           290
studios                            0
main_picture.medium               98
main_picture.large                98
alternative_titles.synonyms        0
alternative_titles.en          11296
alternative_titles.ja             65
start_season.year               3172
start_season.season             3172
b

In [10]:
# drop rows where the synopsis/plot is null because it's an important feature for content-based-filtering
df.dropna(subset=['synopsis'], inplace=True)
# drop the broadcast.day_of_the_week and broadcast.start_time columns since a lot of data are missing
df.drop(['broadcast.day_of_the_week', 'broadcast.start_time'], axis=1, inplace=True)

In [11]:
df.shape

(17099, 28)

In [12]:
# check the range of year
print('Minimum year:', df['start_season.year'].min())
print('Maximum year:', df['start_season.year'].max())

Minimum year: 1917
Maximum year: 2023


In [13]:
# filter out anime that aired before 2000 because I don't want to recommend animes that's too old.
df = df[df['start_season.year'] >= 2000]
df.shape

(12335, 28)

In [14]:
df.columns

Index(['ranking', 'title', 'start_date', 'synopsis', 'mean', 'rank',
       'popularity', 'num_list_users', 'num_scoring_users', 'nsfw',
       'created_at', 'updated_at', 'media_type', 'status', 'genres',
       'num_episodes', 'source', 'average_episode_duration', 'rating',
       'studios', 'main_picture.medium', 'main_picture.large',
       'alternative_titles.synonyms', 'alternative_titles.en',
       'alternative_titles.ja', 'start_season.year', 'start_season.season',
       'end_date'],
      dtype='object')

In [15]:
# rename columns
df = df.rename(columns={'main_picture.medium':'medium_picture_url', 'main_picture.large':'large_picture_url',
                        'alternative_titles.synonyms':'alternative_titles','alternative_titles.en':'english_title', 
                        'alternative_titles.ja':'japanese_title', 'start_season.year':'year', 'start_season.season':'season'})

df.head(3)

Unnamed: 0,ranking,title,start_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,source,average_episode_duration,rating,studios,medium_picture_url,large_picture_url,alternative_titles,english_title,japanese_title,year,season,end_date
0,1,"""Oshi no Ko""",2023-04-12,Sixteen-year-old Ai Hoshino is a talented and ...,9.33,1,1211,179740,33046,white,2022-06-09 13:01:38+00:00,2023-04-12 22:10:51+00:00,tv,currently_airing,Drama Reincarnation Seinen Showbiz...,11,manga,0,r,Doga Kobo,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['My Star'],[Oshi No Ko],【推しの子】,2023,spring,NaT
1,2,Fullmetal Alchemist: Brotherhood,2009-04-05,After a horrific alchemy experiment goes wrong...,9.11,2,3,3127618,1987906,white,2008-08-21 03:35:22+00:00,2023-04-02 18:07:03+00:00,tv,finished_airing,Action Adventure Drama Fantasy ...,64,manga,1460,r,Bones,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Hagane no Renkinjutsushi: Fullmetal Alchemis...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009,spring,2010-07-04
2,3,Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,In the wake of Eren Yeager's cataclysmic actio...,9.1,3,537,385728,124269,white,2022-04-03 15:34:50+00:00,2023-04-12 14:02:06+00:00,special,currently_airing,Action Drama Gore Military Shou...,2,manga,3690,r,MAPPA,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Shingeki no Kyojin: The Final Season Part 3'...,Attack on Titan: Final Season - The Final Chap...,進撃の巨人 The Final Season完結編,2023,winter,2023-01-01


# Text Preprocessing

In [16]:
df.columns

Index(['ranking', 'title', 'start_date', 'synopsis', 'mean', 'rank',
       'popularity', 'num_list_users', 'num_scoring_users', 'nsfw',
       'created_at', 'updated_at', 'media_type', 'status', 'genres',
       'num_episodes', 'source', 'average_episode_duration', 'rating',
       'studios', 'medium_picture_url', 'large_picture_url',
       'alternative_titles', 'english_title', 'japanese_title', 'year',
       'season', 'end_date'],
      dtype='object')

In [17]:
# combine features that I think it's going to be useful
features = ['title', 'synopsis', 'nsfw', 'media_type', 'status', 'genres', 'source', 'rating', 'studios',
            'alternative_titles', 'english_title', 'japanese_title', 'season']

df['summary'] = df[features].apply(lambda x: ', '.join(x.astype(str)), axis=1)

df['summary'][0]

'"Oshi no Ko", Sixteen-year-old Ai Hoshino is a talented and beautiful idol who is adored by her fans. She is the personification of a pure, young maiden. But all that glitters is not gold.\n\nGorou Amemiya is a countryside gynecologist and a big fan of Ai. So when the pregnant idol shows up at his hospital, he is beyond bewildered. Gorou promises her a safe delivery. Little does he know, an encounter with a mysterious figure would result in his untimely death—or so he thought.\n\nOpening his eyes in the lap of his beloved idol, Gorou finds that he has been reborn as Aquamarine Hoshino—Ai\'s newborn son! With his world turned upside down, Gorou soon learns that the world of showbiz is paved with thorns, where talent does not always beget success. Will he manage to protect Ai\'s smile that he loves so much with the help of an eccentric and unexpected ally? \n\n[Written by MAL Rewrite], white, tv, currently_airing,    Drama    Reincarnation    Seinen    Showbiz    Supernatural, manga, r,

In [18]:
# apply lower case and tokenize
df['tokens'] = df['summary'].apply(str.lower).apply(word_tokenize)

print(df['tokens'][0])

['``', 'oshi', 'no', 'ko', "''", ',', 'sixteen-year-old', 'ai', 'hoshino', 'is', 'a', 'talented', 'and', 'beautiful', 'idol', 'who', 'is', 'adored', 'by', 'her', 'fans', '.', 'she', 'is', 'the', 'personification', 'of', 'a', 'pure', ',', 'young', 'maiden', '.', 'but', 'all', 'that', 'glitters', 'is', 'not', 'gold', '.', 'gorou', 'amemiya', 'is', 'a', 'countryside', 'gynecologist', 'and', 'a', 'big', 'fan', 'of', 'ai', '.', 'so', 'when', 'the', 'pregnant', 'idol', 'shows', 'up', 'at', 'his', 'hospital', ',', 'he', 'is', 'beyond', 'bewildered', '.', 'gorou', 'promises', 'her', 'a', 'safe', 'delivery', '.', 'little', 'does', 'he', 'know', ',', 'an', 'encounter', 'with', 'a', 'mysterious', 'figure', 'would', 'result', 'in', 'his', 'untimely', 'death—or', 'so', 'he', 'thought', '.', 'opening', 'his', 'eyes', 'in', 'the', 'lap', 'of', 'his', 'beloved', 'idol', ',', 'gorou', 'finds', 'that', 'he', 'has', 'been', 'reborn', 'as', 'aquamarine', 'hoshino—ai', "'s", 'newborn', 'son', '!', 'with', 

In [19]:
# remove stop words and keep letters only
stop_words = stopwords.words('english')
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in stop_words and word.isalpha()])

print(df['tokens'][0])

['oshi', 'ko', 'ai', 'hoshino', 'talented', 'beautiful', 'idol', 'adored', 'fans', 'personification', 'pure', 'young', 'maiden', 'glitters', 'gold', 'gorou', 'amemiya', 'countryside', 'gynecologist', 'big', 'fan', 'ai', 'pregnant', 'idol', 'shows', 'hospital', 'beyond', 'bewildered', 'gorou', 'promises', 'safe', 'delivery', 'little', 'know', 'encounter', 'mysterious', 'figure', 'would', 'result', 'untimely', 'thought', 'opening', 'eyes', 'lap', 'beloved', 'idol', 'gorou', 'finds', 'reborn', 'aquamarine', 'newborn', 'son', 'world', 'turned', 'upside', 'gorou', 'soon', 'learns', 'world', 'showbiz', 'paved', 'thorns', 'talent', 'always', 'beget', 'success', 'manage', 'protect', 'ai', 'smile', 'loves', 'much', 'help', 'eccentric', 'unexpected', 'ally', 'written', 'mal', 'rewrite', 'white', 'tv', 'drama', 'reincarnation', 'seinen', 'showbiz', 'supernatural', 'manga', 'r', 'doga', 'kobo', 'star', 'oshi', 'ko', 'spring']


In [20]:
#  apply tag to each token
df['tokens'] = df['tokens'].apply(lambda x: nltk.pos_tag(x))
print(df['tokens'][0])

[('oshi', 'NN'), ('ko', 'NN'), ('ai', 'VBP'), ('hoshino', 'NN'), ('talented', 'VBN'), ('beautiful', 'JJ'), ('idol', 'NN'), ('adored', 'VBD'), ('fans', 'NNS'), ('personification', 'NN'), ('pure', 'NN'), ('young', 'JJ'), ('maiden', 'NN'), ('glitters', 'NNS'), ('gold', 'VBP'), ('gorou', 'JJ'), ('amemiya', 'JJ'), ('countryside', 'NN'), ('gynecologist', 'NN'), ('big', 'JJ'), ('fan', 'NN'), ('ai', 'NN'), ('pregnant', 'JJ'), ('idol', 'NN'), ('shows', 'NNS'), ('hospital', 'VBP'), ('beyond', 'IN'), ('bewildered', 'VBN'), ('gorou', 'NN'), ('promises', 'NNS'), ('safe', 'JJ'), ('delivery', 'NN'), ('little', 'RB'), ('know', 'JJ'), ('encounter', 'RB'), ('mysterious', 'JJ'), ('figure', 'NN'), ('would', 'MD'), ('result', 'VB'), ('untimely', 'RB'), ('thought', 'VBN'), ('opening', 'VBG'), ('eyes', 'NNS'), ('lap', 'RB'), ('beloved', 'VBD'), ('idol', 'JJ'), ('gorou', 'NN'), ('finds', 'VBZ'), ('reborn', 'JJ'), ('aquamarine', 'JJ'), ('newborn', 'VBN'), ('son', 'NN'), ('world', 'NN'), ('turned', 'VBD'), ('up

In [21]:
# convert tags to the tags that WordNetLemmatizer uses
def convert_pos(pos):
    if pos.startswith('V'):
        return wordnet.VERB
    elif pos.startswith('J'):
        return wordnet.ADJ
    elif pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# lemmatize words with pos-tags
df['cleaned_tokens'] = df['tokens'].apply(lambda x: [WordNetLemmatizer().lemmatize(w[0], pos=convert_pos(w[1])) for w in x])
print(df['cleaned_tokens'][0])

['oshi', 'ko', 'ai', 'hoshino', 'talented', 'beautiful', 'idol', 'adore', 'fan', 'personification', 'pure', 'young', 'maiden', 'glitter', 'gold', 'gorou', 'amemiya', 'countryside', 'gynecologist', 'big', 'fan', 'ai', 'pregnant', 'idol', 'show', 'hospital', 'beyond', 'bewilder', 'gorou', 'promise', 'safe', 'delivery', 'little', 'know', 'encounter', 'mysterious', 'figure', 'would', 'result', 'untimely', 'think', 'open', 'eye', 'lap', 'beloved', 'idol', 'gorou', 'find', 'reborn', 'aquamarine', 'newborn', 'son', 'world', 'turn', 'upside', 'gorou', 'soon', 'learn', 'world', 'showbiz', 'pave', 'thorn', 'talent', 'always', 'beget', 'success', 'manage', 'protect', 'ai', 'smile', 'love', 'much', 'help', 'eccentric', 'unexpected', 'ally', 'write', 'mal', 'rewrite', 'white', 'tv', 'drama', 'reincarnation', 'seinen', 'showbiz', 'supernatural', 'manga', 'r', 'doga', 'kobo', 'star', 'oshi', 'ko', 'spring']


In [22]:
# export the cleaned data
# df.to_csv('data/cleaned_anime_data.csv', index=False)

# Generate Vectors using TF-IDF

In [23]:
# combine all tokens into one string
df['cleaned_string'] = df['cleaned_tokens'].apply(lambda x: ' '.join(x))

# transform string to TD-IDF vector and ignore terms that have a document frequency higher than 80% and lower than 10%
tf_vec = TfidfVectorizer(ngram_range=(1,2), max_df=0.8, min_df=0.1)
tfidf_matrix = tf_vec.fit_transform(df['cleaned_string'])
tfidf_array = tfidf_matrix.toarray()

In [24]:
# calculate cos similarity
cos_sim = cosine_similarity(tfidf_array, tfidf_array)
print(cos_sim)

[[1.         0.66859688 0.39993758 ... 0.19254359 0.07662141 0.23503918]
 [0.66859688 1.         0.5432875  ... 0.24250584 0.06693189 0.32652105]
 [0.39993758 0.5432875  1.         ... 0.09844713 0.12838527 0.04944938]
 ...
 [0.19254359 0.24250584 0.09844713 ... 1.         0.32849063 0.09141126]
 [0.07662141 0.06693189 0.12838527 ... 0.32849063 1.         0.15419648]
 [0.23503918 0.32652105 0.04944938 ... 0.09141126 0.15419648 1.        ]]


In [31]:
cos_sim.shape

(12335, 12335)

In [None]:
# export the cosine_similarity matrix
# np.save('data/cos_sim.npy', arr=cos_sim)

# Build Recommendation Engine Based on Similarity Score

In [27]:
# create indices for movies
indices = pd.Series(df.index, index=df['title'])
print(indices)

title
"Oshi no Ko"                                               0
Fullmetal Alchemist: Brotherhood                           1
Shingeki no Kyojin: The Final Season - Kanketsu-hen        2
Steins;Gate                                                3
Bleach: Sennen Kessen-hen                                  4
                                                       ...  
Hua Jianghu: Bu Liang Ren VI                           19961
Hua Jianghu: Gui Yexing                                19962
Hua Jianghu: Huan Shi Men Sheng 2nd Season             19963
Hua Kaizhang Meili                                     19968
Huan Bian Jingling: Dangao Tianxin                     19972
Length: 12335, dtype: int64


In [28]:
# recommendation engine based on cosine similarity
def get_recommendations(title, cosine_sim, indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    top_20 = sim_scores[1:20]
    movie_indices = [i[0] for i in top_20]
    
    # Return the top 20 most similar movies
    result = df[['title', 'english_title', 'synopsis', 'year', 'mean', 'medium_picture_url', 'status']].iloc[movie_indices]
    
    return result

In [29]:
# example
get_recommendations('Lycoris Recoil', cos_sim, indices)

Unnamed: 0,title,english_title,synopsis,year,mean,medium_picture_url,status
6475,Cinderella Girls Gekijou,THE IDOLM@STER CINDERELLA GIRLS Theater,The girls of 346 Productions are back in this ...,2017,6.54,https://api-cdn.myanimelist.net/images/anime/1...,finished_airing
85,Seishun Buta Yarou wa Yumemiru Shoujo no Yume ...,Rascal Does Not Dream of a Dreaming Girl,"Six months ago, Sakuta Azusagawa had a chance ...",2019,8.6,https://api-cdn.myanimelist.net/images/anime/1...,finished_airing
13908,Dead Girl Trailer,Dead Girl Trailer,"Once upon a time, a girl was brutally murdered...",2011,3.51,https://api-cdn.myanimelist.net/images/anime/1...,finished_airing
6151,Jigoku Shoujo: Yoi no Togi,Hell Girl: Fourth Twilight,"If you have a grudge against someone, you can ...",2017,6.59,https://api-cdn.myanimelist.net/images/anime/1...,finished_airing
9990,Mahou Shoujo Nante Mou Ii desu kara.,Mahou Shoujo Nante Mouiidesukara,"Yuzuka Hanami is a young, carefree girl who li...",2016,5.92,https://api-cdn.myanimelist.net/images/anime/9...,finished_airing
9769,Kaijuu Girls: Ultra Kaijuu Gijinka Keikaku,KAIJU GIRLS,Very rarely is a girl born with two distinct s...,2016,5.96,https://api-cdn.myanimelist.net/images/anime/4...,finished_airing
10921,Henkei Shoujo,Henkei Shojo,When a gust of wind blows away a young woman's...,2017,5.71,https://api-cdn.myanimelist.net/images/anime/2...,finished_airing
6181,Watashi no Yuri wa Oshigoto desu!,Yuri is My Job!,"Life appears promising for Hime Shiraki, a gir...",2023,6.59,https://api-cdn.myanimelist.net/images/anime/1...,currently_airing
4268,Mahou Shoujo Ikusei Keikaku,Magical Girl Raising Project,"For many girls in N-City, playing the popular ...",2016,6.96,https://api-cdn.myanimelist.net/images/anime/2...,finished_airing
2001,Idoly Pride,Idoly Pride,The VENUS Program is an idol scoring system th...,2021,7.43,https://api-cdn.myanimelist.net/images/anime/1...,finished_airing
