In [1]:
import numpy as np
import pandas as pd
import re
from datetime import datetime
import nltk
nltk.download(['punkt', 'stopwords', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger'])
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_columns', None)

[nltk_data] Downloading package punkt to /home/jakeli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jakeli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jakeli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jakeli/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jakeli/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Import Data

In [2]:
df = pd.read_csv('data/anime_data.csv')
df.head()

Unnamed: 0,id,title,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,num_episodes,source,average_episode_duration,rating,main_picture.medium,main_picture.large,alternative_titles.synonyms,alternative_titles.en,alternative_titles.ja,start_season.year,start_season.season,broadcast.day_of_the_week,broadcast.start_time,genres,studios
0,5114,Fullmetal Alchemist: Brotherhood,2009-04-05,2010-07-04,After a horrific alchemy experiment goes wrong...,9.1,1.0,3,3149847,2003682,white,2008-08-21T03:35:22+00:00,2023-04-02T18:07:03+00:00,tv,finished_airing,64,manga,1460,r,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Hagane no Renkinjutsushi: Fullmetal Alchemis...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009.0,spring,sunday,17:00,"Action, Adventure, Drama, Fantasy, Military, S...",Bones
1,52034,"""Oshi no Ko""",2023-04-12,,Sixteen-year-old Ai Hoshino is a talented and ...,9.08,2.0,525,399851,131985,white,2022-06-09T13:01:38+00:00,2023-04-23T23:12:26+00:00,tv,currently_airing,11,manga,0,pg_13,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['My Star'],[Oshi No Ko],【推しの子】,2023.0,spring,wednesday,23:00,"Drama, Reincarnation, Seinen, Showbiz, Superna...","Doga, Kobo"
2,51535,Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,2023,In the wake of Eren Yeager's cataclysmic actio...,9.08,3.0,511,408423,139142,white,2022-04-03T15:34:50+00:00,2023-04-12T14:02:06+00:00,special,currently_airing,2,manga,3690,r,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Shingeki no Kyojin: The Final Season Part 3'...,Attack on Titan: Final Season - The Final Chap...,進撃の巨人 The Final Season完結編,2023.0,winter,,,"Action, Drama, Gore, Military, Shounen, Surviv...",MAPPA
3,9253,Steins;Gate,2011-04-06,2011-09-14,Eccentric scientist Rintarou Okabe has a never...,9.08,4.0,13,2419612,1325596,white,2010-07-26T09:23:40+00:00,2023-04-02T18:08:42+00:00,tv,finished_airing,24,visual_novel,1460,pg_13,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,[],Steins;Gate,STEINS;GATE,2011.0,spring,wednesday,02:05,"Drama, Psychological, Sci-Fi, Suspense, Time, ...","White, Fox"
4,41467,Bleach: Sennen Kessen-hen,2022-10-11,2022-12-27,Substitute Soul Reaper Ichigo Kurosaki spends ...,9.07,5.0,483,427532,201511,white,2020-03-18T09:10:15+00:00,2023-04-02T18:07:55+00:00,tv,finished_airing,13,manga,1471,r,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,['Bleach: Thousand-Year Blood War Arc'],Bleach: Thousand-Year Blood War,BLEACH 千年血戦篇,2022.0,fall,tuesday,00:00,"Action, Adventure, Fantasy, Shounen",Pierrot


In [3]:
df.shape

(20000, 30)

# Data Cleaning and EDA

In [4]:
# convert columns to the best possible dtypes using convert_dtypes function
df = df.convert_dtypes()
df.dtypes

id                                      Int64
title                          string[python]
start_date                     string[python]
end_date                       string[python]
synopsis                       string[python]
mean                                  Float64
rank                                    Int64
popularity                              Int64
num_list_users                          Int64
num_scoring_users                       Int64
nsfw                           string[python]
created_at                     string[python]
updated_at                     string[python]
media_type                     string[python]
status                         string[python]
num_episodes                            Int64
source                         string[python]
average_episode_duration                Int64
rating                         string[python]
main_picture.medium            string[python]
main_picture.large             string[python]
alternative_titles.synonyms    str

In [5]:
# remove brackets and quotation marks from titles
df['title'] = df['title'].apply(lambda x: re.sub('[\[\]\'"]', '', x))
df['alternative_titles.synonyms'] = df['alternative_titles.synonyms'].apply(lambda x: re.sub('[\[\]\'"]', '', x))
df['alternative_titles.en'] = df['alternative_titles.en'].apply(lambda x: re.sub('[\[\]\'"]', '', str(x)))
df['alternative_titles.ja'] = df['alternative_titles.ja'].apply(lambda x: re.sub('[\[\]\'"]', '', str(x)))

df.head(3)

Unnamed: 0,id,title,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,num_episodes,source,average_episode_duration,rating,main_picture.medium,main_picture.large,alternative_titles.synonyms,alternative_titles.en,alternative_titles.ja,start_season.year,start_season.season,broadcast.day_of_the_week,broadcast.start_time,genres,studios
0,5114,Fullmetal Alchemist: Brotherhood,2009-04-05,2010-07-04,After a horrific alchemy experiment goes wrong...,9.1,1,3,3149847,2003682,white,2008-08-21T03:35:22+00:00,2023-04-02T18:07:03+00:00,tv,finished_airing,64,manga,1460,r,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,"Hagane no Renkinjutsushi: Fullmetal Alchemist,...",Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009,spring,sunday,17:00,"Action, Adventure, Drama, Fantasy, Military, S...",Bones
1,52034,Oshi no Ko,2023-04-12,,Sixteen-year-old Ai Hoshino is a talented and ...,9.08,2,525,399851,131985,white,2022-06-09T13:01:38+00:00,2023-04-23T23:12:26+00:00,tv,currently_airing,11,manga,0,pg_13,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,My Star,Oshi No Ko,【推しの子】,2023,spring,wednesday,23:00,"Drama, Reincarnation, Seinen, Showbiz, Superna...","Doga, Kobo"
2,51535,Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,2023,In the wake of Eren Yeager's cataclysmic actio...,9.08,3,511,408423,139142,white,2022-04-03T15:34:50+00:00,2023-04-12T14:02:06+00:00,special,currently_airing,2,manga,3690,r,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,"Shingeki no Kyojin: The Final Season Part 3, S...",Attack on Titan: Final Season - The Final Chap...,進撃の巨人 The Final Season完結編,2023,winter,,,"Action, Drama, Gore, Military, Shounen, Surviv...",MAPPA


In [6]:
# check duplicates
print('Duplicates:', df.duplicated().sum())

print('ID duplicates:', df.duplicated(subset='id').sum())

Duplicates: 0
ID duplicates: 0


In [7]:
# check nulls
df.isnull().sum()

id                                 0
title                              0
start_date                       664
end_date                        2351
synopsis                        4057
mean                            7393
rank                              11
popularity                         0
num_list_users                     0
num_scoring_users                  0
nsfw                               0
created_at                         0
updated_at                         0
media_type                         0
status                             0
num_episodes                       0
source                          3434
average_episode_duration           0
rating                           349
main_picture.medium              144
main_picture.large               144
alternative_titles.synonyms        0
alternative_titles.en              0
alternative_titles.ja              0
start_season.year               4234
start_season.season             4234
broadcast.day_of_the_week      16660
b

In [8]:
print('(Rows, Columns) before dropping:', df.shape)

# drop rows where the synopsis/plot is null because it's an important feature for content-based-filtering
df.dropna(subset=['synopsis'], inplace=True)
# drop the broadcast.day_of_the_week and broadcast.start_time columns since a lot of data are missing
df.drop(['broadcast.day_of_the_week', 'broadcast.start_time'], axis=1, inplace=True)

print('(Rows, Columns) after dropping:', df.shape)

(Rows, Columns) before dropping: (20000, 30)
(Rows, Columns) after dropping: (15943, 28)


In [9]:
# check the range of year
print('Minimum year:', df['start_season.year'].min())
print('Maximum year:', df['start_season.year'].max())

Minimum year: 1917
Maximum year: 2023


In [10]:
# filter out anime that aired before 2000 because I don't want to recommend animes that's too old.
df = df[df['start_season.year'] >= 2000]
df.shape

(11178, 28)

In [11]:
df.index

Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
       ...
       19990, 19991, 19992, 19993, 19994, 19995, 19996, 19997, 19998, 19999],
      dtype='int64', length=11178)

In [12]:
# fix index
df.index = pd.RangeIndex(start=0, stop=0+len(df), step=1)
df.index

RangeIndex(start=0, stop=11178, step=1)

In [13]:
# print columns
df.columns.values

array(['id', 'title', 'start_date', 'end_date', 'synopsis', 'mean',
       'rank', 'popularity', 'num_list_users', 'num_scoring_users',
       'nsfw', 'created_at', 'updated_at', 'media_type', 'status',
       'num_episodes', 'source', 'average_episode_duration', 'rating',
       'main_picture.medium', 'main_picture.large',
       'alternative_titles.synonyms', 'alternative_titles.en',
       'alternative_titles.ja', 'start_season.year',
       'start_season.season', 'genres', 'studios'], dtype=object)

In [14]:
# rename columns
df = df.rename(columns={'main_picture.medium':'medium_picture_url', 'main_picture.large':'large_picture_url',
                        'alternative_titles.synonyms':'alternative_title(s)','alternative_titles.en':'english_title', 
                        'alternative_titles.ja':'japanese_title', 'start_season.year':'year', 'start_season.season':'season'})

df.head(3)

Unnamed: 0,id,title,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,num_episodes,source,average_episode_duration,rating,medium_picture_url,large_picture_url,alternative_title(s),english_title,japanese_title,year,season,genres,studios
0,5114,Fullmetal Alchemist: Brotherhood,2009-04-05,2010-07-04,After a horrific alchemy experiment goes wrong...,9.1,1,3,3149847,2003682,white,2008-08-21T03:35:22+00:00,2023-04-02T18:07:03+00:00,tv,finished_airing,64,manga,1460,r,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,"Hagane no Renkinjutsushi: Fullmetal Alchemist,...",Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,2009,spring,"Action, Adventure, Drama, Fantasy, Military, S...",Bones
1,52034,Oshi no Ko,2023-04-12,,Sixteen-year-old Ai Hoshino is a talented and ...,9.08,2,525,399851,131985,white,2022-06-09T13:01:38+00:00,2023-04-23T23:12:26+00:00,tv,currently_airing,11,manga,0,pg_13,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,My Star,Oshi No Ko,【推しの子】,2023,spring,"Drama, Reincarnation, Seinen, Showbiz, Superna...","Doga, Kobo"
2,51535,Shingeki no Kyojin: The Final Season - Kankets...,2023-03-04,2023,In the wake of Eren Yeager's cataclysmic actio...,9.08,3,511,408423,139142,white,2022-04-03T15:34:50+00:00,2023-04-12T14:02:06+00:00,special,currently_airing,2,manga,3690,r,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,"Shingeki no Kyojin: The Final Season Part 3, S...",Attack on Titan: Final Season - The Final Chap...,進撃の巨人 The Final Season完結編,2023,winter,"Action, Drama, Gore, Military, Shounen, Surviv...",MAPPA


# Text Preprocessing

In [15]:
df.columns.values

array(['id', 'title', 'start_date', 'end_date', 'synopsis', 'mean',
       'rank', 'popularity', 'num_list_users', 'num_scoring_users',
       'nsfw', 'created_at', 'updated_at', 'media_type', 'status',
       'num_episodes', 'source', 'average_episode_duration', 'rating',
       'medium_picture_url', 'large_picture_url', 'alternative_title(s)',
       'english_title', 'japanese_title', 'year', 'season', 'genres',
       'studios'], dtype=object)

In [16]:
# select and combine features that I think it's going to be useful for the recommendation engine
features = ['synopsis', 'genres']

df['summary'] = df[features].apply(lambda x: ', '.join(x.astype(str)), axis=1)

print(df['summary'][0])

After a horrific alchemy experiment goes wrong in the Elric household, brothers Edward and Alphonse are left in a catastrophic new reality. Ignoring the alchemical principle banning human transmutation, the boys attempted to bring their recently deceased mother back to life. Instead, they suffered brutal personal loss: Alphonse's body disintegrated while Edward lost a leg and then sacrificed an arm to keep Alphonse's soul in the physical realm by binding it to a hulking suit of armor.

The brothers are rescued by their neighbor Pinako Rockbell and her granddaughter Winry. Known as a bio-mechanical engineering prodigy, Winry creates prosthetic limbs for Edward by utilizing "automail," a tough, versatile metal used in robots and combat armor. After years of training, the Elric brothers set off on a quest to restore their bodies by locating the Philosopher's Stone—a powerful gem that allows an alchemist to defy the traditional laws of Equivalent Exchange.

As Edward becomes an infamous al

In [17]:
# apply tokenize
df['tokens'] = df['summary'].apply(str.lower).apply(word_tokenize)

print(df['tokens'][0])

['after', 'a', 'horrific', 'alchemy', 'experiment', 'goes', 'wrong', 'in', 'the', 'elric', 'household', ',', 'brothers', 'edward', 'and', 'alphonse', 'are', 'left', 'in', 'a', 'catastrophic', 'new', 'reality', '.', 'ignoring', 'the', 'alchemical', 'principle', 'banning', 'human', 'transmutation', ',', 'the', 'boys', 'attempted', 'to', 'bring', 'their', 'recently', 'deceased', 'mother', 'back', 'to', 'life', '.', 'instead', ',', 'they', 'suffered', 'brutal', 'personal', 'loss', ':', 'alphonse', "'s", 'body', 'disintegrated', 'while', 'edward', 'lost', 'a', 'leg', 'and', 'then', 'sacrificed', 'an', 'arm', 'to', 'keep', 'alphonse', "'s", 'soul', 'in', 'the', 'physical', 'realm', 'by', 'binding', 'it', 'to', 'a', 'hulking', 'suit', 'of', 'armor', '.', 'the', 'brothers', 'are', 'rescued', 'by', 'their', 'neighbor', 'pinako', 'rockbell', 'and', 'her', 'granddaughter', 'winry', '.', 'known', 'as', 'a', 'bio-mechanical', 'engineering', 'prodigy', ',', 'winry', 'creates', 'prosthetic', 'limbs',

In [18]:
# remove stop words and keep letters only
stop_words = stopwords.words('english')
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in stop_words and word.isalpha()])

print(df['tokens'][0])

['horrific', 'alchemy', 'experiment', 'goes', 'wrong', 'elric', 'household', 'brothers', 'edward', 'alphonse', 'left', 'catastrophic', 'new', 'reality', 'ignoring', 'alchemical', 'principle', 'banning', 'human', 'transmutation', 'boys', 'attempted', 'bring', 'recently', 'deceased', 'mother', 'back', 'life', 'instead', 'suffered', 'brutal', 'personal', 'loss', 'alphonse', 'body', 'disintegrated', 'edward', 'lost', 'leg', 'sacrificed', 'arm', 'keep', 'alphonse', 'soul', 'physical', 'realm', 'binding', 'hulking', 'suit', 'armor', 'brothers', 'rescued', 'neighbor', 'pinako', 'rockbell', 'granddaughter', 'winry', 'known', 'engineering', 'prodigy', 'winry', 'creates', 'prosthetic', 'limbs', 'edward', 'utilizing', 'automail', 'tough', 'versatile', 'metal', 'used', 'robots', 'combat', 'armor', 'years', 'training', 'elric', 'brothers', 'set', 'quest', 'restore', 'bodies', 'locating', 'philosopher', 'powerful', 'gem', 'allows', 'alchemist', 'defy', 'traditional', 'laws', 'equivalent', 'exchange'

In [19]:
#  apply tag to each token
df['tokens'] = df['tokens'].apply(lambda x: nltk.pos_tag(x))
print(df['tokens'][0])

[('horrific', 'NN'), ('alchemy', 'NN'), ('experiment', 'NN'), ('goes', 'VBZ'), ('wrong', 'JJ'), ('elric', 'JJ'), ('household', 'NN'), ('brothers', 'NNS'), ('edward', 'VBP'), ('alphonse', 'NN'), ('left', 'VBD'), ('catastrophic', 'JJ'), ('new', 'JJ'), ('reality', 'NN'), ('ignoring', 'VBG'), ('alchemical', 'JJ'), ('principle', 'NN'), ('banning', 'VBG'), ('human', 'JJ'), ('transmutation', 'NN'), ('boys', 'NNS'), ('attempted', 'VBD'), ('bring', 'VBG'), ('recently', 'RB'), ('deceased', 'VBN'), ('mother', 'NN'), ('back', 'RB'), ('life', 'NN'), ('instead', 'RB'), ('suffered', 'VBD'), ('brutal', 'JJ'), ('personal', 'JJ'), ('loss', 'NN'), ('alphonse', 'NN'), ('body', 'NN'), ('disintegrated', 'VBD'), ('edward', 'RB'), ('lost', 'VBN'), ('leg', 'NN'), ('sacrificed', 'VBN'), ('arm', 'JJ'), ('keep', 'NN'), ('alphonse', 'NN'), ('soul', 'NN'), ('physical', 'JJ'), ('realm', 'NN'), ('binding', 'VBG'), ('hulking', 'VBG'), ('suit', 'NN'), ('armor', 'NN'), ('brothers', 'NNS'), ('rescued', 'VBD'), ('neighbor

In [20]:
# convert tags to the tags that WordNetLemmatizer uses
def convert_pos(pos):
    if pos.startswith('V'):
        return wordnet.VERB
    elif pos.startswith('J'):
        return wordnet.ADJ
    elif pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# lemmatize words with pos-tags
df['cleaned_tokens'] = df['tokens'].apply(lambda x: [WordNetLemmatizer().lemmatize(w[0], pos=convert_pos(w[1])) for w in x])
print(df['cleaned_tokens'][0])

['horrific', 'alchemy', 'experiment', 'go', 'wrong', 'elric', 'household', 'brother', 'edward', 'alphonse', 'leave', 'catastrophic', 'new', 'reality', 'ignore', 'alchemical', 'principle', 'ban', 'human', 'transmutation', 'boy', 'attempt', 'bring', 'recently', 'decease', 'mother', 'back', 'life', 'instead', 'suffer', 'brutal', 'personal', 'loss', 'alphonse', 'body', 'disintegrate', 'edward', 'lose', 'leg', 'sacrifice', 'arm', 'keep', 'alphonse', 'soul', 'physical', 'realm', 'bind', 'hulk', 'suit', 'armor', 'brother', 'rescue', 'neighbor', 'pinako', 'rockbell', 'granddaughter', 'winry', 'know', 'engineering', 'prodigy', 'winry', 'create', 'prosthetic', 'limb', 'edward', 'utilize', 'automail', 'tough', 'versatile', 'metal', 'use', 'robot', 'combat', 'armor', 'year', 'train', 'elric', 'brother', 'set', 'quest', 'restore', 'body', 'locate', 'philosopher', 'powerful', 'gem', 'allow', 'alchemist', 'defy', 'traditional', 'law', 'equivalent', 'exchange', 'edward', 'become', 'infamous', 'alchemi

In [21]:
# combine all tokens into one string
df['cleaned_string'] = df['cleaned_tokens'].apply(lambda x: ' '.join(x))
print(df['cleaned_string'][0])

horrific alchemy experiment go wrong elric household brother edward alphonse leave catastrophic new reality ignore alchemical principle ban human transmutation boy attempt bring recently decease mother back life instead suffer brutal personal loss alphonse body disintegrate edward lose leg sacrifice arm keep alphonse soul physical realm bind hulk suit armor brother rescue neighbor pinako rockbell granddaughter winry know engineering prodigy winry create prosthetic limb edward utilize automail tough versatile metal use robot combat armor year train elric brother set quest restore body locate philosopher powerful gem allow alchemist defy traditional law equivalent exchange edward become infamous alchemist gain nickname fullmetal boy journey embroil grow conspiracy threatens fate world write mal rewrite action adventure drama fantasy military shounen


In [22]:
df.columns

Index(['id', 'title', 'start_date', 'end_date', 'synopsis', 'mean', 'rank',
       'popularity', 'num_list_users', 'num_scoring_users', 'nsfw',
       'created_at', 'updated_at', 'media_type', 'status', 'num_episodes',
       'source', 'average_episode_duration', 'rating', 'medium_picture_url',
       'large_picture_url', 'alternative_title(s)', 'english_title',
       'japanese_title', 'year', 'season', 'genres', 'studios', 'summary',
       'tokens', 'cleaned_tokens', 'cleaned_string'],
      dtype='object')

In [24]:
# reorder and export the cleaned data
df = df[['id', 'title', 'alternative_title(s)', 'japanese_title', 'synopsis', 'genres', 'start_date', 'end_date', 'year', 'season', 
         'mean', 'rank', 'popularity', 'rating', 'nsfw', 'media_type', 'source', 'status', 'num_episodes', 'average_episode_duration', 
         'studios', 'created_at', 'updated_at', 'medium_picture_url', 'cleaned_string']]
# df.to_csv('data/cleaned_anime_data.csv', index=False)

# Generate Vectors using TF-IDF

In [25]:
cleaned_df = pd.read_csv('data/cleaned_anime_data.csv')
cleaned_df.head()

Unnamed: 0,id,title,alternative_title(s),japanese_title,synopsis,genres,start_date,end_date,year,season,mean,rank,popularity,rating,nsfw,media_type,source,status,num_episodes,average_episode_duration,studios,created_at,updated_at,medium_picture_url,cleaned_string
0,5114,Fullmetal Alchemist: Brotherhood,"Hagane no Renkinjutsushi: Fullmetal Alchemist,...",鋼の錬金術師 FULLMETAL ALCHEMIST,After a horrific alchemy experiment goes wrong...,"Action, Adventure, Drama, Fantasy, Military, S...",2009-04-05,2010-07-04,2009,spring,9.1,1.0,3,r,white,tv,manga,finished_airing,64,1460,Bones,2008-08-21T03:35:22+00:00,2023-04-02T18:07:03+00:00,https://api-cdn.myanimelist.net/images/anime/1...,horrific alchemy experiment go wrong elric hou...
1,52034,Oshi no Ko,My Star,【推しの子】,Sixteen-year-old Ai Hoshino is a talented and ...,"Drama, Reincarnation, Seinen, Showbiz, Superna...",2023-04-12,,2023,spring,9.08,2.0,525,pg_13,white,tv,manga,currently_airing,11,0,"Doga, Kobo",2022-06-09T13:01:38+00:00,2023-04-23T23:12:26+00:00,https://api-cdn.myanimelist.net/images/anime/1...,ai hoshino talented beautiful idol adore fan p...
2,51535,Shingeki no Kyojin: The Final Season - Kankets...,"Shingeki no Kyojin: The Final Season Part 3, S...",進撃の巨人 The Final Season完結編,In the wake of Eren Yeager's cataclysmic actio...,"Action, Drama, Gore, Military, Shounen, Surviv...",2023-03-04,2023,2023,winter,9.08,3.0,511,r,white,special,manga,currently_airing,2,3690,MAPPA,2022-04-03T15:34:50+00:00,2023-04-12T14:02:06+00:00,https://api-cdn.myanimelist.net/images/anime/1...,wake eren yeager cataclysmic action friend for...
3,9253,Steins;Gate,,STEINS;GATE,Eccentric scientist Rintarou Okabe has a never...,"Drama, Psychological, Sci-Fi, Suspense, Time, ...",2011-04-06,2011-09-14,2011,spring,9.08,4.0,13,pg_13,white,tv,visual_novel,finished_airing,24,1460,"White, Fox",2010-07-26T09:23:40+00:00,2023-04-02T18:08:42+00:00,https://api-cdn.myanimelist.net/images/anime/1...,eccentric scientist rintarou okabe thirst scie...
4,41467,Bleach: Sennen Kessen-hen,Bleach: Thousand-Year Blood War Arc,BLEACH 千年血戦篇,Substitute Soul Reaper Ichigo Kurosaki spends ...,"Action, Adventure, Fantasy, Shounen",2022-10-11,2022-12-27,2022,fall,9.07,5.0,483,r,white,tv,manga,finished_airing,13,1471,Pierrot,2020-03-18T09:10:15+00:00,2023-04-02T18:07:55+00:00,https://api-cdn.myanimelist.net/images/anime/1...,substitute soul reaper ichigo kurosaki spends ...


In [26]:
# transform string to TD-IDF vector and ignore terms that have a document frequency higher than 80% and lower than 10%
tf_vec = TfidfVectorizer(ngram_range=(1,2), max_df=0.8, min_df=0.1, use_idf=True)
tfidf_matrix = tf_vec.fit_transform(cleaned_df['cleaned_string'])
tfidf_array = tfidf_matrix.toarray()

# calculate cos similarity
cos_sim = cosine_similarity(tfidf_array, tfidf_array)
print(cos_sim)

[[1.         0.53965105 0.57771543 ... 0.40944714 0.         0.        ]
 [0.53965105 1.         0.37013391 ... 0.49042074 0.35614598 0.        ]
 [0.57771543 0.37013391 1.         ... 0.3994336  0.         0.        ]
 ...
 [0.40944714 0.49042074 0.3994336  ... 1.         0.         0.        ]
 [0.         0.35614598 0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [27]:
cos_sim.shape

(11178, 11178)

# Build Recommendation Engine Based on Similarity Score

In [28]:
# create indices for movies
indices = pd.Series(cleaned_df.index, index=cleaned_df['title'])
print(indices)

title
Fullmetal Alchemist: Brotherhood                           0
Oshi no Ko                                                 1
Shingeki no Kyojin: The Final Season - Kanketsu-hen        2
Steins;Gate                                                3
Bleach: Sennen Kessen-hen                                  4
                                                       ...  
Tot Musica                                             11173
Dramaturgy                                             11174
MILGЯAM                                                11175
Inochi no Tabekata                                     11176
Shin Jidai                                             11177
Length: 11178, dtype: int64


In [32]:
# recommendation engine based on cosine similarity
def get_recommendations(title, cosine_sim, indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    first = 1
    last = 21
    top_20 = sim_scores[first:last]
    movie_indices = [i[0] for i in top_20]
    
    # return the top 20 most similar animes that doesn't contain the title
    results = cleaned_df[['title', 'alternative_title(s)', 'synopsis', 'genres', 'studios', 'mean', 'year', 'status', 'medium_picture_url']].iloc[movie_indices]
    
    # check if results contain the sequels, if yes, remove them and add new recommendation until no sequels exists in the top 20 
    filtered_results = results[results['title'].str.contains(title)==False]
    diff = results.shape[0] - filtered_results.shape[0]
    
    if diff == 0:
        return results
    else:
        add_recommendations = pd.DataFrame(columns=['title', 'alternative_title(s)', 'synopsis', 'genres', 'studios', 'mean', 'year', 'status', 'medium_picture_url'])
        while diff > 0:
            new_idx = sim_scores[last:last+diff]
            last = last + diff
            new_movie_idx = [i[0] for i in new_idx]
            new_recommendations = cleaned_df[['title', 'alternative_title(s)', 'synopsis', 'genres', 'studios', 'mean', 'year', 'status', 'medium_picture_url']].iloc[new_movie_idx]
            filtered_new_recommendations = new_recommendations[new_recommendations["title"].str.contains(title)==False]
            add_recommendations = pd.concat([add_recommendations, filtered_new_recommendations])    
            diff = new_recommendations.shape[0] - filtered_new_recommendations.shape[0]

        new_results = pd.concat([filtered_results, add_recommendations])
        return new_results

In [33]:
# example
get_recommendations('Oshi no Ko', cos_sim, indices)

Unnamed: 0,title,alternative_title(s),synopsis,genres,studios,mean,year,status,medium_picture_url
5664,Vatican Kiseki Chousakan,,Fathers Josef Kou Hiraga and Roberto Nicholas ...,"Drama, Mystery, Supernatural",J.C.Staff,6.4,2017,finished_airing,https://api-cdn.myanimelist.net/images/anime/4...
540,Wolfs Rain OVA,,As the world accelerates toward its own destru...,"Adventure, Drama, Psychological, Sci-Fi, Super...",Bones,8.0,2004,finished_airing,https://api-cdn.myanimelist.net/images/anime/1...
2638,Kaijuu no Kodomo,The Sea Monsters Children,Ruka Azumi's ordinary summer vacation revolves...,"Award, Winning, Drama, Mystery, Seinen, Supern...","Studio, °C",7.2,2019,finished_airing,https://api-cdn.myanimelist.net/images/anime/1...
5232,Night Head 2041,,"In the year 2041, World War III has wiped out ...","Drama, Mystery, Psychological, Sci-Fi, Superna...",Shirogumi,6.51,2021,finished_airing,https://api-cdn.myanimelist.net/images/anime/1...
2305,Digimon Adventure tri. 4: Soushitsu,Digimon tri. 4,The Chosen Children have been reunited with th...,"Action, Adventure, Comedy, Drama","Toei, Animation",7.28,2017,finished_airing,https://api-cdn.myanimelist.net/images/anime/1...
2822,Digimon Adventure tri. 5: Kyousei,Digimon tri. 5,"Much to the horror of Meiko Mochizuki, her Dig...","Action, Adventure, Comedy, Drama","Toei, Animation",7.16,2017,finished_airing,https://api-cdn.myanimelist.net/images/anime/1...
8662,Kowabon,,In today's world full of wondrous technologica...,Horror,ILCA,5.22,2015,finished_airing,https://api-cdn.myanimelist.net/images/anime/1...
7322,Dolls Frontline,,After World War III decimated the world's popu...,"Action, Drama, Military, Sci-Fi","Asahi, Production",5.88,2022,finished_airing,https://api-cdn.myanimelist.net/images/anime/1...
521,Evangelion: 1.0 You Are (Not) Alone,"Evangelion Shin Gekijouban: Jo, Rebuild of Eva...","In a post-apocalyptic world, the last remainin...","Action, Award, Winning, Drama, Mecha, Psycholo...",Khara,8.01,2007,finished_airing,https://api-cdn.myanimelist.net/images/anime/7...
637,Kuuchuu Buranko,"Kuchu Buranko, Trapeze, Flying Trapeze",The world of psychology is far from strange to...,"Drama, Medical, Psychological","Toei, Animation",7.92,2009,finished_airing,https://api-cdn.myanimelist.net/images/anime/3...
