# **ANIME RECOMMENDER**

## Loading the Libraries

In [22]:
import pandas as pd
import numpy as np
from collections import Counter
import random
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

## Loading the Dataset

In [23]:
anime = pd.read_csv('Anime_Dataset.csv')

In [24]:
anime.head()

Unnamed: 0,name,Id,jname,pganime,quality,epsub,epdub,totalep,formats,duration,desc,aired,premired,statusAnime,malscore,genre,studio,producer,animechar,image
0,Jungle no Ouja Taa-chan,jungle-no-ouja-taa-chan-3446,Jungle no Ouja Taa-chan,R+,,1.0,False,50,TV,24m,Ta-chan is the king of jungle. He was abandone...,"Oct 14, 1993 to Sep 29, 1994",Fall 1993,Finished Airing,7.02,"['Action', 'Comedy', 'Parody']",Group TAC,"['TV Tokyo', 'Group TAC']","[{'name': 'JaneNarahashi, Miki', 'voice': 'Nar...",https://img.flawlessfiles.com/_r/300x400/100/f...
1,Majestic Prince Episode 25 – Wings to the Future,majestic-prince-episode-25-wings-to-the-future...,Ginga Kikoutai Majestic Prince: Mirai e no Tsu...,PG-13,,1.0,1,False,Special,23m,Team Fawn from Gurantseere Academy dreams of o...,"Sep 29, 2016",,Finished Airing,6.45,"['Action', 'Sci-Fi', 'Space', 'Mecha', 'School...",Orange,"['Sotsu', 'Orange', 'Seven Arcs Pictures', 'Se...","[{'name': 'Magalanes, YuiAmamiya, Sora', 'voic...",https://img.flawlessfiles.com/_r/300x400/100/b...
2,My Hero Academia Season 2: Hero Notebook,my-hero-academia-season-2-hero-notebook-2208,Boku no Hero Academia 2nd Season: Hero Note,PG-13,,1.0,1,False,Special,23m,Recap of Boku no Hero Academia that aired a we...,"Mar 25, 2017",,Finished Airing,7.33,"['Action', 'Comedy', 'Super Power']",Bones,"['Bones', 'Funimation']","[{'name': 'All MightMiyake, Kenta', 'voice': '...",https://img.flawlessfiles.com/_r/300x400/100/b...
3,Ghost in the Shell: Stand Alone Complex 2nd GI...,ghost-in-the-shell-stand-alone-complex-2nd-gig...,Koukaku Kidoutai: Stand Alone Complex 2nd GIG ...,R,,1.0,1,False,Special,2h 41m,"Compilation movie of the ""Individual Eleven"" s...","Jan 27, 2006",,Finished Airing,8.02,"['Action', 'Mecha', 'Sci-Fi']",Production I.G,"['Production I.G', 'Bandai Visual', 'Tokuma Sh...","[{'name': 'Aramaki, DaisukeRoberts, Russell', ...",https://img.flawlessfiles.com/_r/300x400/100/e...
4,Ghost in the Shell: Stand Alone Complex - The ...,ghost-in-the-shell-stand-alone-complex-the-lau...,Koukaku Kidoutai: Stand Alone Complex - The La...,R,HD,1.0,1,False,Special,2h 40m,"In 2024, the terrorist incident known as ""The ...",,2h 40m,8.1,Production I.G,"['Action', 'Mystery', 'Mecha', 'Sci-Fi', 'Poli...",Production I.G,[],"[{'name': 'Aramaki, DaisukeRoberts, Russell', ...",https://img.flawlessfiles.com/_r/300x400/100/4...


In [25]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6928 entries, 0 to 6927
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         6928 non-null   object 
 1   Id           6928 non-null   object 
 2   jname        6927 non-null   object 
 3   pganime      6899 non-null   object 
 4   quality      6879 non-null   object 
 5   epsub        6906 non-null   float64
 6   epdub        6928 non-null   object 
 7   totalep      6928 non-null   object 
 8   formats      6928 non-null   object 
 9   duration     6928 non-null   object 
 10  desc         6811 non-null   object 
 11  aired        6749 non-null   object 
 12  premired     6301 non-null   object 
 13  statusAnime  6928 non-null   object 
 14  malscore     6865 non-null   object 
 15  genre        6928 non-null   object 
 16  studio       6677 non-null   object 
 17  producer     6928 non-null   object 
 18  animechar    5564 non-null   object 
 19  image 

In [26]:
anime.shape

(6928, 20)

## Data Preprocessing

### Eliminating null values and duplicate entries from the dataset.

In [27]:
# checking for duplicated entries

anime.duplicated().sum()

0

In [28]:
# checking for null values in the dataset

anime.isnull().sum()

Unnamed: 0,0
name,0
Id,0
jname,1
pganime,29
quality,49
epsub,22
epdub,0
totalep,0
formats,0
duration,0


Since we are unable to retrieve the descriptions for certain anime, we will drop any rows that contain null values in the 'description' column.

In [29]:
# Dropping the rows which doesn't have description

anime.dropna(subset=['desc'], inplace = True)

In [30]:
anime.shape

(6811, 20)

In [31]:
# checking for null values in the dataset

anime.isnull().sum()

Unnamed: 0,0
name,0
Id,0
jname,1
pganime,29
quality,49
epsub,21
epdub,0
totalep,0
formats,0
duration,0


---

**Eliminating unnecessary columns.**

In [32]:
irr_cols = ['Id','premired','malscore','animechar']

anime.drop(columns=irr_cols, axis=1, inplace=True)

In [33]:
anime.shape

(6811, 16)

---

**Totalep Column imputation**

In [34]:
anime['totalep'].value_counts()

Unnamed: 0_level_0,count
totalep,Unnamed: 1_level_1
False,1936
12,1607
13,637
26,308
2,227
...,...
115,1
373,1
214,1
358,1


totalep column has 1936 entries as False,\
we will replace it with the epsub values

In [35]:
anime['epsub'].isnull().sum()

21

but epsub has 21 null entries,\
we will replace it with corresponding epdub values

In [36]:
anime.loc[anime['epsub'].isnull(), 'epsub'] = anime.loc[anime['epsub'].isnull(), 'epdub']

 '1' '1' '104' '26']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  anime.loc[anime['epsub'].isnull(), 'epsub'] = anime.loc[anime['epsub'].isnull(), 'epdub']


In [37]:
anime['epsub'].isnull().sum()

0

In [38]:
# replacing False entries of totalep with corresponding epsub values

anime.loc[anime['totalep']=='False', 'totalep'] = anime.loc[anime['totalep']=='False', 'epsub']

---
\\
**pganime colummn imputation**

In [39]:
anime['pganime'].isnull().sum()

29

In [40]:
anime['pganime'].value_counts()

Unnamed: 0_level_0,count
pganime,Unnamed: 1_level_1
PG-13,4251
R,936
G,621
R+,587
PG,381
Rx,6


PG-13 is been used the most, so we will replace NaN entries with PG-13

In [41]:
anime['pganime'].fillna('PG-13', inplace = True)

---
\\
**quality colummn imputation**

In [42]:
anime['quality'].isnull().sum()

49

In [43]:
anime['quality'].value_counts()

Unnamed: 0_level_0,count
quality,Unnamed: 1_level_1
HD,5991
SD,769
CAM,2


HD is been used the most, so we will replace NaN entries with HD

In [44]:
anime['quality'].fillna('HD', inplace = True)

c

In [45]:
anime['aired'][600:620]

Unnamed: 0,aired
609,Spring 2015
610,"Jul 6, 2016 to Sep 21, 2016"
611,Fall 2019
612,"Jul 5, 2021 to ?"
613,"Jul 22, 2009"
614,Winter 1988
615,Summer 2006
616,Fall 2021
617,"Sep 30, 2021"
618,


There are 4 types or format entries in the aired column \\
- Spring 2015
- Apr 11, 2021 to ?
- Oct 21, 1994 to Aug 21, 1996
- Sep 30, 2021

we will convert all entries in the format of Spring 2015

In [46]:
# list to store monts, seasons, year

month = []
season = []
year = []


In [47]:
# function to get season on months

def get_season(month):

    month = month.lower()

    if month in ['dec', 'jan', 'feb']:
        return "Winter"
    elif month in ['mar', 'apr', 'may']:
        return "Spring"
    elif month in ['jun', 'jul', 'aug']:
        return "Summer"
    else:
        return "Fall"

In [48]:
# function to get season and year

def get_aired(aired):

    ele = aired.split(',')

    if len(ele) == 1:       # Spring 2015
        return ele

    if len(ele) == 2:       # Apr 11, 2021 to ?  or  Sep 30, 2021

        if len(ele[-1]) == 4:       # Sep 30, 2021
            month.append(ele[0][:3])  # append Sep
            m = ele[0][:3]

            year.append(ele[-1])        # append 2021
            y = ele[-1]

            s = get_season(m)
            season.append(s)            # append season(Sep) = Fall

        else:               # Apr 11, 2021 to ?
            month.append(ele[0][:3])  # append Apr
            m = ele[0][:3]

            year.append(ele[-1][1:5])        # append 2021
            y = ele[-1][1:5]

            s = get_season(m)
            season.append(s)            # append season(Apr) = Spring

    else:           # Oct 21, 1994 to Aug 21, 1996
        month.append(ele[0][:3])  # append Oct
        m = ele[0][:3]

        year.append(ele[1][1:5])        # append 1994
        y = ele[1][1:5]

        s = get_season(m)
        season.append(s)                # append season(Oct) = Fall


    return [s]+[y]


In [49]:
get_aired('Spring 2015')

['Spring 2015']

In [50]:
get_aired('Apr 11, 2021 to ?')

['Spring', '2021']

In [51]:
get_aired('Oct 21, 1994 to Aug 21, 1996')

['Fall', '1994']

In [52]:
get_aired('Sep 30, 2021')

['Fall', '2021']

Now we can replace all the non NaN entries with [season, year] entries

In [53]:
anime['aired'] = anime['aired'].dropna().apply(get_aired)

In [54]:
anime['aired']

Unnamed: 0,aired
0,"[Fall, 1993]"
1,"[Fall, 2016]"
2,"[Spring, 2017]"
3,"[Winter, 2006]"
4,
...,...
6923,"[Summer, 2011]"
6924,"[Spring, 2020]"
6925,"[Summer, 2007]"
6926,"[Winter, 1998]"


In [57]:
# getting mode of month ,season and year

c_year =Counter(year)
c_season = Counter(season)

In [56]:
c_year.most_common(3)

[('2020', 218), ('2014', 216), ('2015', 214)]

In [58]:
c_season.most_common(3)

[('Spring', 1302), ('Fall', 1163), ('Winter', 1027)]

we will randomly take most common year and season and replace it in place of NaN

In [59]:
ls_year = ['2020','2014','2015']
ls_season = ['Spring','Fall','Winter']

In [61]:
anime['aired'] = anime['aired'].isnull().apply(lambda x: [random.choice(ls_season)] + [random.choice(ls_year)])

In [62]:
anime.isnull().sum()

Unnamed: 0,0
name,0
jname,1
pganime,0
quality,0
epsub,0
epdub,0
totalep,0
formats,0
duration,0
desc,0


---
\\
**Studios column imputation**

In [66]:
anime['studio']

Unnamed: 0,studio
0,Group TAC
1,Orange
2,Bones
3,Production I.G
4,Production I.G
...,...
6923,Sunrise
6924,Connect
6925,Madhouse
6926,Radix


In [67]:
# getting the most common studios

anime['studio'].mode()

Unnamed: 0,studio
0,Toei Animation


we will replace NaN entries with 'Toei Animation'

In [68]:
anime['studio'].fillna('Toei Animation', inplace=True)

---
\\
**Desc column imputation**

In [69]:
anime['desc'][0]

'Ta-chan is the king of jungle. He was abandoned in savanna, and has been raised by a chimpanzee, Etekichi. He has a dearest wife, Jane who used to be a top model in New York, but now she looks... Anyway, to protect animals from poachers, Ta-chan is fighting against them with his disciple, Pedro, and a master of Chinese martial arts, Ryo.\n\nThis anime is based on the manga with the same title; it begins as a comical parody of Tarzan. The first several episodes are comedies, but the rest of the series consists of (rather) serious episodes.\n\n(Source: AnimeNfo)\n                                \n                                        Ta-chan is the king of jungle. He was abandoned in savanna, and has been raised by a chimpanzee, Etekichi. He has a dearest wife, Jane who used to be a top model in New York, but now she looks... Anyway, to protect animals from poachers, Ta-chan is fighting against them with his disciple, Pedro, and a master of Chinese martial arts, Ryo.\n\nThis anime is 

In [70]:
anime['desc'][1]

'Team Fawn from Gurantseere Academy dreams of one day fighting beside the heroes of Team Rabbit. One day, they finally get their chance as the GDF launches another attack.\n\n(Source: Crunchyroll)\n                                \n                                        Team Fawn from Gurantseere Academy dreams of one day fighting beside the heroes of Team Rabbit. One day, they finally get their chance as the GDF launches another attack.\n\n(Source: Crunchyroll)'

As we can see the anime description is repeated twice, \\
we have to remove the second one \\
we we look closely we can see that the paragraphs are seperated by \n\n

In [72]:
anime['desc'][1].split('\n\n')[0]

'Team Fawn from Gurantseere Academy dreams of one day fighting beside the heroes of Team Rabbit. One day, they finally get their chance as the GDF launches another attack.'

In [77]:
anime['desc'][55]

'A squadron of robots defends the planet against alien invaders. \n\n(Source: AniDB)\n                                \n                                        A squadron of robots defends the planet against alien invaders. \n\n(Source: AniDB)'

In [78]:
anime['desc'] = anime['desc'].apply(lambda x: x.split('\n\n')[0])

---
\\
**statusAnime column elimination**

In [84]:
anime['statusAnime'].value_counts()

Unnamed: 0_level_0,count
statusAnime,Unnamed: 1_level_1
Finished Airing,4463
?,405
Currently Airing,65
7.36,19
7.39,18
...,...
4.82,1
6.03,1
9.09,1
5.88,1


as we can see the most of the entries are numeric, but 4 entries are textual
 \\
we will replace them to a numeric values as follows \
- Finished Airing = 10.0
- ? = mean value
- Currently Airing = 4.5
- Not yet Aired = 0

In [88]:
anime['statusAnime'][222]

'Finished Airing'

In [89]:
# getting the rows which contains numeric values

num = anime['statusAnime'].loc[(anime['statusAnime'] != 'Finished Airing') & (anime['statusAnime'] != '?') & (anime['statusAnime'] != 'Currently Airing') & (anime['statusAnime'] != 'Not yet aired')]

In [92]:
# calulating mean

mean = num.apply(lambda x: eval(x)).mean()
mean

6.867890250399574

the mean = 6.87

In [94]:
# replacing the entries

anime['statusAnime'] = anime['statusAnime'].replace({'Finished Airing':'10.0',
                              '?':'6.87',
                              'Currently Airing':'4.5',
                              'Not yet aired':'0'})

---
\\
**Genre column imputation**

In [99]:
type(anime['genre'][0])

str

There are no NaN values present in genre column but it is of sting type

we have to convert it into list

In [101]:
# funtiion to remove spaces and convert it into list

def get_genre(ele):
    ls = []
    for i in ast.literal_eval(ele):
        i = i.replace(' ','')
        ls.append(i)
    return ls

In [102]:
anime['genre'] = anime['genre'].apply(get_genre)

---
\\
**Producer column imputation**

In [103]:
anime['producer'].value_counts()

Unnamed: 0_level_0,count
producer,Unnamed: 1_level_1
[],2707
['Toei Animation'],107
['Nippon Animation'],35
['Sunrise'],34
['TMS Entertainment'],26
...,...
"['Fuji TV', 'Tokyo Movie Shinsha']",1
"['Fuji TV', 'Toho', 'Sony Music Entertainment', 'Doga Kobo', 'Sentai Filmworks']",1
"['Sotsu Music Publishing', 'Gonzo', 'Discotek Media']",1
"['VAP', 'Arms', 'Sentai Filmworks']",1


most of the entries are empty list [] ,which is 39% of the data , so we will remove it

In [105]:
anime.drop(columns = ['producer'], axis = 1, inplace = True)

In [107]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6811 entries, 0 to 6927
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         6811 non-null   object
 1   jname        6810 non-null   object
 2   pganime      6811 non-null   object
 3   quality      6811 non-null   object
 4   epsub        6811 non-null   object
 5   epdub        6811 non-null   object
 6   totalep      6811 non-null   object
 7   formats      6811 non-null   object
 8   duration     6811 non-null   object
 9   desc         6811 non-null   object
 10  aired        6811 non-null   object
 11  statusAnime  6811 non-null   object
 12  genre        6811 non-null   object
 13  studio       6811 non-null   object
 14  image        6811 non-null   object
dtypes: object(15)
memory usage: 1.1+ MB


---
---

### Sving the dataset

In [116]:
anime.drop(columns = ['epsub','epdub'], axis =1, inplace = True)

In [117]:
anime.head()

Unnamed: 0,name,jname,pganime,quality,totalep,formats,duration,desc,aired,statusAnime,genre,studio,image
0,Jungle no Ouja Taa-chan,Jungle no Ouja Taa-chan,R+,HD,50.0,TV,24m,Ta-chan is the king of jungle. He was abandone...,"[Winter, 2014]",10.0,"[Action, Comedy, Parody]",Group TAC,https://img.flawlessfiles.com/_r/300x400/100/f...
1,Majestic Prince Episode 25 – Wings to the Future,Ginga Kikoutai Majestic Prince: Mirai e no Tsu...,PG-13,HD,1.0,Special,23m,Team Fawn from Gurantseere Academy dreams of o...,"[Fall, 2014]",10.0,"[Action, Sci-Fi, Space, Mecha, School, Seinen]",Orange,https://img.flawlessfiles.com/_r/300x400/100/b...
2,My Hero Academia Season 2: Hero Notebook,Boku no Hero Academia 2nd Season: Hero Note,PG-13,HD,1.0,Special,23m,Recap of Boku no Hero Academia that aired a we...,"[Fall, 2015]",10.0,"[Action, Comedy, SuperPower]",Bones,https://img.flawlessfiles.com/_r/300x400/100/b...
3,Ghost in the Shell: Stand Alone Complex 2nd GI...,Koukaku Kidoutai: Stand Alone Complex 2nd GIG ...,R,HD,1.0,Special,2h 41m,"Compilation movie of the ""Individual Eleven"" s...","[Winter, 2015]",10.0,"[Action, Mecha, Sci-Fi]",Production I.G,https://img.flawlessfiles.com/_r/300x400/100/e...
4,Ghost in the Shell: Stand Alone Complex - The ...,Koukaku Kidoutai: Stand Alone Complex - The La...,R,HD,1.0,Special,2h 40m,"In 2024, the terrorist incident known as ""The ...","[Winter, 2020]",8.1,"[Action, Mystery, Mecha, Sci-Fi, Police, Psych...",Production I.G,https://img.flawlessfiles.com/_r/300x400/100/4...


In [121]:
anime.to_csv('anime_df.csv', index=False)

---
---

In [213]:
anime = pd.read_csv('anime_df.csv')

In [214]:
anime.head()

Unnamed: 0,name,jname,pganime,quality,totalep,formats,duration,desc,aired,statusAnime,genre,studio,image
0,Jungle no Ouja Taa-chan,Jungle no Ouja Taa-chan,R+,HD,50.0,TV,24m,Ta-chan is the king of jungle. He was abandone...,"['Winter', '2014']",10.0,"['Action', 'Comedy', 'Parody']",Group TAC,https://img.flawlessfiles.com/_r/300x400/100/f...
1,Majestic Prince Episode 25 – Wings to the Future,Ginga Kikoutai Majestic Prince: Mirai e no Tsu...,PG-13,HD,1.0,Special,23m,Team Fawn from Gurantseere Academy dreams of o...,"['Fall', '2014']",10.0,"['Action', 'Sci-Fi', 'Space', 'Mecha', 'School...",Orange,https://img.flawlessfiles.com/_r/300x400/100/b...
2,My Hero Academia Season 2: Hero Notebook,Boku no Hero Academia 2nd Season: Hero Note,PG-13,HD,1.0,Special,23m,Recap of Boku no Hero Academia that aired a we...,"['Fall', '2015']",10.0,"['Action', 'Comedy', 'SuperPower']",Bones,https://img.flawlessfiles.com/_r/300x400/100/b...
3,Ghost in the Shell: Stand Alone Complex 2nd GI...,Koukaku Kidoutai: Stand Alone Complex 2nd GIG ...,R,HD,1.0,Special,2h 41m,"Compilation movie of the ""Individual Eleven"" s...","['Winter', '2015']",10.0,"['Action', 'Mecha', 'Sci-Fi']",Production I.G,https://img.flawlessfiles.com/_r/300x400/100/e...
4,Ghost in the Shell: Stand Alone Complex - The ...,Koukaku Kidoutai: Stand Alone Complex - The La...,R,HD,1.0,Special,2h 40m,"In 2024, the terrorist incident known as ""The ...","['Winter', '2020']",8.1,"['Action', 'Mystery', 'Mecha', 'Sci-Fi', 'Poli...",Production I.G,https://img.flawlessfiles.com/_r/300x400/100/4...


## Natural Language Processing

In this section we will remove the spaces and convert them into list for merging them later

In [215]:
# removing spaces in studio column

anime['studio'] = anime['studio'].apply(lambda x: x.replace(' ',''))

In [216]:
anime.columns

Index(['name', 'jname', 'pganime', 'quality', 'totalep', 'formats', 'duration',
       'desc', 'aired', 'statusAnime', 'genre', 'studio', 'image'],
      dtype='object')

In [217]:
# converting the columns into list

anime['desc'] = anime['desc'].apply(lambda x: x.split())
anime['pganime'] = anime['pganime'].apply(lambda x: x.split())
anime['quality'] = anime['quality'].apply(lambda x: x.split())
anime['totalep'] = anime['totalep'].astype(str).apply(lambda x: x.split())
anime['formats'] = anime['formats'].apply(lambda x: x.split())
anime['duration'] = anime['duration'].apply(lambda x: x.split())
anime['statusAnime'] = anime['statusAnime'].astype(str).apply(lambda x: x.split())
anime['studio'] = anime['studio'].apply(lambda x: x.split())

In [218]:
anime.head()

Unnamed: 0,name,jname,pganime,quality,totalep,formats,duration,desc,aired,statusAnime,genre,studio,image
0,Jungle no Ouja Taa-chan,Jungle no Ouja Taa-chan,[R+],[HD],[50.0],[TV],[24m],"[Ta-chan, is, the, king, of, jungle., He, was,...","['Winter', '2014']",[10.0],"['Action', 'Comedy', 'Parody']",[GroupTAC],https://img.flawlessfiles.com/_r/300x400/100/f...
1,Majestic Prince Episode 25 – Wings to the Future,Ginga Kikoutai Majestic Prince: Mirai e no Tsu...,[PG-13],[HD],[1.0],[Special],[23m],"[Team, Fawn, from, Gurantseere, Academy, dream...","['Fall', '2014']",[10.0],"['Action', 'Sci-Fi', 'Space', 'Mecha', 'School...",[Orange],https://img.flawlessfiles.com/_r/300x400/100/b...
2,My Hero Academia Season 2: Hero Notebook,Boku no Hero Academia 2nd Season: Hero Note,[PG-13],[HD],[1.0],[Special],[23m],"[Recap, of, Boku, no, Hero, Academia, that, ai...","['Fall', '2015']",[10.0],"['Action', 'Comedy', 'SuperPower']",[Bones],https://img.flawlessfiles.com/_r/300x400/100/b...
3,Ghost in the Shell: Stand Alone Complex 2nd GI...,Koukaku Kidoutai: Stand Alone Complex 2nd GIG ...,[R],[HD],[1.0],[Special],"[2h, 41m]","[Compilation, movie, of, the, ""Individual, Ele...","['Winter', '2015']",[10.0],"['Action', 'Mecha', 'Sci-Fi']",[ProductionI.G],https://img.flawlessfiles.com/_r/300x400/100/e...
4,Ghost in the Shell: Stand Alone Complex - The ...,Koukaku Kidoutai: Stand Alone Complex - The La...,[R],[HD],[1.0],[Special],"[2h, 40m]","[In, 2024,, the, terrorist, incident, known, a...","['Winter', '2020']",[8.1],"['Action', 'Mystery', 'Mecha', 'Sci-Fi', 'Poli...",[ProductionI.G],https://img.flawlessfiles.com/_r/300x400/100/4...


Merging all the columns into a single column

the genre and aired columns are string instead of list,
 \\
so we will convert it to list

In [219]:
# function to convert to list

def to_list(ele):
    ls = []
    for i in ast.literal_eval(ele):
        i = i.replace(' ','')
        ls.append(i)
    return ls

In [220]:
anime['aired'] = anime['aired'].apply(to_list)

In [221]:
anime['genre'] = anime['genre'].apply(to_list)

In [224]:
anime['Tags'] = anime['genre'] + anime['desc'] + anime['studio'] + anime['statusAnime'] + anime['pganime'] + anime['quality'] + anime['formats'] + anime['totalep'] + anime['aired'] + anime['duration']

In [225]:
anime['Tags'].head()

Unnamed: 0,Tags
0,"[Action, Comedy, Parody, Ta-chan, is, the, kin..."
1,"[Action, Mecha, School, Sci-Fi, Seinen, Space,..."
2,"[Action, Comedy, SuperPower, Recap, of, Boku, ..."
3,"[Action, Mecha, Sci-Fi, Compilation, movie, of..."
4,"[Action, Mecha, Mystery, Police, Psychological..."


In [226]:
anime_df = anime[['name','Tags']]

In [227]:
anime_df.head()

Unnamed: 0,name,Tags
0,Jungle no Ouja Taa-chan,"[Action, Comedy, Parody, Ta-chan, is, the, kin..."
1,Majestic Prince Episode 25 – Wings to the Future,"[Action, Mecha, School, Sci-Fi, Seinen, Space,..."
2,My Hero Academia Season 2: Hero Notebook,"[Action, Comedy, SuperPower, Recap, of, Boku, ..."
3,Ghost in the Shell: Stand Alone Complex 2nd GI...,"[Action, Mecha, Sci-Fi, Compilation, movie, of..."
4,Ghost in the Shell: Stand Alone Complex - The ...,"[Action, Mecha, Mystery, Police, Psychological..."


## Stemming the words

In [199]:
ps = PorterStemmer()

we will stem each word in the tags column

In [228]:
# function to stem each word and converting it to lower case

def stem(ele):
    ls = []
    for i in ele:
        ls.append((ps.stem(i)).lower())
    return ' '.join(ls)

In [240]:
anime_df['Tags'] = anime_df['Tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anime_df['Tags'] = anime_df['Tags'].apply(stem)


## Model Training

using TfidfVectorizer to get vector

In [233]:
tfv = TfidfVectorizer(stop_words='english')

In [241]:
vectors = tfv.fit_transform(anime_df['Tags']).toarray()

In [235]:
vectors

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [236]:
vectors.shape

(6811, 27067)

we need the distance of each anime(vector) with each other to know how similar they are , we will use cosine_similarity for that

In [237]:
cos_simi = cosine_similarity(vectors)

In [238]:
cos_simi[0]

array([1.        , 0.02126423, 0.00668727, ..., 0.0218958 , 0.01492883,
       0.01163381])

the first anime is similar to itself i.e, 100% and to second 2% and to third 0.6% and so on

In [239]:
cos_simi.shape

(6811, 6811)

## Getting Recommandation

In [247]:
def get_anime(name):

    # index of anime
    anime_idx = anime_df[anime_df['name'] == name].index[0]

    distance = cos_simi[anime_idx]

    anime_ls = sorted(list(enumerate(distance)), reverse=True, key=lambda x: x[1])[1:6]

    for i,s in anime_ls:
        print(anime_df['name'][i])

In [253]:
get_anime(anime_df['name'][3363])

Death Note: Relight
Soul Eater
Persona 3 the Movie 4: Winter of Rebirth
Momo, Girl God of Death
Bleach: Memories in the Rain


# saving the similarity matrix

In [254]:
import pickle as pkl

In [257]:
with open('anime_similarity.pkl','wb') as f:
    pkl.dump(cos_simi,f)