In [1]:
# Data handling.
import pandas as pd
import numpy as np
# literal_eval (aliased `le`) is used below to parse the stringified list columns.
from ast import literal_eval as le
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
import spacy
# Load spaCy's small English pipeline; in the cells visible here only its
# default stop-word list is used.
nlp = spacy.load('en_core_web_sm')
from sklearn.metrics.pairwise import cosine_similarity
import pickle

import warnings as w
# NOTE(review): this silences every warning for the whole session, including any
# pandas chained-assignment warnings — consider narrowing the filter.
w.filterwarnings('ignore')

In [2]:
# Read the cleaned anime table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='anime_data_cleaned.csv')
df.head(n=5)

Unnamed: 0,anime_id,urls,poster,title,overview,score,views,studio,producer,genres,word_count
0,0,https://sanji.to/bungaku-shoujo-kyou-no-oyatsu...,https://img.zorores.com/_r/300x400/100/59/ce/5...,"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi",short episode bundled with the limited edition...,6.92,757,['ProductionI.G'],"['Lantis', 'Pony Canyon', 'Enterbrain', 'Kadok...","['Comedy', 'Fantasy', 'School']",90
1,1,https://sanji.to/bungaku-shoujo-memoire-2108,https://img.zorores.com/_r/300x400/100/ec/bd/e...,"""Bungaku Shoujo"" Memoire",episodes which depict the background stories o...,7.35,1284,['ProductionI.G'],"['Lantis', 'Pony Canyon', 'Enterbrain', 'Kadok...","['Drama', 'Romance', 'School']",88
2,2,https://sanji.to/bungaku-shoujo-movie-1802,https://img.zorores.com/_r/300x400/100/5c/85/5...,"""Bungaku Shoujo"" Movie",the protagonist of the story konoha inoue is a...,7.43,1726,['ProductionI.G'],"['Lantis', 'Pony Canyon', 'Enterbrain', 'Kadok...","['Mystery', 'Drama', 'Romance', 'School']",187
3,3,https://sanji.to/my-star-18330,https://img.zorores.com/_r/300x400/100/99/59/9...,My Star,sixteen year old ai hoshino is a talented and ...,9.33,4450861,['DogaKobo'],"['Kadokawa', 'Shueisha']","['Drama', 'Music', 'Supernatural', 'Seinen']",224
4,4,https://sanji.to/hackgu-returner-4688,https://img.zorores.com/_r/300x400/100/e1/ff/e...,.hack//G.U. Returner,the characters from previous hack g u games an...,6.73,270,['BeeTrain'],"['Bandai Visual', 'CyberConnect2', 'Bee Train']","['Adventure', 'Drama', 'Fantasy', 'Game', 'Mag...",146


In [3]:
# Parse the stringified genres and studio columns into real Python lists,
# then break each overview into a list of whitespace-separated words.
for list_col in ('genres', 'studio'):
    df[list_col] = df[list_col].apply(le)
df['overview'] = df['overview'].astype(str).apply(lambda text: text.split())

In [4]:
# Build one token list per anime by concatenating, element-wise, the overview
# words with the genre list and the studio list.
df['tags'] = df['overview'].add(df['genres']).add(df['studio'])

In [5]:
# Keep only the columns the recommender needs. `.copy()` makes new_df an
# independent frame, so the assignment below writes to new_df itself rather than
# to a slice of df (the chained-assignment pattern pandas warns about).
new_df = df[['anime_id', 'title', 'tags']].copy()
# Lower-case every token and join each list into one space-separated string.
new_df['tags'] = new_df['tags'].apply(lambda tokens: ' '.join(token.lower() for token in tokens))

In [6]:
# Write the id/title/tags table to CSV without the index column.
# NOTE(review): this runs before the stemming step further down, so the file
# holds the un-stemmed tags — confirm that is intended.
new_df.to_csv(path_or_buf='anime_rec.csv', index=False)

In [8]:
# Preview the first five rows of the joined, lower-cased tags.
new_df.head(n=5)

Unnamed: 0,anime_id,title,tags
0,0,"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi",short episode bundled with the limited edition...
1,1,"""Bungaku Shoujo"" Memoire",episodes which depict the background stories o...
2,2,"""Bungaku Shoujo"" Movie",the protagonist of the story konoha inoue is a...
3,3,My Star,sixteen year old ai hoshino is a talented and ...
4,4,.hack//G.U. Returner,the characters from previous hack g u games an...


In [9]:
# Stemming the tags
# Stop words: spaCy's default English list without 'no'.
# Work on a copy so the shared nlp.Defaults.stop_words collection is not mutated
# and this cell can be re-run safely (removing 'no' from the shared collection
# raises on the second run, once it is already gone).
sw = set(nlp.Defaults.stop_words)
sw.discard('no')

def stemmer(text):
    """Drop stop words from ``text`` and Porter-stem the remaining words.

    Args:
        text: String of whitespace-separated tokens.

    Returns:
        The tokens not found in ``sw``, stemmed and joined by single spaces.
    """
    # Build the stemmer once per call instead of once per token.
    porter = PorterStemmer()
    return ' '.join(porter.stem(word) for word in text.split() if word not in sw)

In [10]:
# Replace each tags string with its stop-word-free, stemmed version.
# NOTE(review): this overwrites the column in place, so re-running the cell
# stems text that has already been stemmed.
new_df['tags'] = new_df['tags'].map(stemmer)

In [11]:
# Preview the first five rows after stemming.
new_df.head(n=5)

Unnamed: 0,anime_id,title,tags
0,0,"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi",short episod bundl limit edit releas bungaku s...
1,1,"""Bungaku Shoujo"" Memoire",episod depict background stori bungaku shoujo ...
2,2,"""Bungaku Shoujo"" Movie",protagonist stori konoha inou seemingli normal...
3,3,My Star,sixteen year old ai hoshino talent beauti idol...
4,4,.hack//G.U. Returner,charact previou hack g u game hack root receiv...


In [18]:
# Bag-of-words encoding of the tags: unigrams through trigrams, ignoring terms
# found in more than 80% of the documents and keeping at most 10,000 features.
cv = CountVectorizer(ngram_range=(1, 3), max_features=10000, max_df=0.8)
tag_counts = cv.fit_transform(new_df['tags'])
vector = tag_counts.toarray()  # dense count matrix, one row per anime
print(vector.shape)

(5447, 10000)


In [19]:
# Pairwise cosine similarity between all rows of the count matrix.
similarity = cosine_similarity(X=vector)
similarity[0, :]  # scores of the first anime against every anime

array([1.        , 0.23094011, 0.0860663 , ..., 0.02024441, 0.06098367,
       0.0745356 ])

In [20]:
def recommend(anime, top_n=10):
    """Print the titles most similar to ``anime``.

    Args:
        anime: Exact title to look up in ``new_df['title']``.
        top_n: Number of recommendations to print (default 10).

    Raises:
        IndexError: If no row of ``new_df`` has the given title.
    """
    titles = new_df['title']
    # Positional row number of the first matching title. `similarity` and
    # `.iloc` are both positional, so index labels are deliberately not used.
    matches = np.flatnonzero((titles == anime).to_numpy())
    if matches.size == 0:
        raise IndexError(f"anime title not found: {anime!r}")
    index = int(matches[0])

    scores = np.asarray(similarity[index])
    # Stable descending sort: rows with equal scores stay in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query row explicitly rather than assuming it sorts first
    # (another row with an identical score could otherwise take its place).
    neighbours = [row for row in ranked if row != index][:top_n]
    for row in neighbours:
        print(titles.iloc[row])
    
# Example query: print the recommendations for one title.
recommend(anime='Naruto')

Naruto: Shippuden
Boruto: Naruto Next Generations
Naruto OVA2: The Lost Story - Mission: Protect the Waterfall Village
Naruto: Shippuuden Movie 4 - The Lost Tower
Boruto: Naruto the Movie
Naruto: Shippuuden Movie 5 - Blood Prison
Naruto: Shippuuden Movie 6: Road to Ninja
Naruto Shippuuden Movie 3: Inheritors of Will of Fire
Naruto OVA7: Chunin Exam on Fire! and Naruto vs. Konohamaru!
Boruto: Naruto the Movie - The Day Naruto Became the Hokage


In [21]:
# Saving the similarity matrix as pickle file
# The context manager closes (and flushes) the file handle once the dump is done.
with open('similarity_matrix.pkl', 'wb') as pkl_file:
    pickle.dump(similarity, pkl_file)