In [9]:
import pandas as pd
import numpy as np
from ast import literal_eval as le
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import spacy
nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity
import pickle

import warnings as w
w.filterwarnings('ignore')

In [3]:
# Load the cleaned anime catalogue and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='anime_data_cleaned.csv')
df.head(n=5)

Unnamed: 0,anime_id,urls,poster,title,overview,score,studio,producer,genres,word_count
0,0,https://aniwatch.to/bungaku-shoujo-kyou-no-oya...,https://img.flawlessfiles.com/_r/300x400/100/5...,"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi",short episode bundled with the limited edition...,6.92,['ProductionI.G'],"['Lantis', 'Pony Canyon', 'Enterbrain', 'Kadok...","['Comedy', 'Fantasy', 'School']",90
1,1,https://aniwatch.to/bungaku-shoujo-memoire-2108,https://img.flawlessfiles.com/_r/300x400/100/e...,"""Bungaku Shoujo"" Memoire",episodes which depict the background stories o...,7.35,['ProductionI.G'],"['Lantis', 'Pony Canyon', 'Enterbrain', 'Kadok...","['Drama', 'Romance', 'School']",88
2,2,https://aniwatch.to/bungaku-shoujo-movie-1802,https://img.flawlessfiles.com/_r/300x400/100/5...,"""Bungaku Shoujo"" Movie",the protagonist of the story konoha inoue is a...,7.43,['ProductionI.G'],"['Lantis', 'Pony Canyon', 'Enterbrain', 'Kadok...","['Mystery', 'Drama', 'Romance', 'School']",187
3,3,https://aniwatch.to/my-star-18330,https://img.flawlessfiles.com/_r/300x400/100/9...,My Star,in the entertainment world celebrities often s...,9.33,['DogaKobo'],"['Kadokawa', 'Shueisha']","['Drama', 'Music', 'Supernatural', 'Seinen']",218
4,4,https://aniwatch.to/hackgu-returner-4688,https://img.flawlessfiles.com/_r/300x400/100/e...,.hack//G.U. Returner,the characters from previous hack g u games an...,6.73,['BeeTrain'],"['Bandai Visual', 'CyberConnect2', 'Bee Train']","['Adventure', 'Drama', 'Fantasy', 'Game', 'Mag...",146


In [4]:
# Converting studio and genres from their string form (e.g. "['Comedy', 'School']")
# into real Python lists, and tokenising the overview text.
df['genres'] = df['genres'].apply(le)
df['studio'] = df['studio'].apply(le)
# Replace missing overviews with an empty string *before* casting to str;
# otherwise NaN becomes the literal word 'nan' and ends up in the tags.
df['overview'] = df['overview'].fillna('').astype(str)
# Whitespace-split each overview into a list of tokens so it can be
# concatenated with the genre/studio lists.
df['overview'] = df['overview'].apply(str.split)

In [5]:
# Build the `tags` column: for every row, the overview tokens followed by the
# genre list and then the studio list (element-wise list concatenation).
overview_and_genres = df['overview'].add(df['genres'])
df['tags'] = overview_and_genres.add(df['studio'])

In [6]:
# Keep only the columns the recommender needs. The explicit .copy() makes
# new_df an independent frame, so the column assignment below is not a write
# to a slice of df (the SettingWithCopyWarning pattern).
new_df = df[['anime_id', 'title', 'tags']].copy()
# Lower-case every token and collapse each token list into a single
# space-separated string, in one pass.
new_df['tags'] = new_df['tags'].apply(
    lambda tokens: ' '.join(token.lower() for token in tokens)
)

In [7]:
# Persist id/title/tags without the index column.
# Note: this runs before the stemming cell, so the file holds the lower-cased
# but unstemmed tags.
new_df.to_csv(path_or_buf='anime_rec.csv', index=False)

In [8]:
# Preview the first five rows of the trimmed frame.
new_df.head(n=5)

Unnamed: 0,anime_id,title,tags
0,0,"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi",short episode bundled with the limited edition...
1,1,"""Bungaku Shoujo"" Memoire",episodes which depict the background stories o...
2,2,"""Bungaku Shoujo"" Movie",the protagonist of the story konoha inoue is a...
3,3,My Star,in the entertainment world celebrities often s...
4,4,.hack//G.U. Returner,the characters from previous hack g u games an...


In [14]:
# Stemming the tags
# spaCy's default English stop-word set, used to filter tokens before stemming.
sw = nlp.Defaults.stop_words

def stemmer(text):
    """Remove stop words from ``text`` and stem the remaining tokens.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    # Build the stemmer once per call instead of once per token.
    stem = SnowballStemmer(language='english').stem
    return ' '.join(stem(token) for token in text.split() if token not in sw)

In [15]:
# Replace each tag string with its stop-word-filtered, stemmed version.
new_df['tags'] = new_df['tags'].map(stemmer)

In [16]:
# Preview the first five rows after stemming.
new_df.head(n=5)

Unnamed: 0,anime_id,title,tags
0,0,"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi",short episod bundl limit edit releas bungaku s...
1,1,"""Bungaku Shoujo"" Memoire",episod depict background stori bungaku shoujo ...
2,2,"""Bungaku Shoujo"" Movie",protagonist stori konoha inou seem normal seni...
3,3,My Star,entertain world celebr exagger version public ...
4,4,.hack//G.U. Returner,charact previous hack g u game hack root recei...


In [17]:
# TF-IDF vectorisation of the stemmed tags: unigrams through trigrams, with the
# vocabulary capped at 10,000 features.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))
# Keep the result as the sparse matrix returned by fit_transform. Calling
# .toarray() would materialise every zero of the documents x features matrix,
# and cosine_similarity accepts sparse input directly.
vector = vectorizer.fit_transform(new_df['tags'])
print(vector.shape)

(5578, 10000)


In [18]:
# Pairwise cosine similarity between every pair of rows of `vector`;
# row i holds the similarity of item i to every item.
similarity = cosine_similarity(X=vector)
# Show the similarity scores of the first item as a sanity check.
similarity[0, :]

array([1.        , 0.1397442 , 0.03061966, ..., 0.00205087, 0.01379438,
       0.04945737])

In [19]:
def recommend(anime, top_n=10):
    """Print the titles of the anime most similar to ``anime``.

    Parameters
    ----------
    anime : str
        Exact title to look up in ``new_df['title']``. If several rows share
        the title, the first one is used.
    top_n : int, default 10
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``new_df`` has the given title.
    """
    # Work with row *positions* throughout: `similarity` is indexed by
    # position, so an index label is only correct while new_df keeps a
    # default 0..n-1 index.
    matches = np.flatnonzero((new_df['title'] == anime).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No anime titled {anime!r} found in new_df['title']")
    position = matches[0]

    scores = np.asarray(similarity[position])
    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query row explicitly rather than assuming it is ranked first
    # (an item with identical tags could otherwise displace it).
    ranked = ranked[ranked != position][:top_n]

    for row in ranked:
        print(new_df['title'].iloc[row])


recommend('Naruto')

Naruto: Shippuden
Naruto: Shippuuden Movie 4 - The Lost Tower
Boruto: Naruto Next Generations
Naruto: Shippuuden Movie 6: Road to Ninja
Naruto Shippuuden Movie 3: Inheritors of Will of Fire
Boruto: Naruto the Movie
Naruto: Shippuuden Movie 5 - Blood Prison
Naruto OVA2: The Lost Story - Mission: Protect the Waterfall Village
Naruto: Shippuden the Movie 2 -Bonds-
Boruto: Naruto the Movie - The Day Naruto Became the Hokage


In [20]:
# Saving the similarity matrix as pickle file.
# The context manager closes the file handle as soon as the dump finishes, so
# the data is fully flushed to disk and the handle is not leaked.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook so the cell runs on other machines.
with open(r'D:\Github\Anime-Recommender\similarity_matrix.pkl', 'wb') as pkl_file:
    pickle.dump(similarity, pkl_file)