In [31]:
# Standard library
import pickle
import warnings as w
from ast import literal_eval as le

# Third-party
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the spaCy pipeline once at start-up; `lemma` reuses it for every row.
nlp = spacy.load('en_core_web_sm')

# Suppress every warning raised from this point on in the notebook.
w.filterwarnings('ignore')

In [5]:
# Read the cleaned anime dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='anime_data_cleaned.csv')
df.head(n=5)

Unnamed: 0,anime_id,urls,poster,title,overview,score,views,studio,producer,genres
0,0,https://sanji.to/bungaku-shoujo-kyou-no-oyatsu...,https://img.zorores.com/_r/300x400/100/59/ce/5...,Bungaku Shoujo Kyou no Oyatsu Hatsukoi,Short episode bundled with the limited editio...,6.92,757,['ProductionI.G'],"['Lantis', 'Pony Canyon', 'Enterbrain', 'Kadok...","['Comedy', 'Fantasy', 'School']"
1,1,https://sanji.to/bungaku-shoujo-memoire-2108,https://img.zorores.com/_r/300x400/100/ec/bd/e...,Bungaku Shoujo Memoire,Episodes which depict the background stories ...,7.35,1284,['ProductionI.G'],"['Lantis', 'Pony Canyon', 'Enterbrain', 'Kadok...","['Drama', 'Romance', 'School']"
2,2,https://sanji.to/bungaku-shoujo-movie-1802,https://img.zorores.com/_r/300x400/100/5c/85/5...,Bungaku Shoujo Movie,The protagonist of the story Konoha Inoue is ...,7.43,1726,['ProductionI.G'],"['Lantis', 'Pony Canyon', 'Enterbrain', 'Kadok...","['Mystery', 'Drama', 'Romance', 'School']"
3,3,https://sanji.to/my-star-18330,https://img.zorores.com/_r/300x400/100/99/59/9...,My Star,Sixteen year old Ai Hoshino is a talented and...,9.33,4450861,['DogaKobo'],"['Kadokawa', 'Shueisha']","['Drama', 'Music', 'Supernatural', 'Seinen']"
4,4,https://sanji.to/hackgu-returner-4688,https://img.zorores.com/_r/300x400/100/e1/ff/e...,hack G U Returner,The characters from previous hack G U Games a...,6.73,270,['BeeTrain'],"['Bandai Visual', 'CyberConnect2', 'Bee Train']","['Adventure', 'Drama', 'Fantasy', 'Game', 'Mag..."


In [6]:
# Convert the stringified `genres` and `studio` columns into real Python lists.
# Values that are not strings (e.g. lists produced by a previous run of this
# cell) are passed through unchanged, so re-running the cell does not make
# `literal_eval` fail on already-parsed data.
df['genres'] = df['genres'].apply(lambda v: le(v) if isinstance(v, str) else v)
df['studio'] = df['studio'].apply(lambda v: le(v) if isinstance(v, str) else v)

# Split each overview into a list of whitespace-separated words.
# Missing overviews are replaced by '' first so they yield an empty list
# instead of the literal token 'nan' that `astype(str)` would produce;
# overviews that are already lists are kept as they are.
df['overview'] = df['overview'].fillna('').apply(
    lambda text: text if isinstance(text, list) else str(text).split()
)

In [7]:
# Build the `tags` column: for every row, concatenate the overview, genres
# and studio values, in that order.
df['tags'] = [
    words + genres + studios
    for words, genres, studios in zip(df['overview'], df['genres'], df['studio'])
]

In [8]:
# Keep only the columns the recommender uses. `.copy()` makes `new_df` an
# independent frame, so the assignment below writes to `new_df` itself rather
# than to a selection of `df` (the pattern pandas flags with
# SettingWithCopyWarning).
new_df = df[['anime_id', 'title', 'tags']].copy()

# Lower-case every token and join the tokens of each row into a single
# space-separated string.
new_df['tags'] = new_df['tags'].apply(
    lambda tokens: ' '.join(token.lower() for token in tokens)
)

In [10]:
# new_df.to_csv('anime_rec.csv', index=False)

In [11]:
# Let's make a recommendation system

# First we lemmatize the tags using spacy

def lemma(text):
    """Return ``text`` with every token replaced by its lemma.

    The string is processed by the notebook-level ``nlp`` pipeline and the
    ``lemma_`` of each resulting token is joined with single spaces.
    """
    lemmas = (tok.lemma_ for tok in nlp(text))
    return ' '.join(lemmas)

# Replace each row's tag string with the output of `lemma`.
new_df['tags'] = new_df['tags'].map(lemma)

In [23]:
# Bag-of-words encoding of the tags: the vocabulary is capped at 5447 features,
# English stop words are dropped, and the resulting count matrix is converted
# to a dense NumPy array.
cv = CountVectorizer(stop_words='english', max_features=5447)
vector = cv.fit_transform(raw_documents=new_df['tags']).toarray()

In [27]:
# Cosine similarity between every pair of rows of `vector`; show the first
# row as a quick look at the result.
similarity = cosine_similarity(X=vector)
similarity[0, :]

array([1.        , 0.34815531, 0.12573892, ..., 0.03143473, 0.09379581,
       0.11396058])

In [30]:
def recommend(anime, top_n=10, data=None, sim=None):
    """Print the titles most similar to ``anime``, best match first.

    Parameters
    ----------
    anime : str
        Title to look up; it must match a value of the ``title`` column exactly.
    top_n : int, default 10
        Maximum number of titles to print.
    data : pandas.DataFrame, optional
        Frame with a ``title`` column. Defaults to the notebook-level ``new_df``.
    sim : 2-D array-like, optional
        Similarity matrix whose rows/columns follow the row order of ``data``.
        Defaults to the notebook-level ``similarity``.

    Raises
    ------
    IndexError
        If ``anime`` is not present in the ``title`` column.
    """
    data = new_df if data is None else data
    sim = similarity if sim is None else sim

    # Work with the row *position*, not the index label: the similarity matrix
    # is positional, so a label is only usable when the index is 0..n-1.
    matches = np.flatnonzero((data['title'] == anime).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Anime title not found: {anime!r}")
    position = int(matches[0])

    # Stable descending sort, so ties keep their original row order.
    order = np.argsort(-np.asarray(sim[position]), kind='stable')
    # Drop the query row explicitly instead of assuming it sorts first; another
    # row with an equal score could otherwise push the query into the results.
    order = order[order != position][:top_n]

    for pos in order:
        print(data['title'].iloc[pos])
    
# Example query: print the recommendations for the exact title 'Naruto'.
recommend('Naruto')

Naruto Shippuden
Boruto Naruto Next Generations
Boruto Naruto the Movie
Naruto Shippuuden Movie 4 The Lost Tower
Naruto Shippuuden Movie 5 Blood Prison
Naruto Shippuuden Movie 6 Road to Ninja
Naruto Shippuuden Movie 3 Inheritors of Will of Fire
Naruto Shippuden the Movie 2 Bonds 
Naruto OVA2 The Lost Story Mission Protect the Waterfall Village
Naruto OVA7 Chunin Exam on Fire and Naruto vs Konohamaru 


In [32]:
# Save the similarity matrix to 'similarity_matrix.pkl' with pickle.
# The `with` block closes the file handle (flushing the written data) even if
# `pickle.dump` raises, instead of leaving an unclosed handle behind.
with open('similarity_matrix.pkl', 'wb') as fh:
    pickle.dump(similarity, fh)