In [1]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sb

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

# Other
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../archive/anime_with_synopsis.csv')
animelist_df = pd.read_csv('../archive/animelist.csv')

In [3]:
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [4]:
animelist_df = animelist_df.groupby('user_id')
animelist_df.head()

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,0,67,9,1,1
1,0,6702,7,1,4
2,0,242,10,1,4
3,0,4898,0,1,1
4,0,21,10,1,0
...,...,...,...,...,...
109224715,353404,6033,10,1,77
109224716,353404,223,9,2,153
109224717,353404,225,8,2,64
109224718,353404,987,4,2,1


In [5]:
df["sypnopsis"] = df["sypnopsis"].fillna("")

#drop all rows that have a null synopsis
df = df[df["sypnopsis"] != ""]

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16206 entries, 0 to 16213
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   MAL_ID     16206 non-null  int64 
 1   Name       16206 non-null  object
 2   Score      16206 non-null  object
 3   Genres     16206 non-null  object
 4   sypnopsis  16206 non-null  object
dtypes: int64(1), object(4)
memory usage: 759.7+ KB


In [6]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
corpus = df['sypnopsis']
tfidf_matrix = tf.fit_transform(corpus)

In [8]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [9]:
top_10_similar_indexes = list(pd.Series(cosine_sim[10]).sort_values(ascending = False).iloc[1:11].index)
for i in top_10_similar_indexes:
    print(df['Name'][i])


Boruto: Naruto Next Generations
Naruto: Shippuuden
Naruto: Shippuuden Movie 6 - Road to Ninja
Naruto: Shippuuden Movie 4 - The Lost Tower
Boruto: Naruto the Movie
Naruto: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo! - Konoha no Sato no Dai Undoukai
Naruto: Shippuuden - Shippuu! "Konoha Gakuen" Den
Naruto SD: Rock Lee no Seishun Full-Power Ninden
The Last: Naruto the Movie
Naruto: Shippuuden Movie 5 - Blood Prison


In [11]:
def recommend(title, cosine_sim=cosine_sim):
    recommended_anime = []
    idx = df[df['Name'] == title].index[0]
    top_10_similar_indexes = list(pd.Series(cosine_sim[idx]).sort_values(ascending = False).iloc[1:11].index)
    # top_10_indexes = list(score_series.iloc[1:11].index)
    for i in top_10_similar_indexes:
        recommended_anime.append(list(df['Name'])[i])
    return recommended_anime


In [12]:
recommend('Tengen Toppa Gurren Lagann', cosine_sim)

['Nagisa',
 'Tengen Toppa Gurren Lagann: Ore no Gurren wa Pikka-Pika!!',
 'Tengen Toppa Gurren Lagann: Mitee Mono wa Miteen da!!',
 'Nagi no Asu kara',
 'Blue Remains',
 'Muv-Luv Alternative: Total Eclipse',
 'Geisters: Fractions of the Earth',
 'Shinkai no Kantai: Submarine 707',
 'Yuurei Yashiki',
 'Berserk: Ougon Jidai-hen I - Haou no Tamago']

In [12]:
df[df['Name'] == 'Nagi no Asu kara']

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
6485,16067,Nagi no Asu kara,8.09,"Drama, Fantasy, Romance","ong ago, all humans lived beneath the sea. How..."


# Text Similarity

### Using Roberta Large Model

In [7]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
model = SentenceTransformer('stsb-roberta-large')

In [9]:
corpus = df['sypnopsis'].tolist()
len(corpus)

16206

In [10]:
corpus = df['sypnopsis'].tolist()
embeddings = model.encode(corpus, convert_to_tensor=True)
torch.save(embeddings, 'corpus_embeddings.pt')

In [11]:
corpus_embeddings = torch.load('corpus_embeddings.pt')

In [17]:
sentence = "I want something about pirates"

sentence_embedding = model.encode(sentence, convert_to_tensor=True)

In [18]:
top_k = 5
cos_scores = util.pytorch_cos_sim(sentence_embedding, corpus_embeddings)[0]

top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]

print("\n\n======================\n\n")
print("Query:", sentence)
print("\nTop 10 most similar sentences in corpus:")
for idx in top_results[0:top_k]:
    print(corpus[idx].strip(), "(Score: %.4f)" % (cos_scores[idx]))
    





Query: I want something about pirates

Top 10 most similar sentences in corpus:
Gol D. Roger was known as the "Pirate King," the strongest and most infamous being to have sailed the Grand Line. The capture and execution of Roger by the World Government brought a change throughout the world. His last words before his death revealed the existence of the greatest treasure in the world, One Piece. It was this revelation that brought about the Grand Age of Pirates, men who dreamed of finding One Piece—which promises an unlimited amount of riches and fame—and quite possibly the pinnacle of glory and the title of the Pirate King. Enter Monkey D. Luffy, a 17-year-old boy who defies your standard definition of a pirate. Rather than the popular persona of a wicked, hardened, toothless pirate ransacking villages for fun, Luffy's reason for being a pirate is one of pure wonder: the thought of an exciting adventure that leads him to intriguing people and ultimately, the promised treasure. Follo