In [28]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sb

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

# Other
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [29]:
# Mounting colab drive
# from google.colab import drive
# drive.mount('/content/drive')

In [30]:
# path = '/content/drive/MyDrive/Anime Recommender NLP Based/data/anime_with_synopsis.csv'
path = '../archive/anime_with_synopsis.csv'

In [31]:
df = pd.read_csv(path)
#animelist_df = pd.read_csv('../archive/animelist.csv')

In [32]:
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [33]:
#animelist_df = animelist_df.groupby('user_id')
#animelist_df.head()

In [34]:
df["sypnopsis"] = df["sypnopsis"].fillna("")

#drop all rows that have a null synopsis
df = df[df["sypnopsis"] != ""]

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16206 entries, 0 to 16213
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   MAL_ID     16206 non-null  int64 
 1   Name       16206 non-null  object
 2   Score      16206 non-null  object
 3   Genres     16206 non-null  object
 4   sypnopsis  16206 non-null  object
dtypes: int64(1), object(4)
memory usage: 759.7+ KB


In [36]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
corpus = df['sypnopsis']
tfidf_matrix = tf.fit_transform(corpus)

In [37]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [38]:
top_10_similar_indexes = list(pd.Series(cosine_sim[10]).sort_values(ascending = False).iloc[1:11].index)
for i in top_10_similar_indexes:
    print(df['Name'][i])


Boruto: Naruto Next Generations
Naruto: Shippuuden
Naruto: Shippuuden Movie 6 - Road to Ninja
Naruto: Shippuuden Movie 4 - The Lost Tower
Boruto: Naruto the Movie
Naruto: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo! - Konoha no Sato no Dai Undoukai
Naruto: Shippuuden - Shippuu! "Konoha Gakuen" Den
Naruto SD: Rock Lee no Seishun Full-Power Ninden
The Last: Naruto the Movie
Naruto: Shippuuden Movie 5 - Blood Prison


In [39]:
def recommend(title, cosine_sim=cosine_sim):
    recommended_anime = []
    idx = df[df['Name'] == title].index[0]
    top_10_similar_indexes = list(pd.Series(cosine_sim[idx]).sort_values(ascending = False).iloc[1:11].index)
    # top_10_indexes = list(score_series.iloc[1:11].index)
    for i in top_10_similar_indexes:
        recommended_anime.append(list(df['Name'])[i])
    return recommended_anime


In [40]:
recommend('Tengen Toppa Gurren Lagann', cosine_sim)

['Nagisa',
 'Tengen Toppa Gurren Lagann: Ore no Gurren wa Pikka-Pika!!',
 'Tengen Toppa Gurren Lagann: Mitee Mono wa Miteen da!!',
 'Nagi no Asu kara',
 'Blue Remains',
 'Muv-Luv Alternative: Total Eclipse',
 'Geisters: Fractions of the Earth',
 'Shinkai no Kantai: Submarine 707',
 'Yuurei Yashiki',
 'Berserk: Ougon Jidai-hen I - Haou no Tamago']

In [41]:
df[df['Name'] == 'Nagi no Asu kara']

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
6485,16067,Nagi no Asu kara,8.09,"Drama, Fantasy, Romance","ong ago, all humans lived beneath the sea. How..."


# Text Similarity

### Using Roberta Large Model

In [42]:
#!pip install -U sentence-transformers



In [43]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch

In [44]:
model = SentenceTransformer('stsb-roberta-large')

In [45]:
corpus = df['sypnopsis'].tolist()
len(corpus)

16206

In [46]:
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [47]:
corpus = df['sypnopsis'].tolist()
# embeddings = model.encode(corpus, convert_to_tensor=True)
# torch.save(embeddings, 'corpus_embeddings.pt')

In [48]:
corpus_embeddings = torch.load('corpus_embeddings.pt', map_location=torch.device('cpu'))

In [49]:
sentence = 'Naruto Uzumaki'

sentence_embedding = model.encode(sentence, convert_to_tensor=True)

In [50]:
sentence_embedding

tensor([-0.5698, -1.0850,  0.1022,  ..., -1.2083, -0.8890,  0.2651])

In [51]:
top_k = 20
cos_scores = util.pytorch_cos_sim(sentence_embedding, corpus_embeddings)[0]

top_results = np.argpartition(-cos_scores.cpu(), range(top_k))[0:top_k]

print("\n\n======================\n\n")
print("Query:", sentence)
print("\nTop 10 most similar sentences in corpus:")
for idx in top_results[0:top_k]:
    title_name = df[df['sypnopsis'] == corpus[idx].strip()]['Name'].values[0]
    print(title_name, "(Score: %.4f)" % (cos_scores[idx]))
    





Query: Naruto Uzumaki

Top 10 most similar sentences in corpus:
Shin Koihime†Musou: Otome Tairan - Gakuensai da yo! Zenin Shuugou no Koto (Score: 0.6242)
Time Ranger Cesar Boy no Bouken: Roma Teikoku-hen (Score: 0.6239)
Yuuto-kun ga Iku Movie (Score: 0.6220)
Midori no Makibao Compilation OVA (Score: 0.6177)
Yuuki Yuuna wa Yuusha de Aru: Dai Mankai no Shou (Score: 0.6072)
Mashin Eiyuuden Wataru: Soukaizan Eiyuu Densetsu (Score: 0.5904)
Gochuumon wa Usagi Desu ka? Bloom (Score: 0.5857)
Goku Sayonara Zetsubou Sensei (Score: 0.5822)
Goku Sayonara Zetsubou Sensei (Score: 0.5822)
Mushi no Tsubuyaki (Score: 0.5765)
Hana no Asukagumi! 2: Lonely Cats Battle Royale (Score: 0.5753)
Saru Kani Gassen (1927) (Score: 0.5704)
Muddy Water (Score: 0.5691)
Dreams (Score: 0.5683)
Sayonara Gokko (Score: 0.5674)
Minegishi-san wa Ootsu-kun ni Tabesasetai (Score: 0.5649)
Chiisai Aki Mitsuketa (1982) (Score: 0.5638)
Furusato no Gogatsu (Score: 0.5616)
Kazaguruma (1996) (Score: 0.5559)
Pipi Tobenai Hotaru (

# Named Entity Recognition

In [52]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\35191\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\35191\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [53]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\35191\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [54]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\35191\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.


True

In [55]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\35191\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [56]:
def preprocess(sent):
  sent = nltk.word_tokenize(sent)
  sent = nltk.pos_tag(sent)
  return sent

In [57]:
text = df.sample(1)['sypnopsis'].values[0]
sent = preprocess(text)

In [58]:
pattern = 'NP : {<DT>?<JJ>*<NN>}'

In [59]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

In [60]:
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk import ne_chunk
from pprint import pprint
iob_tagged = tree2conlltags(cs)

In [61]:
ne_tree = ne_chunk(pos_tag(word_tokenize(text)))
print(ne_tree)

(S
  own/JJ
  is/VBZ
  attacked/VBN
  by/IN
  a/DT
  deadly/JJ
  creature/NN
  ,/,
  it/PRP
  can/MD
  only/RB
  be/VB
  stopped/VBN
  with/IN
  the/DT
  (ORGANIZATION Dragon/NNP Blade/NNP)
  ./.
  The/DT
  one/CD
  person/NN
  who/WP
  knows/VBZ
  where/WRB
  the/DT
  blade/NN
  is/VBZ
  wo/MD
  n't/RB
  tell/VB
  (PERSON Lang/NNP)
  ,/,
  and/CC
  even/RB
  if/IN
  he/PRP
  did/VBD
  ,/,
  untold/JJ
  peril/NN
  will/MD
  fall/VB
  on/IN
  anyone/NN
  who/WP
  dares/VBZ
  to/TO
  find/VB
  this/DT
  legendary/JJ
  weapon/NN
  ./.
  (/(
  (PERSON Source/NN)
  :/:
  AniDB/NNP
  )/))


#Topic Extraction

# Keyword Extraction

In [62]:
#!pip install spacy



In [63]:
import spacy
from collections import Counter
from string import punctuation
nlp = spacy.load("en_core_web_sm")
def get_hotwords(text,n):
    result = []
    pos_tag = ['PROPN', 'ADJ', 'NOUN'] 
    doc = nlp(text.lower()) 
    for token in doc:
        if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
            continue
        if(token.pos_ in pos_tag):
            result.append(token.text)
    output = set(result)
    most_common_list = Counter(output).most_common(n)
    return most_common_list

#output = set(get_hotwords(new_text))
#most_common_list = Counter(output).most_common(10)

most_common_list = get_hotwords(new_text, 10)

for item in most_common_list:
  print(item[0])

learning
score
precision
tag
performance
machine
prediction
correct
match
example


# Text Summarization