In [1]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sb

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

# Other
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [2]:
# Mounting colab drive
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# path = '/content/drive/MyDrive/Anime Recommender NLP Based/data/anime_with_synopsis.csv'
path = '../archive/anime_with_synopsis.csv'

In [4]:
df = pd.read_csv(path)
#animelist_df = pd.read_csv('../archive/animelist.csv')

In [5]:
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [6]:
#animelist_df = animelist_df.groupby('user_id')
#animelist_df.head()

In [7]:
df["sypnopsis"] = df["sypnopsis"].fillna("")

#drop all rows that have a null synopsis
df = df[df["sypnopsis"] != ""]

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16206 entries, 0 to 16213
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   MAL_ID     16206 non-null  int64 
 1   Name       16206 non-null  object
 2   Score      16206 non-null  object
 3   Genres     16206 non-null  object
 4   sypnopsis  16206 non-null  object
dtypes: int64(1), object(4)
memory usage: 759.7+ KB


In [9]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
corpus = df['sypnopsis']
tfidf_matrix = tf.fit_transform(corpus)

In [10]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [11]:
top_10_similar_indexes = list(pd.Series(cosine_sim[10]).sort_values(ascending = False).iloc[1:11].index)
for i in top_10_similar_indexes:
    print(df['Name'][i])


Boruto: Naruto Next Generations
Naruto: Shippuuden
Naruto: Shippuuden Movie 6 - Road to Ninja
Naruto: Shippuuden Movie 4 - The Lost Tower
Boruto: Naruto the Movie
Naruto: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo! - Konoha no Sato no Dai Undoukai
Naruto: Shippuuden - Shippuu! "Konoha Gakuen" Den
Naruto SD: Rock Lee no Seishun Full-Power Ninden
The Last: Naruto the Movie
Naruto: Shippuuden Movie 5 - Blood Prison


In [58]:
import spacy
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained Word2Vec model
word2vec_model = Word2Vec.load('path/to/pretrained/word2vec/model')

# Load SpaCy model for text preprocessing
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]
    return tokens

def get_recommendations(query):
    # Preprocess query
    query_tokens = preprocess_text(query)

    # Transform query tokens into word embeddings
    query_embeddings = [word2vec_model.wv[token] for token in query_tokens if token in word2vec_model.wv]

    if not query_embeddings:
        return []

    # Get document embeddings
    document_embeddings = [word2vec_model.wv[token] for token in word2vec_model.wv.vocab]

    # Calculate cosine similarities
    cosine_similarities = cosine_similarity(query_embeddings, document_embeddings).flatten()

    # Sort indices based on similarity scores
    related_anime_indices = cosine_similarities.argsort()[::-1]

    # Return recommended anime names
    return [df['Name'].iloc[index] for index in related_anime_indices[:10]]


get_recommendations("Cool animes about pirates")

13560                         One Piece Movie 14: Stampede
2091        Scramble Wars: Tsuppashire! Genom Trophy Rally
1425                                   Peter Pan no Bouken
5997                                     One Piece Film: Z
419                                      One Piece Movie 1
14216                      Afraid To Be Cool / Raise Me Up
3103     Naruto: Shippuuden - Shippuu! "Konoha Gakuen" Den
1103                 One Piece: Mamore! Saigo no Dai Butai
2315                                           Crusher Joe
12445                                    Ku Pao Ying Xiong
Name: Name, dtype: object

In [12]:
def recommend(title, cosine_sim=cosine_sim):
    recommended_anime = []
    idx = df[df['Name'] == title].index[0]
    top_10_similar_indexes = list(pd.Series(cosine_sim[idx]).sort_values(ascending = False).iloc[1:11].index)
    # top_10_indexes = list(score_series.iloc[1:11].index)
    for i in top_10_similar_indexes:
        recommended_anime.append(list(df['Name'])[i])
    return recommended_anime


In [55]:
recommend('Tengen Toppa Gurren Lagann', cosine_sim)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [14]:
df[df['Name'] == 'Nagi no Asu kara']

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
6485,16067,Nagi no Asu kara,8.09,"Drama, Fantasy, Romance","ong ago, all humans lived beneath the sea. How..."


# Text Similarity

### Using Roberta Large Model

In [15]:
#!pip install -U sentence-transformers

In [16]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
model = SentenceTransformer('stsb-roberta-large')

In [18]:
corpus = df['sypnopsis'].tolist()
len(corpus)

16206

In [19]:
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [20]:
corpus = df['sypnopsis'].tolist()
# embeddings = model.encode(corpus, convert_to_tensor=True)
# torch.save(embeddings, 'corpus_embeddings.pt')

In [21]:
corpus_embeddings = torch.load('corpus_embeddings.pt', map_location=torch.device('cpu'))

In [54]:
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [52]:
sentence = 'pirate'

sentence_embedding = model.encode(sentence, convert_to_tensor=True)

In [41]:
sentence_embedding

tensor([ 0.3177,  1.0460,  0.5271,  ..., -0.2072, -0.3559, -0.8465])

In [53]:
top_k = 10
cos_scores = util.pytorch_cos_sim(sentence_embedding, corpus_embeddings)[0]

top_results = np.argpartition(-cos_scores.cpu(), range(top_k))[0:top_k]

print("\n\n======================\n\n")
print("Query:", sentence)
print("\nTop 10 most similar sentences in corpus:")
for idx in top_results[0:top_k]:
    title_name = df[df['sypnopsis'] == corpus[idx].strip()]['Name'].values[0]
    print(title_name, "(Score: %.4f)" % (cos_scores[idx]))
    





Query: pirate

Top 10 most similar sentences in corpus:
Dr. Slump Movie 11: Dr. Mashirito & Abale-chan (Score: 0.4534)
Fairy Tail Movie 2: Dragon Cry (Score: 0.4491)
Straw Byururu (Score: 0.4210)
Lupin III: Cagliostro no Shiro (Score: 0.4154)
Ryuuichi Manga Gekijou Onbu Obake (Score: 0.4145)
Nagagutsu wo Haita Neko no Bouken (Score: 0.4142)
Dragon Quest: Abel Yuusha Densetsu (Score: 0.4131)
Dakara Boku wa, H ga Dekinai. (Score: 0.4121)
A New Journey (Score: 0.4099)
Kamishibai Itazura Tanuki no Maki (Score: 0.4077)


# Named Entity Recognition

In [25]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\35191\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\35191\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [26]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\35191\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [27]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\35191\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [28]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\35191\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [29]:
def preprocess(sent):
  sent = nltk.word_tokenize(sent)
  sent = nltk.pos_tag(sent)
  return sent

In [30]:
text = df.sample(1)['sypnopsis'].values[0]
sent = preprocess(text)

In [31]:
pattern = 'NP : {<DT>?<JJ>*<NN>}'

In [32]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

In [33]:
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk import ne_chunk
from pprint import pprint
iob_tagged = tree2conlltags(cs)

In [34]:
ne_tree = ne_chunk(pos_tag(word_tokenize(text)))
print(ne_tree)

(S
  monkey/NN
  tricks/VBZ
  a/DT
  crab/NN
  and/CC
  steals/NNS
  his/PRP$
  food/NN
  ./.)


#Topic Extraction

# Keyword Extraction

In [35]:
#!pip install spacy

In [36]:
import spacy
from collections import Counter
from string import punctuation
nlp = spacy.load("en_core_web_sm")
def get_hotwords(text,n):
    result = []
    pos_tag = ['PROPN', 'ADJ', 'NOUN'] 
    doc = nlp(text.lower()) 
    for token in doc:
        if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
            continue
        if(token.pos_ in pos_tag):
            result.append(token.text)
    output = set(result)
    most_common_list = Counter(output).most_common(n)
    return most_common_list

#output = set(get_hotwords(new_text))
#most_common_list = Counter(output).most_common(10)

# most_common_list = get_hotwords(new_text, 10)

# for item in most_common_list:
#   print(item[0])

# Text Summarization