In [1]:
import numpy as np
import pandas as pd
import torch
import requests
import json
import re
import time
import inflect
import nltk
import spacy
import datetime
import tensorflow as tf
import plotly.graph_objects as go
import pickle

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/L

### Global Variables

In [2]:
p = inflect.engine()
base_url = "https://api.jikan.moe/v4/"
tool_kit = spacy.load("en_core_web_sm")

## Helper Functions

In [19]:
def unpacked_title_synonmys(a):
    titles = a[1:-1]
    unpacked = []
    for i in a:
        unpacked.append(i)
    return unpacked


def text_cleaner(text):
    text = re.sub("[^a-zA-Z0-9]", " ", text)
    text = text.strip()
    return text


def handle_english_title(a):
    if pd.isna(a["title_english"]):
        return a["title_default"]
    return a["title_english"]

def handle_title_synonyms(a):
    if pd.isna(a["title_synonyms"]):
        return a["title_default"]
    return a["title_synonyms"]

def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return ' '.join(tokens)

def clean_synopsis(a):
    find = "[Written by MAL Rewrite]"
    if find in a:
        a = a.replace(find, "")
        a = a.strip()
        return a
    else:
        a = a.strip()
        return a

def update_synopsis(a):
    if a["title_english"] == a["title_synonyms"]:
        return a["synopsis"] + " " +a["title_english"]
    else:
        return a["synopsis"] + " " + a["title_synonyms"] + " " + a["title_english"]



def get_embedding(text):
    processed_text = preprocess_text(text)  # Preprocess input
    return model.encode(processed_text, show_progress_bar = False)  # Convert to vector

## Gathering The Data

In [35]:
response = requests.request("GET", base_url+'anime?page=10')   # hitting the URL w.r.t page number
response = json.loads(response.text)
data = response["data"]
# print(data[10]["title_synonyms"])
unpacked_title = []
for i in data[10]["title_synonyms"]:
    unpacked_title.append(i)
print("".join(unpacked_title))
# print(data[10]["genres"])

Happy Lesson OVA


In [None]:
# animes dictionary which will be converted to DataFrame
animes = {
    "mal_id": [],          # store mal_id of Anime
    "title_default": [],   # store default title of Anime
    "title_english": [],   # store English title of Anime
    "title_synonyms": [],  # store Synonyms of title of Anime
    "synopsis": [],        # store description of Anime
    "season": [],          # store season of Anime
    "genres": [],          # store genres of Anime
}

page = 1   # pages counter as we are getting data of anime's w.r.t pages

# an initial request to get "has_next_page" bool value which will be acting as a controller for while loop
response = requests.request("GET", base_url+"anime")   # API response after hitting the base_url+anime
response = json.loads(response.text)                   # load response in json for better interoperability
next_page = response["pagination"]["has_next_page"]    # next_page controller
 
try:
    while next_page:
        print("URL :- ",base_url+'anime?page='+str(page)) # acting as a chceker what URL we are hitting
        response = requests.request("GET", base_url+'anime?page='+str(page))   # hitting the URL w.r.t page number
        response = json.loads(response.text)
        pagination = response["pagination"]
        data = response["data"]     # getting data from response

        # unloading data from data
        for i in range(len(data)):
            # appendning data in animes Dictionary
            animes["mal_id"].append(data[i]["mal_id"])               
            animes["title_default"].append(data[i]["title"])
            animes["title_english"].append(data[i]["title_english"])
            # unpacking title_synonyms into unpacked_titles
            titles = data[i]["title_synonyms"]
            unpacked_titles = [] 
            for _ in titles:
                unpacked_titles.append(_)
            animes["title_synonyms"].append(" ".join(unpacked_titles))
            
            animes["synopsis"].append(data[i]["synopsis"])
            animes["season"].append(data[i]["season"])
            genres = data[i]["genres"]

            # unpacking genres into genres_fetched
            genres_fetched = []
            for j in genres:
                genres_fetched.append(j["name"])
            animes["genres"].append(" ".join(genres_fetched))
            
        page += 1     # incrementing page counter by 1 
        time.sleep(0.5)  # Wait for 2 seconds to make another request
except Exception as e:
    print(e)
print("Task Completed")


URL :-  https://api.jikan.moe/v4/anime?page=1
URL :-  https://api.jikan.moe/v4/anime?page=2
URL :-  https://api.jikan.moe/v4/anime?page=3
URL :-  https://api.jikan.moe/v4/anime?page=4
URL :-  https://api.jikan.moe/v4/anime?page=5
URL :-  https://api.jikan.moe/v4/anime?page=6
URL :-  https://api.jikan.moe/v4/anime?page=7
URL :-  https://api.jikan.moe/v4/anime?page=8
URL :-  https://api.jikan.moe/v4/anime?page=9
URL :-  https://api.jikan.moe/v4/anime?page=10
URL :-  https://api.jikan.moe/v4/anime?page=11
Task Completed


## Creating DataFrame

In [37]:
df = pd.DataFrame(animes)

In [38]:
df.head()

Unnamed: 0,mal_id,title_default,title_english,title_synonyms,synopsis,season,genres
0,1,Cowboy Bebop,Cowboy Bebop,,"Crime is timeless. By the year 2071, humanity ...",spring,Action Award Winning Sci-Fi
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,Cowboy Bebop: Knockin' on Heaven's Door,"Another day, another bounty—such is the life o...",,Action Sci-Fi
2,6,Trigun,Trigun,,"Vash the Stampede is the man with a $$60,000,0...",spring,Action Adventure Sci-Fi
3,7,Witch Hunter Robin,Witch Hunter Robin,WHR,"Though hidden away from the general public, Wi...",summer,Action Drama Mystery Supernatural
4,8,Bouken Ou Beet,Beet the Vandel Buster,Adventure King Beet,It is the dark century and the people are suff...,fall,Action Adventure Fantasy


## Modelling the Data

In [None]:
df["title_synonyms"] = df["title_synonyms"].apply(text_cleaner)
df["title_synonyms"] = df["title_synonyms"].replace("", np.nan)

df = df.dropna(subset = ["synopsis"])
df["title_english"] = df.apply(handle_english_title, axis =1)
df["title_synonyms"] = df.apply(handle_title_synonyms, axis = 1)
df["updated_synopsis"] = df.apply(update_synopsis, axis = 1)
df["updated_synopsis"] = df["updated_synopsis"].apply(clean_synopsis)
df["updated_synopsis"] = df["updated_synopsis"].apply(preprocess_text)
df.reset_index(inplace = True)
df.drop(columns=["index"], inplace=True)

In [41]:
df

Unnamed: 0,mal_id,title_default,title_english,title_synonyms,synopsis,season,genres
0,1,Cowboy Bebop,Cowboy Bebop,,"Crime is timeless. By the year 2071, humanity ...",spring,Action Award Winning Sci-Fi
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,Cowboy Bebop Knockin on Heaven s Door,"Another day, another bounty—such is the life o...",,Action Sci-Fi
2,6,Trigun,Trigun,,"Vash the Stampede is the man with a $$60,000,0...",spring,Action Adventure Sci-Fi
3,7,Witch Hunter Robin,Witch Hunter Robin,WHR,"Though hidden away from the general public, Wi...",summer,Action Drama Mystery Supernatural
4,8,Bouken Ou Beet,Beet the Vandel Buster,Adventure King Beet,It is the dark century and the people are suff...,fall,Action Adventure Fantasy
...,...,...,...,...,...,...,...
270,294,Divergence Eve,,,"In the 24th Century, Intergalactic Space Trave...",summer,Adventure Drama Sci-Fi
271,295,Divergence Eve 2: Misaki Chronicles,Misaki Chronicle: Divergence Eve,,"Through the long distance warp called the ""Exo...",winter,Adventure Drama Sci-Fi
272,296,Dragon Drive,Dragon Drive,,"If there's one word to describe Reiji Ozora, i...",summer,Action Adventure Fantasy Sci-Fi
273,297,Grenadier: Hohoemi no Senshi,Grenadier: The Beautiful Warrior,Grenadier The Smiling Senshi,Rushuna is a blonde and very beautiful Senshi ...,fall,Action Adventure Comedy Ecchi


## Saving the Data in a csv file

In [None]:
df.to_csv("modified_data.csv")

## Importing the raw data

In [None]:
df = pd.read_csv("/kaggle/input/anime-raw-data/modified_data.csv")
df.drop(columns=["Unnamed: 0"], inplace = True)

In [None]:
df.head()

### SBERT model

In [8]:
# Load model and move it to GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2").to(device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
print(datetime.datetime.now().time())
df["embedding"] = df["updated_synopsis"].apply(lambda x: model.encode(x, show_progress_bar=False))
print(datetime.datetime.now().time())

### Sava DataFrame in pickle file

In [None]:
df.to_pickle("Anime_embed_data.pkl")

### load Data from pickle File

In [4]:
df = pd.read_pickle("/kaggle/input/anime-raw-data/Anime_embed_data.pkl")

### Creating Embedding matrix of embeddings column which will be used to find similarity with the user query

In [5]:
embedding_matrix = np.vstack(df["embedding"].values)

### Prediction Pipeline

In [None]:
def prediction_pipeline(user_query):
    user_query_embedding = get_embedding(user_query)  # User Query will be passed to get_embedding function where preprocessing of text will be carried and SBERT encoding will be returned for that query
    # Compute similarity scores
    similarities = cosine_similarity([user_query_embedding], embedding_matrix)[0]

    # Get top K recommendations
    top_k_indices = np.argsort(similarities)[::-1][:5]
    recommended_anime = df.iloc[top_k_indices][["title_default", "embedding"]]  # title will be used as labels for Explainability, embedding to plot the Anime 
    return recommended_anime

In [None]:
user_input = "a guys finds a notebook that can kill people by writing their name and death reason on it"
recommendations = prediction_pipeline(user_input)
recommendations

Unnamed: 0,title_default,embedding
2709,Death Note: Rewrite,"[-0.030127326, 0.010555047, -0.0416725, -0.070..."
1393,Death Note,"[-0.0029539862, -0.01401821, -0.06349944, -0.0..."
23017,Seikatsu wa Psychopath no Shi no You ni,"[0.010736803, 0.017244874, -0.012422802, 0.020..."
19078,Human Bug Daigaku,"[0.0043858653, -0.017569505, -0.113012776, 0.0..."
2665,Seisai,"[-0.04038861, 0.049404167, -0.014107146, -0.01..."
