In [1]:
import numpy as np
import pandas as pd
import ast
import re
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
import gc
import psutil
from tqdm import tqdm
import os

In [2]:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wwwsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wwwsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wwwsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", None)

In [4]:
raw_data = pd.read_csv("manga_new.csv")
raw_data.columns

Index(['manga_id', 'title', 'type', 'score', 'scored_by', 'status', 'volumes',
       'chapters', 'start_date', 'end_date', 'members', 'favorites', 'sfw',
       'approved', 'created_at_before', 'updated_at', 'real_start_date',
       'real_end_date', 'genres', 'themes', 'demographics', 'authors',
       'serializations', 'synopsis', 'background', 'main_picture', 'url',
       'title_english', 'title_japanese', 'title_synonyms'],
      dtype='object')

In [5]:
raw_data.head(1)

Unnamed: 0,manga_id,title,type,score,scored_by,status,volumes,chapters,start_date,end_date,members,favorites,sfw,approved,created_at_before,updated_at,real_start_date,real_end_date,genres,themes,demographics,authors,serializations,synopsis,background,main_picture,url,title_english,title_japanese,title_synonyms
0,2,Berserk,manga,9.47,319696,currently_publishing,,,1989-08-25,,643969,119470,True,True,2007-07-17 20:14:45+00:00,2023-04-01 00:19:31+00:00,1989-08-25,,"['Action', 'Adventure', 'Award Winning', 'Dram...","['Gore', 'Military', 'Mythology', 'Psychologic...",['Seinen'],"[{'id': 1868, 'first_name': 'Kentarou', 'last_...",['Young Animal'],"Guts, a former mercenary now known as the ""Bla...",Berserk won the Award for Excellence at the si...,https://cdn.myanimelist.net/images/manga/1/157...,https://myanimelist.net/manga/2/Berserk,Berserk,ベルセルク,['Berserk: The Prototype']


In [6]:
raw_data.loc[:, "main_picture"].head(2)

0    https://cdn.myanimelist.net/images/manga/1/157...
1    https://cdn.myanimelist.net/images/manga/2/253...
Name: main_picture, dtype: object

In [7]:
raw_data = raw_data[~raw_data["type"].isin(["light_novel", "novel", "doujinshi"])]
raw_data = raw_data[~raw_data["sfw"].isin([False])]
raw_data["type"].value_counts()

type
manga       31580
one_shot     2917
manhwa       2652
manhua        343
Name: count, dtype: int64

In [8]:
raw_data.shape

(37492, 30)

In [9]:
raw_data = raw_data.sort_values(by="favorites", ascending=False).head(15500)


In [10]:
all_manga = raw_data[
    [
        "manga_id",
        "title",
        "title_synonyms",
        "synopsis",
        "demographics",
        "authors",
        "genres",
        "themes",
        "type",
    ]
]

In [11]:
all_manga = all_manga.reset_index(drop=True)

In [12]:
all_manga.head(2)

Unnamed: 0,manga_id,title,title_synonyms,synopsis,demographics,authors,genres,themes,type
0,2,Berserk,['Berserk: The Prototype'],"Guts, a former mercenary now known as the ""Bla...",['Seinen'],"[{'id': 1868, 'first_name': 'Kentarou', 'last_...","['Action', 'Adventure', 'Award Winning', 'Dram...","['Gore', 'Military', 'Mythology', 'Psychologic...",manga
1,13,One Piece,[],"Gol D. Roger, a man referred to as the ""King o...",['Shounen'],"[{'id': 1881, 'first_name': 'Eiichiro', 'last_...","['Action', 'Adventure', 'Fantasy']",[],manga


In [13]:
all_manga = all_manga.dropna()

In [14]:
all_manga.shape

(14483, 9)

In [15]:
def clean_text(text):
    # Remove anything after '[Written by MAL Rewrite]'
    text = re.split(r"\[Written by MAL Rewrite\]", text)[0]

    # Use a regular expression to find text between any type of brackets and remove it
    cleaned_text = re.sub(r"\[.*?\]|\(.*?\)|\{.*?\}|\<.*?\>", "", text)
    return cleaned_text

In [16]:
all_manga.loc[:, "synopsis"] = all_manga["synopsis"].apply(clean_text)
all_manga.loc[:, "synopsis"] = all_manga["synopsis"].apply(
    lambda x: x.translate(str.maketrans("", "", string.punctuation))
)

all_manga.loc[:, "title_synonyms"] = all_manga["title_synonyms"].apply(
    lambda x: x.translate(str.maketrans("", "", string.punctuation))
)

In [17]:
all_manga.head(2)

Unnamed: 0,manga_id,title,title_synonyms,synopsis,demographics,authors,genres,themes,type
0,2,Berserk,Berserk The Prototype,Guts a former mercenary now known as the Black...,['Seinen'],"[{'id': 1868, 'first_name': 'Kentarou', 'last_...","['Action', 'Adventure', 'Award Winning', 'Dram...","['Gore', 'Military', 'Mythology', 'Psychologic...",manga
1,13,One Piece,,Gol D Roger a man referred to as the King of t...,['Shounen'],"[{'id': 1881, 'first_name': 'Eiichiro', 'last_...","['Action', 'Adventure', 'Fantasy']",[],manga


In [18]:
all_manga.loc[:, "synopsis"] = all_manga["synopsis"].apply(lambda x: x.split())
all_manga.loc[:, "type"] = all_manga["type"].apply(lambda x: x.split())

In [19]:
all_manga.head(2)

Unnamed: 0,manga_id,title,title_synonyms,synopsis,demographics,authors,genres,themes,type
0,2,Berserk,Berserk The Prototype,"[Guts, a, former, mercenary, now, known, as, t...",['Seinen'],"[{'id': 1868, 'first_name': 'Kentarou', 'last_...","['Action', 'Adventure', 'Award Winning', 'Dram...","['Gore', 'Military', 'Mythology', 'Psychologic...",[manga]
1,13,One Piece,,"[Gol, D, Roger, a, man, referred, to, as, the,...",['Shounen'],"[{'id': 1881, 'first_name': 'Eiichiro', 'last_...","['Action', 'Adventure', 'Fantasy']",[],[manga]


In [20]:
for cols in ["demographics", "authors", "genres", "themes"]:
    all_manga.loc[:, cols] = all_manga[cols].apply(ast.literal_eval)

In [21]:
def resolve_author(data):
    res = []
    for dic in data:
            res.append(f"{dic["last_name"]} {dic['first_name']}" if dic['first_name'] != '' else dic['last_name'])
    return res

def remove_spaces(data):
      res = []
      for i in data:
            res.append(i.replace(" ",''))
      return res

In [22]:
all_manga.loc[:, "authors"] = all_manga["authors"].apply(resolve_author)
all_manga.loc[:, "authors"] = all_manga["authors"].apply(remove_spaces)

In [23]:
all_manga.head(50).to_json(orient="records")

'[{"manga_id":2,"title":"Berserk","title_synonyms":"Berserk The Prototype","synopsis":["Guts","a","former","mercenary","now","known","as","the","Black","Swordsman","is","out","for","revenge","After","a","tumultuous","childhood","he","finally","finds","someone","he","respects","and","believes","he","can","trust","only","to","have","everything","fall","apart","when","this","person","takes","away","everything","important","to","Guts","for","the","purpose","of","fulfilling","his","own","desires","Now","marked","for","death","Guts","becomes","condemned","to","a","fate","in","which","he","is","relentlessly","pursued","by","demonic","beings","Setting","out","on","a","dreadful","quest","riddled","with","misfortune","Guts","armed","with","a","massive","sword","and","monstrous","strength","will","let","nothing","stop","him","not","even","death","itself","until","he","is","finally","able","to","take","the","head","of","the","one","who","stripped","him\\u2014and","his","loved","one\\u2014of","thei

In [24]:
all_manga.loc[:, "title_synonyms"] = all_manga["title_synonyms"].apply(
    lambda x: x.split()
)

In [25]:
all_manga.head(2)

Unnamed: 0,manga_id,title,title_synonyms,synopsis,demographics,authors,genres,themes,type
0,2,Berserk,"[Berserk, The, Prototype]","[Guts, a, former, mercenary, now, known, as, t...",[Seinen],"[MiuraKentarou, StudioGaga]","[Action, Adventure, Award Winning, Drama, Fant...","[Gore, Military, Mythology, Psychological]",[manga]
1,13,One Piece,[],"[Gol, D, Roger, a, man, referred, to, as, the,...",[Shounen],[OdaEiichiro],"[Action, Adventure, Fantasy]",[],[manga]


In [26]:
def concatenate_row(row):
    concatenated_row = ""
    for values in row[2:]:
        if not values:
            continue
        concatenated_row += " ".join(values) + ""
    return concatenated_row.strip()

In [27]:
all_manga["tags"] = all_manga.apply(concatenate_row, axis=1)

In [28]:
json = all_manga.iloc[:, [0, 1]].to_json(orient="records")
with open("title.json", "w") as f:
    f.write(json)

In [29]:
manga = all_manga[["manga_id", "title", "tags"]]

In [30]:
manga.head()

Unnamed: 0,manga_id,title,tags
0,2,Berserk,Berserk The PrototypeGuts a former mercenary n...
1,13,One Piece,Gol D Roger a man referred to as the King of t...
2,116778,Chainsaw Man,Denji has a simple dream—to live a happy and p...
3,23390,Shingeki no Kyojin,Hundreds of years ago horrifying creatures whi...
4,4632,Oyasumi Punpun,Punpun Onodera is a normal 11yearold boy livin...


In [31]:
manga.loc[:, "tags"] = manga["tags"].apply(lambda x: x.replace("—", " "))

In [32]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def cleanup(text):
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back to string
    cleaned_text = " ".join(tokens)
    return cleaned_text

In [33]:
manga.head()

Unnamed: 0,manga_id,title,tags
0,2,Berserk,Berserk The PrototypeGuts a former mercenary n...
1,13,One Piece,Gol D Roger a man referred to as the King of t...
2,116778,Chainsaw Man,Denji has a simple dream to live a happy and p...
3,23390,Shingeki no Kyojin,Hundreds of years ago horrifying creatures whi...
4,4632,Oyasumi Punpun,Punpun Onodera is a normal 11yearold boy livin...


In [34]:
manga.loc[:, "tags"] = manga["tags"].apply(cleanup)

In [35]:
# Function to get memory usage
def get_memory_usage():
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss / (1024**2)  # Convert bytes to MB

In [36]:
manga = manga.reset_index(drop=True)

In [37]:
# compute similarity matrix in smaller batches due to memory limitations
cv = CountVectorizer(max_features=20000)

In [38]:
vectors = cv.fit_transform(manga['tags'])

In [39]:
similarity = cosine_similarity(vectors)

In [40]:
similarity.shape

(14483, 14483)

In [41]:
# save similarity matrix as a pickle file
pickle.dump(similarity, open("artifacts/mat-20000.pkl", "wb"))

In [42]:
pickle.dump(manga, open("artifacts/data.pkl", "wb"))

In [304]:
manga.head()

Unnamed: 0,manga_id,title,tags
0,2,Berserk,berserk prototypeguts former mercenary known b...
1,13,One Piece,gol roger man referred king pirate set execute...
2,116778,Chainsaw Man,denji simple dream live happy peaceful life sp...
3,23390,Shingeki no Kyojin,hundred year ago horrifying creature resembled...
4,4632,Oyasumi Punpun,punpun onodera normal 11yearold boy living jap...


In [305]:
manga.iloc[0]["title"]

'Berserk'

In [306]:
gc.collect()

924

In [310]:
def recommend(title):
    filtered_manga = manga[manga['title'].str.lower() == title.lower()]
    if len(filtered_manga) == 0:
        print (f"{title} not found in the database.")
        return
    
    index = filtered_manga.index[0]

    sorted_Manga = sorted(list(enumerate(similarity[index])),reverse=True , key=lambda x: x[1])

    recommended_titles = []
    recommended_urls = []
    similarities = []

    for i in sorted_Manga[:100]:
        recommended_titles.append(manga.iloc[i[0]]["title"])

    return recommended_titles


recommend("Nanatsu no Taizai")


['Nanatsu no Taizai',
 'Umi no Kishidan',
 'The Legend of the Sun Knight',
 'Shoujo Kishidan x Knight Tale',
 'Henkyou no Roukishi Bard Loen',
 'Eiyuuou, Bu wo Kiwameru Tame Tenseisu: Soshite, Sekai Saikyou no Minarai Kishi♀',
 'A Fairytale for the Demon Lord',
 "The Dark Lord's Confession",
 'Majo no Moribito',
 'Engage Knight',
 'The Red Knight Seeks No Reward',
 'The Legend of Dragoon',
 'Shin Shirayuki-hime Densetsu Pretear',
 'Entaku no Himeshi!',
 'Hagure Seireii no Shinsatsu Kiroku: Seijo Kishidan to Iyashi no Kamiwaza',
 'Mayoe! Nanatsu no Taizai Gakuen!',
 'The Legendary Spearman Returns',
 'Hao Taikei Ryu Knight',
 'The Knight and Her Emperor',
 'Rakudai Kishi no Cavalry',
 'Okobore Hime to Entaku no Kishi',
 'Black Clover Gaiden: Quartet Knights',
 'Tensei shita Daiseijo wa, Seijo de Aru Koto wo Hitakakusu',
 'X & Ash',
 'Nanatsu no Taizai: Hajimari wo Sasou Ame no Mori',
 'Aqua Knight',
 'Ryuukishi no Okiniiri',
 'Douyara Boku no Hanayome wa Onna Kishidan na You de.',
 'Sti