In [48]:
import numpy as np
import pandas as pd
import ast
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
import gc
import psutil
from tqdm import tqdm
import os

In [49]:
# Fetch the NLTK resources used further down: the stop-word lists, the Punkt
# tokenizer models (word_tokenize) and the WordNet data (WordNetLemmatizer).
# NOTE(review): newer NLTK releases reportedly need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wwwsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wwwsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wwwsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [50]:
# Notebook display settings: show up to 200 rows and never truncate columns.
display_settings = {
    "display.max_rows": 200,
    "display.max_columns": None,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [51]:
# Load the raw manga table from a CSV in the working directory and display its
# (rows, columns) shape.
raw_data = pd.read_csv("manga_new.csv")
raw_data.shape

(64833, 30)

In [52]:
raw_data[raw_data['title'].str.lower() == 'oshi no ko']

Unnamed: 0,manga_id,title,type,score,scored_by,status,volumes,chapters,start_date,end_date,members,favorites,sfw,approved,created_at_before,updated_at,real_start_date,real_end_date,genres,themes,demographics,authors,serializations,synopsis,background,main_picture,url,title_english,title_japanese,title_synonyms


In [53]:
# Drop the light_novel / novel / doujinshi types, then drop rows whose sfw flag
# is False, and finally show how many entries of each remaining type are left.
excluded_types = ['light_novel', 'novel', 'doujinshi']
has_excluded_type = raw_data['type'].isin(excluded_types)
raw_data = raw_data[~has_excluded_type]

flagged_not_sfw = raw_data["sfw"].isin([False])
raw_data = raw_data[~flagged_not_sfw]

raw_data['type'].value_counts()

type
manga       31580
one_shot     2917
manhwa       2652
manhua        343
Name: count, dtype: int64

In [54]:
raw_data.shape

(37492, 30)

# Keep only the 15,500 rows with the highest "favorites" count.
# NOTE(review): 15500 is a magic number — consider a named constant. This cell
# carries no execution marker and later shapes exceed 15,500 rows, so it was
# presumably not run in the recorded session — confirm whether it is intended.
raw_data = raw_data.sort_values(by="favorites", ascending=False).head(15500)

In [55]:
# Columns kept for the recommender. Order matters: concatenate_row() skips the
# first two entries (manga_id, title) and joins everything after them.
feature_columns = [
    "manga_id",
    "title",
    "title_synonyms",
    "synopsis",
    "demographics",
    "authors",
    "genres",
    "themes",
    "type",
]
all_manga = raw_data[feature_columns]

In [56]:
# Discard the index inherited from raw_data and renumber the rows 0..n-1.
all_manga = all_manga.reset_index(drop=True)

In [57]:
all_manga.head(2)

Unnamed: 0,manga_id,title,title_synonyms,synopsis,demographics,authors,genres,themes,type
0,2,Berserk,['Berserk: The Prototype'],"Guts, a former mercenary now known as the ""Bla...",['Seinen'],"[{'id': 1868, 'first_name': 'Kentarou', 'last_...","['Action', 'Adventure', 'Award Winning', 'Dram...","['Gore', 'Military', 'Mythology', 'Psychologic...",manga
1,13,One Piece,[],"Gol D. Roger, a man referred to as the ""King o...",['Shounen'],"[{'id': 1881, 'first_name': 'Eiichiro', 'last_...","['Action', 'Adventure', 'Fantasy']",[],manga


In [58]:

# Drop rows with any missing field, then renumber so index labels equal row
# positions again. The similarity matrix built later is purely positional, so
# any lookup that uses an index label as a matrix row would otherwise hit the
# wrong manga (or run past the end of the matrix) once dropna() leaves gaps.
all_manga = all_manga.dropna().reset_index(drop=True)

In [59]:
all_manga.shape

(29520, 9)

In [60]:
def clean_text(text):
    """Strip the MAL attribution footer and bracketed asides from a synopsis.

    Everything from the first ``[Written by MAL Rewrite]`` marker onward is
    discarded. After that, every shortest span enclosed in ``[]``, ``()``,
    ``{}`` or ``<>`` is deleted; spans containing a newline are left alone
    because ``.`` does not match line breaks.

    Args:
        text: Raw synopsis string.

    Returns:
        The cleaned string (surrounding whitespace is not normalised).
    """
    before_credit = re.split(r"\[Written by MAL Rewrite\]", text, maxsplit=1)[0]
    return re.sub(r"\[.*?\]|\(.*?\)|\{.*?\}|\<.*?\>", "", before_credit)



In [61]:
# Clean the free-text columns: strip MAL footers / bracketed asides from the
# synopsis, then delete every string.punctuation character from both the
# synopsis and the (still stringified) title_synonyms column.
punctuation_table = str.maketrans("", "", string.punctuation)


def _strip_punctuation(text):
    """Return ``text`` with all string.punctuation characters removed."""
    return text.translate(punctuation_table)


all_manga.loc[:, "synopsis"] = all_manga["synopsis"].apply(clean_text)
all_manga.loc[:, "synopsis"] = all_manga["synopsis"].apply(_strip_punctuation)

all_manga.loc[:, "title_synonyms"] = all_manga["title_synonyms"].apply(_strip_punctuation)


In [62]:
# Split on whitespace so both columns hold lists of words, the shape that
# concatenate_row() joins.
for text_column in ("synopsis", "type"):
    all_manga.loc[:, text_column] = all_manga[text_column].apply(str.split)

In [63]:
# These columns hold stringified Python literals (e.g. "['Seinen']"); parse
# them back into real Python objects.
literal_columns = ("demographics", "authors", "genres", "themes")
for literal_column in literal_columns:
    parsed_values = all_manga[literal_column].apply(ast.literal_eval)
    all_manga.loc[:, literal_column] = parsed_values

In [64]:
def resolve_author(data):
    """Turn a list of author records into display names.

    Args:
        data: Iterable of dicts that each provide ``'first_name'`` and
            ``'last_name'`` string entries.

    Returns:
        list[str]: ``"<last_name> <first_name>"`` for each record, or just the
        last name when the first name is the empty string.
    """
    res = []
    for dic in data:
        # Pull the fields out first: reusing the f-string's own quote character
        # inside a replacement field is a SyntaxError before Python 3.12.
        first_name = dic['first_name']
        last_name = dic['last_name']
        res.append(f"{last_name} {first_name}" if first_name != '' else last_name)
    return res

def remove_spaces(data):
    """Return a new list with every space character removed from each string.

    Args:
        data: Iterable of strings.

    Returns:
        list[str]: The strings in their original order, with all ``" "``
        characters deleted (other whitespace is untouched).
    """
    return [entry.replace(" ", '') for entry in data]

In [65]:
# Reduce each author record to a single space-free token (for example
# "MiuraKentarou") so a full name counts as one word in the tag text.
author_names = all_manga["authors"].apply(resolve_author)
all_manga.loc[:, "authors"] = author_names.apply(remove_spaces)

In [66]:
# Split the punctuation-free synonym string into a list of words.
all_manga.loc[:, "title_synonyms"] = all_manga["title_synonyms"].apply(str.split)

In [67]:
def concatenate_row(row):
    """Flatten a row's list-valued fields into one space-separated string.

    The first two entries of ``row`` are skipped. Every remaining non-empty
    entry is treated as an iterable of strings and joined with single spaces;
    the joined pieces are then concatenated with a single space between them
    and surrounding whitespace is stripped.

    Args:
        row: Sliceable sequence (e.g. a DataFrame row) whose entries from
            position 2 onward are lists of strings.

    Returns:
        str: The combined text, or ``""`` when every field is empty.
    """
    joined_fields = [" ".join(tokens) for tokens in row[2:] if tokens]
    return " ".join(joined_fields).strip()

In [68]:
# Build one space-separated "tags" string per row by joining every column
# after the first two (see concatenate_row).
all_manga["tags"] = all_manga.apply(concatenate_row, axis=1)

In [69]:
all_manga.head()

Unnamed: 0,manga_id,title,title_synonyms,synopsis,demographics,authors,genres,themes,type,tags
0,2,Berserk,"[Berserk, The, Prototype]","[Guts, a, former, mercenary, now, known, as, t...",[Seinen],"[MiuraKentarou, StudioGaga]","[Action, Adventure, Award Winning, Drama, Fant...","[Gore, Military, Mythology, Psychological]",[manga],Berserk The Prototype Guts a former mercenary ...
1,13,One Piece,[],"[Gol, D, Roger, a, man, referred, to, as, the,...",[Shounen],[OdaEiichiro],"[Action, Adventure, Fantasy]",[],[manga],Gol D Roger a man referred to as the King of t...
2,1706,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,"[JoJos, Bizarre, Adventure, Part, 7, Steel, Ba...","[In, the, American, Old, West, the, worlds, gr...","[Seinen, Shounen]",[ArakiHirohiko],"[Action, Adventure, Mystery, Supernatural]",[Historical],[manga],JoJos Bizarre Adventure Part 7 Steel Ball Run ...
3,4632,Oyasumi Punpun,[],"[Punpun, Onodera, is, a, normal, 11yearold, bo...",[Seinen],[AsanoInio],"[Drama, Slice of Life]",[Psychological],[manga],Punpun Onodera is a normal 11yearold boy livin...
4,25,Fullmetal Alchemist,"[Full, Metal, Alchemist, Hagane, no, Renkinjut...","[Alchemists, are, knowledgeable, and, naturall...",[Shounen],[ArakawaHiromu],"[Action, Adventure, Award Winning, Drama, Fant...",[Military],[manga],Full Metal Alchemist Hagane no Renkinjutsushi ...


In [70]:
# Take an explicit copy: later cells assign into manga["tags"], and writing
# into a bare column selection of all_manga can raise SettingWithCopyWarning
# and leaves it ambiguous whether all_manga is modified too.
manga = all_manga[['manga_id','title','tags']].copy()

In [71]:
manga.head()

Unnamed: 0,manga_id,title,tags
0,2,Berserk,Berserk The Prototype Guts a former mercenary ...
1,13,One Piece,Gol D Roger a man referred to as the King of t...
2,1706,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,JoJos Bizarre Adventure Part 7 Steel Ball Run ...
3,4632,Oyasumi Punpun,Punpun Onodera is a normal 11yearold boy livin...
4,25,Fullmetal Alchemist,Full Metal Alchemist Hagane no Renkinjutsushi ...


In [72]:
# Em dashes are not part of string.punctuation, so they survive the earlier
# punctuation stripping; turn each one into a space so the words it joins
# stay separate tokens.
manga.loc[:, 'tags'] = manga['tags'].str.replace("—", " ", regex=False)

In [73]:
# Shared NLTK resources, built once so cleanup() does not recreate them per row.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def cleanup(text):
    """Normalise a tag string for vectorisation.

    Lowercases ``text``, tokenises it with ``word_tokenize``, drops tokens found
    in the English stop-word set and lemmatises the rest.

    Args:
        text: The raw tag string.

    Returns:
        str: The surviving lemmas joined by single spaces.
    """
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token not in stop_words
    ]
    return " ".join(lemmas)


In [74]:
# Normalise every tag string (lowercase, stop-word removal, lemmatisation).
manga.loc[:, "tags"] = manga["tags"].map(cleanup)

In [75]:
# Helper for reporting how much memory this kernel process is using
def get_memory_usage():
    """Return the current process's resident set size (RSS) in MB (1024**2 bytes)."""
    bytes_per_mb = 1024**2
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_mb

In [76]:
# Compute the pairwise cosine-similarity matrix in row batches so that only
# one (batch_size x n_samples) block is produced at a time.
cv = CountVectorizer(max_features=20000)
# Keep the document-term matrix sparse: cosine_similarity accepts SciPy sparse
# input and still returns a dense block by default, so densifying with
# .toarray() only costs memory and time.
vectors = cv.fit_transform(manga["tags"])

gc.collect()

batch_size = 1000  # Adjust this value based on your available memory

n_samples = vectors.shape[0]
similarity_matrix = np.zeros((n_samples, n_samples))

for start in tqdm(range(0, n_samples, batch_size), desc="Processing Batches"):
    end = min(start + batch_size, n_samples)
    batch_vectors = vectors[start:end]

    # Similarity of this batch's rows against every row
    batch_similarity = cosine_similarity(batch_vectors, vectors)

    # Copy the block into its rows of the full matrix
    similarity_matrix[start:end, :] = batch_similarity

    # Memory usage before garbage collection
    mem_before_gc = get_memory_usage()

    # Release the batch temporaries before the next iteration
    del batch_vectors, batch_similarity
    gc.collect()

    # Memory usage after garbage collection
    mem_after_gc = get_memory_usage()

    # Print memory usage information
    print(f"before GC: {mem_before_gc:.2f} MB -- after GC: {mem_after_gc:.2f} MB")

# Save the similarity matrix as a pickle file. Create the output directory if
# needed and use a context manager so the file handle is flushed and closed
# even if pickling fails part-way.
os.makedirs("artifacts", exist_ok=True)
with open("artifacts/mat-20000.pkl", "wb") as matrix_file:
    pickle.dump(similarity_matrix, matrix_file)

Processing Batches:   3%|▎         | 1/30 [00:47<23:08, 47.88s/it]

before GC: 464.34 MB -- after GC: 742.27 MB


Processing Batches:   7%|▋         | 2/30 [01:40<23:34, 50.53s/it]

before GC: 463.85 MB -- after GC: 742.10 MB


Processing Batches:  10%|█         | 3/30 [02:38<24:24, 54.22s/it]

before GC: 462.88 MB -- after GC: 741.62 MB


Processing Batches:  13%|█▎        | 4/30 [03:35<23:50, 55.01s/it]

before GC: 463.89 MB -- after GC: 742.10 MB


Processing Batches:  17%|█▋        | 5/30 [04:31<23:05, 55.43s/it]

before GC: 463.87 MB -- after GC: 742.14 MB


Processing Batches:  20%|██        | 6/30 [05:18<21:00, 52.53s/it]

before GC: 464.21 MB -- after GC: 742.21 MB


Processing Batches:  23%|██▎       | 7/30 [06:05<19:29, 50.85s/it]

before GC: 463.89 MB -- after GC: 742.15 MB


Processing Batches:  27%|██▋       | 8/30 [07:01<19:14, 52.47s/it]

before GC: 464.22 MB -- after GC: 742.21 MB


Processing Batches:  30%|███       | 9/30 [07:52<18:13, 52.07s/it]

before GC: 464.20 MB -- after GC: 742.20 MB


Processing Batches:  33%|███▎      | 10/30 [08:48<17:43, 53.17s/it]

before GC: 463.82 MB -- after GC: 742.09 MB


Processing Batches:  37%|███▋      | 11/30 [09:38<16:30, 52.14s/it]

before GC: 464.20 MB -- after GC: 742.24 MB


Processing Batches:  40%|████      | 12/30 [10:35<16:05, 53.67s/it]

before GC: 463.83 MB -- after GC: 742.09 MB


Processing Batches:  43%|████▎     | 13/30 [11:32<15:31, 54.82s/it]

before GC: 463.82 MB -- after GC: 742.11 MB


Processing Batches:  47%|████▋     | 14/30 [12:36<15:17, 57.37s/it]

before GC: 462.81 MB -- after GC: 741.59 MB


Processing Batches:  50%|█████     | 15/30 [13:42<15:01, 60.08s/it]

before GC: 464.15 MB -- after GC: 742.23 MB


Processing Batches:  53%|█████▎    | 16/30 [15:02<15:24, 66.00s/it]

before GC: 464.10 MB -- after GC: 742.20 MB


Processing Batches:  57%|█████▋    | 17/30 [16:17<14:56, 68.95s/it]

before GC: 464.12 MB -- after GC: 742.23 MB


Processing Batches:  60%|██████    | 18/30 [17:28<13:54, 69.57s/it]

before GC: 464.10 MB -- after GC: 742.19 MB


Processing Batches:  63%|██████▎   | 19/30 [18:41<12:54, 70.38s/it]

before GC: 463.79 MB -- after GC: 742.11 MB


Processing Batches:  67%|██████▋   | 20/30 [20:01<12:13, 73.35s/it]

before GC: 464.12 MB -- after GC: 742.21 MB


Processing Batches:  70%|███████   | 21/30 [21:14<10:58, 73.14s/it]

before GC: 464.15 MB -- after GC: 742.23 MB


Processing Batches:  73%|███████▎  | 22/30 [22:33<09:59, 74.89s/it]

before GC: 464.14 MB -- after GC: 737.29 MB


Processing Batches:  77%|███████▋  | 23/30 [23:52<08:52, 76.12s/it]

before GC: 464.14 MB -- after GC: 742.23 MB


Processing Batches:  80%|████████  | 24/30 [25:19<07:56, 79.42s/it]

before GC: 464.09 MB -- after GC: 742.19 MB


Processing Batches:  83%|████████▎ | 25/30 [26:53<06:59, 83.88s/it]

before GC: 464.13 MB -- after GC: 742.22 MB


Processing Batches:  87%|████████▋ | 26/30 [28:09<05:26, 81.52s/it]

before GC: 464.10 MB -- after GC: 742.19 MB


Processing Batches:  90%|█████████ | 27/30 [29:31<04:04, 81.58s/it]

before GC: 464.12 MB -- after GC: 742.23 MB


Processing Batches:  93%|█████████▎| 28/30 [30:52<02:42, 81.42s/it]

before GC: 464.21 MB -- after GC: 742.27 MB


Processing Batches:  97%|█████████▋| 29/30 [32:23<01:24, 84.36s/it]

before GC: 464.13 MB -- after GC: 742.22 MB


Processing Batches: 100%|██████████| 30/30 [33:49<00:00, 67.66s/it]

before GC: 247.41 MB -- after GC: 633.61 MB





# load similarity matrix from a pickle file
# Reads the artifact written by the batch computation cell (mat-20000.pkl); the
# earlier 'mat-10000.pkl' path is not written anywhere in the visible notebook,
# so it could silently load a matrix built from different rows or features.
# NOTE: unpickling can execute arbitrary code — only load files you produced.
similarity_matrix = pd.read_pickle('artifacts/mat-20000.pkl')

In [77]:
gc.collect()

0

In [91]:
def recommend(title):
    """Return up to five manga most similar to the given title.

    Reads the module-level ``manga`` DataFrame (columns ``manga_id``, ``title``)
    and ``similarity_matrix`` (one row and column per row of ``manga``, in the
    same positional order).

    Args:
        title: Title to look up; compared case-insensitively against the whole
            ``title`` value. The first matching row is used.

    Returns:
        pandas.DataFrame with columns ``Title``, ``URL`` and ``Similarity``
        holding at most five of the 100 highest-scoring rows whose title does
        not contain the query text, or ``None`` (after printing a message) when
        no row matches.
    """
    title = title.lower()
    title_matches = (manga["title"].str.lower() == title).to_numpy()
    if not title_matches.any():
        print(f"{title.title()} not found in the database.")
        return

    # similarity_matrix is addressed by row position. Index labels can differ
    # from positions (e.g. after dropna), so resolve the position explicitly
    # instead of reusing the label.
    position = int(np.flatnonzero(title_matches)[0])

    scores = np.asarray(similarity_matrix[position])
    # Stable sort on the negated scores: descending similarity, ties kept in
    # their original row order.
    top_positions = np.argsort(-scores, kind="stable")[:100]

    top_manga = manga.iloc[top_positions]
    recommendations_df = pd.DataFrame({
        'Title': top_manga['title'].to_numpy(),
        'URL': [
            f"https://myanimelist.net/manga/{manga_id}"
            for manga_id in top_manga['manga_id']
        ],
        'Similarity': [
            f"{round(float(score) * 100, 1)}%" for score in scores[top_positions]
        ],
    })

    # Hide the queried title and its spin-offs. regex=False matches the query
    # literally, so titles containing characters such as "+", "(" or "."
    # neither raise nor match unrelated rows.
    contains_query = recommendations_df['Title'].str.lower().str.contains(
        title, regex=False, na=False
    )
    filtered_df = recommendations_df[~contains_query][:5]
    return filtered_df


In [92]:
recommend("one piece")

Unnamed: 0,Title,URL,Similarity
0,One Piece,https://myanimelist.net/manga/13,100.0%
1,One Piece: Strong World,https://myanimelist.net/manga/17152,41.1%
2,Koisuru One Piece,https://myanimelist.net/manga/120401,37.9%
3,King of Viking,https://myanimelist.net/manga/25681,31.6%
4,Romance Dawn,https://myanimelist.net/manga/5114,31.1%
5,Sunset Rose,https://myanimelist.net/manga/79237,30.8%
6,Jisshoku! Akuma no Mi!!,https://myanimelist.net/manga/25146,29.2%
7,Tramp.,https://myanimelist.net/manga/24123,28.8%
8,Kaizoku Hime: Captain Rose no Bouken,https://myanimelist.net/manga/34267,28.5%
9,"Maou Taiji, Shitenai Hou.",https://myanimelist.net/manga/152181,28.2%
