In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from keras.metrics import Recall
from gensim.models import Word2Vec

import pickle
import spacy
nlp = spacy.load("en_core_web_sm")

from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('D:/YNOV/M1/NLP/Projet/DataSetMangaGenre.csv')
df = df.drop(columns=['title','status','demographic','content_rating'])

All check on data not being empty done during export

In [3]:
df.head()

Unnamed: 0,description,Action,Adventure,Boys' Love,Comedy,Crime,Drama,Fantasy,Girls' Love,Historical,...,Philosophical,Psychological,Romance,Sci-Fi,Slice of Life,Sports,Superhero,Thriller,Tragedy,Wuxia
0,Shinichi Kudo is a high school detective who s...,1,1,0,1,1,1,0,0,0,...,0,1,1,0,0,0,0,0,0,0
1,Takasu Ryuuji has learned the hard way that ap...,0,0,0,1,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,Rin and his exorcist classmates are caught in ...,1,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Yotsuba is a strange little girl with a big pe...,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,Welcome to a world where mysticism and science...,1,0,0,1,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,0


In [4]:
def clean_text(text):
    text = str(text).lower()  
    text = re.sub(r"http\S+", "", text)  
    text = re.sub(r"[^\w\s]", "", text)

    text = nlp(text)
    text = " ".join([token.lemma_ for token in text if not token.is_stop])
    return text


df['description'] = df['description'].apply(clean_text)

In [None]:
max_words = 10000  
max_len = 200  

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['description'])

token = tokenizer.texts_to_sequences(df['description'])
X = pad_sequences(token, maxlen=max_len)


y = df.drop(columns=['description'])

with open('tokenizer_Word2Vec_Genre.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
df['tokens'] = df['description'].str.split()

w2v_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=2)

embedding_dim = 100  


embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i >= max_words:
        continue
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]


In [22]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len,
                        weights=[embedding_matrix], trainable=False))  
model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(y.shape[1], activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=Adam(1e-3), metrics=['binary_accuracy', Recall()])

stop = EarlyStopping(patience=3, restore_best_weights=True)

model.fit(X_train, y_train, epochs=15, batch_size=32, validation_data=(X_test, y_test), callbacks=[stop])

Epoch 1/15




[1m379/379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 90ms/step - binary_accuracy: 0.8379 - loss: 0.3655 - recall_1: 0.3284 - val_binary_accuracy: 0.8785 - val_loss: 0.2795 - val_recall_1: 0.3071
Epoch 2/15
[1m379/379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 90ms/step - binary_accuracy: 0.8778 - loss: 0.2831 - recall_1: 0.3473 - val_binary_accuracy: 0.8802 - val_loss: 0.2773 - val_recall_1: 0.3363
Epoch 3/15
[1m379/379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 89ms/step - binary_accuracy: 0.8789 - loss: 0.2821 - recall_1: 0.3595 - val_binary_accuracy: 0.8856 - val_loss: 0.2665 - val_recall_1: 0.3906
Epoch 4/15
[1m379/379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 89ms/step - binary_accuracy: 0.8853 - loss: 0.2678 - recall_1: 0.3762 - val_binary_accuracy: 0.8868 - val_loss: 0.2631 - val_recall_1: 0.3665
Epoch 5/15
[1m379/379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 90ms/step - binary_accuracy: 0.8845 - loss: 0.2688 - 

<keras.src.callbacks.history.History at 0x21c503aab40>

In [23]:
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

rep = classification_report(y_test, y_pred, target_names=y.columns,output_dict=True)
rep = pd.DataFrame(rep).transpose()
rep[['precision', 'recall', 'f1-score']] = rep[['precision', 'recall', 'f1-score']].round(2)
rep['support'] = rep['support'].astype(int)


print(rep[(rep['precision'] == 0)])

print('-----------------------------------------------')

print(rep[(rep['precision'] != 0)].sort_values(by='support', ascending=False))

[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 34ms/step
               precision  recall  f1-score  support
Boys' Love           0.0     0.0       0.0       71
Girls' Love          0.0     0.0       0.0       99
Horror               0.0     0.0       0.0      210
Magical Girls        0.0     0.0       0.0       23
Medical              0.0     0.0       0.0       20
Superhero            0.0     0.0       0.0       25
Wuxia                0.0     0.0       0.0       12
-----------------------------------------------
               precision  recall  f1-score  support
samples avg         0.64    0.46      0.50    10481
weighted avg        0.63    0.42      0.46    10481
macro avg           0.48    0.18      0.21    10481
micro avg           0.66    0.42      0.51    10481
Romance             0.69    0.76      0.72     1624
Comedy              0.63    0.59      0.61     1550
Drama               0.59    0.46      0.52     1364
Action              0.75    0.58      0.65    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
test = [
    {
        "description": 'Komi-san is a beautiful and admirable girl that no one can take their eyes off of. Almost the whole school sees her as the cold beauty thats out of their league, but Tadano Hitohito knows the truth: shes just really bad at communicating with others. Komi-san, who wishes to fix this bad habit of hers, tries to improve it with the help of Tadano-kun by achieving her goal of having 100 friends.',
        "genres": ['Romance', 'Comedy', 'School Life', 'Slice of Life']
    },
    {
        "description": "Guts, known as the Black Swordsman, seeks sanctuary from the demonic forces attracted to him and his woman because of a demonic mark on their necks, and also vengeance against the man who branded him as an unholy sacrifice. Aided only by his titanic strength gained from a harsh childhood lived with mercenaries, a gigantic sword, and an iron prosthetic left hand, Guts must struggle against his bleak destiny, all the while fighting with a rage that might strip him of his humanity. Won the 6th Osamu Tezuka Cultural Prize Excellence Award in 2002.",
        "genres": ['Action', 'Psychological', 'Adventure', 'Philosophical', 'Drama', 'Horror', 'Fantasy', 'Supernatural', 'Tragedy']
    },
    {
        "description": "Gol D. Roger, a man referred to as the 'Pirate King,' is set to be executed by the World Government. But just before his demise, he confirms the existence of a great treasure, One Piece, located somewhere within the vast ocean known as the Grand Line. Announcing that One Piece can be claimed by anyone worthy enough to reach it, the Pirate King is executed and the Great Age of Pirates begins. Twenty-two years later, a young man by the name of Monkey D. Luffy is ready to embark on his own adventure, searching for One Piece and striving to become the new Pirate King. Armed with just a straw hat, a small boat, and an elastic body, he sets out on a fantastic journey to gather his own crew and a worthy ship that will take them across the Grand Line to claim the greatest status on the high seas.",
        "genres": ['Sci-Fi', 'Action', 'Comedy', 'Crime', 'Adventure', 'Drama', 'Fantasy', 'Supernatural']
    },
    {
        "description": "There are gamblers out there who even bet their lives as ante. But to secure the integrity of these life-threatening gambles, a violent and powerful organization by the name of “Kakerou” referees these games as a neutral party. Follow Baku Madarame a.k.a. Usogui (The Lie Eater) as he gambles against maniacal opponents at games – such as Escape the Abandoned Building, Old Maid, and Hangman – to ultimately “out-gamble” and control the neutral organization of Kakerou itself.",
        "genres": ['Thriller', 'Action', 'Psychological', 'Martial Arts', 'Mafia', 'Drama', 'Mystery']
    },
    #Description from outside Mangadex
    {
        "description": "The mysterious bug medicine ‘Jungle juice’ made him an insect human. Jang Su-chan, a college student who used to hide his teeth, one day he looked like that. You’ll be seen by everyone, and you’ll be in the insect world. You fight against the weak-kneed logic…",
        "genres": ['Thriller', 'Sci-Fi', 'Action', 'Superhero', 'Horror', 'Fantasy', 'Supernatural']
    },
    {
        "description": "The Murim Warring States period, is where only the strong survives. This is the war slave Cheong Gwang and Myeong-Wol's Murim Survival Story.",
        "genres": ['Action', 'Martial Arts', 'Adventure', 'Drama', 'Fantasy', 'Tragedy']
    }
]


for idx, val in enumerate(test) :
    valVec = pad_sequences(tokenizer.texts_to_sequences([clean_text(val['description'])]), maxlen = max_len)

    pred = model.predict(valVec)
    pred = (pred > 0.45).astype(int)

    print(f"TEST {idx + 1} : ")

    print("Genre Expected :")
    for genre in sorted(val['genres']) :
        print('\t' + genre)

    print("Genre Predicted :")
    for idx, genre in enumerate(y.columns):
        if pred[0][idx] == 1 :
            print('\t' + genre)
    print("---------------------------")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
TEST 1 : 
Genre Expected :
	Comedy
	Romance
	School Life
	Slice of Life
Genre Predicted :
	Comedy
	Romance
---------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
TEST 2 : 
Genre Expected :
	Action
	Adventure
	Drama
	Fantasy
	Horror
	Philosophical
	Psychological
	Supernatural
	Tragedy
Genre Predicted :
	Action
	Adventure
	Comedy
	Drama
	Psychological
	Thriller
	Tragedy
---------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
TEST 3 : 
Genre Expected :
	Action
	Adventure
	Comedy
	Crime
	Drama
	Fantasy
	Sci-Fi
	Supernatural
Genre Predicted :
	Action
	Adventure
	Drama
	Fantasy
	Romance
---------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
TEST 4 : 
Genre Expected :
	Action
	Drama
	Mafia
	Martial Arts
	Mystery
	Psychological
	Thriller
Genre Predicted :
	Action
	Adventure
	Drama
	Fantasy
-

In [11]:
model.save('modelPredictGenre.keras')