In [1]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
import pickle

In [2]:
df = pd.read_csv('D:/YNOV/M1/NLP/Projet/DataSetMangaGenre.csv')
df = df.drop(columns=['title','status','demographic','content_rating'])

All check on data not being empty done during export

In [3]:
df.head()

Unnamed: 0,description,Action,Adventure,Boys' Love,Comedy,Crime,Drama,Fantasy,Girls' Love,Historical,...,Philosophical,Psychological,Romance,Sci-Fi,Slice of Life,Sports,Superhero,Thriller,Tragedy,Wuxia
0,Shinichi Kudo is a high school detective who s...,1,1,0,1,1,1,0,0,0,...,0,1,1,0,0,0,0,0,0,0
1,Takasu Ryuuji has learned the hard way that ap...,0,0,0,1,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,Rin and his exorcist classmates are caught in ...,1,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Yotsuba is a strange little girl with a big pe...,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,Welcome to a world where mysticism and science...,1,0,0,1,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,0


In [4]:
def clean_text(text):
    text = str(text).lower()  
    text = re.sub(r"http\S+", "", text)  
    text = re.sub(r"[^\w\s]", "", text)
    return text


df['description'] = df['description'].apply(clean_text)

In [5]:
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(df['description'])

y = df.drop(columns=['description'])

In [6]:
y.columns

Index(['Action', 'Adventure', 'Boys' Love', 'Comedy', 'Crime', 'Drama',
       'Fantasy', 'Girls' Love', 'Historical', 'Horror', 'Isekai',
       'Magical Girls', 'Mecha', 'Medical', 'Mystery', 'Philosophical',
       'Psychological', 'Romance', 'Sci-Fi', 'Slice of Life', 'Sports',
       'Superhero', 'Thriller', 'Tragedy', 'Wuxia'],
      dtype='object')

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
model = OneVsRestClassifier(LogisticRegression(max_iter=1000, solver='saga', penalty='l2'))
model.fit(X_train, y_train)

In [17]:
y_pred = model.predict(X_test)

rep = classification_report(y_test, y_pred, target_names=y.columns,output_dict=True)
rep = pd.DataFrame(rep).transpose()
rep[['precision', 'recall', 'f1-score']] = rep[['precision', 'recall', 'f1-score']].round(2)
rep['support'] = rep['support'].astype(int)


print(rep[(rep['precision'] == 0)])

print('-----------------------------------------------')

print(rep[(rep['precision'] != 0)].sort_values(by='support', ascending=False))

               precision  recall  f1-score  support
Boys' Love           0.0     0.0       0.0       71
Crime                0.0     0.0       0.0       97
Magical Girls        0.0     0.0       0.0       23
Medical              0.0     0.0       0.0       20
Philosophical        0.0     0.0       0.0       67
Superhero            0.0     0.0       0.0       25
Wuxia                0.0     0.0       0.0       12
-----------------------------------------------
               precision  recall  f1-score  support
macro avg           0.59    0.18      0.22    10481
micro avg           0.71    0.43      0.53    10481
samples avg         0.67    0.48      0.52    10481
weighted avg        0.72    0.43      0.48    10481
Romance             0.73    0.76      0.74     1624
Comedy              0.66    0.67      0.67     1550
Drama               0.60    0.46      0.52     1364
Action              0.78    0.57      0.66      946
Slice of Life       0.68    0.26      0.37      860
Fantasy         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
test = [
    {
        "description": 'Komi-san is a beautiful and admirable girl that no one can take their eyes off of. Almost the whole school sees her as the cold beauty thats out of their league, but Tadano Hitohito knows the truth: shes just really bad at communicating with others. Komi-san, who wishes to fix this bad habit of hers, tries to improve it with the help of Tadano-kun by achieving her goal of having 100 friends.',
        "genres": ['Romance', 'Comedy', 'School Life', 'Slice of Life']
    },
    {
        "description": "Guts, known as the Black Swordsman, seeks sanctuary from the demonic forces attracted to him and his woman because of a demonic mark on their necks, and also vengeance against the man who branded him as an unholy sacrifice. Aided only by his titanic strength gained from a harsh childhood lived with mercenaries, a gigantic sword, and an iron prosthetic left hand, Guts must struggle against his bleak destiny, all the while fighting with a rage that might strip him of his humanity. Won the 6th Osamu Tezuka Cultural Prize Excellence Award in 2002.",
        "genres": ['Action', 'Psychological', 'Adventure', 'Philosophical', 'Drama', 'Horror', 'Fantasy', 'Supernatural', 'Tragedy']
    },
    {
        "description": "Gol D. Roger, a man referred to as the 'Pirate King,' is set to be executed by the World Government. But just before his demise, he confirms the existence of a great treasure, One Piece, located somewhere within the vast ocean known as the Grand Line. Announcing that One Piece can be claimed by anyone worthy enough to reach it, the Pirate King is executed and the Great Age of Pirates begins. Twenty-two years later, a young man by the name of Monkey D. Luffy is ready to embark on his own adventure, searching for One Piece and striving to become the new Pirate King. Armed with just a straw hat, a small boat, and an elastic body, he sets out on a fantastic journey to gather his own crew and a worthy ship that will take them across the Grand Line to claim the greatest status on the high seas.",
        "genres": ['Sci-Fi', 'Action', 'Comedy', 'Crime', 'Adventure', 'Drama', 'Fantasy', 'Supernatural']
    },
    {
        "description": "There are gamblers out there who even bet their lives as ante. But to secure the integrity of these life-threatening gambles, a violent and powerful organization by the name of “Kakerou” referees these games as a neutral party. Follow Baku Madarame a.k.a. Usogui (The Lie Eater) as he gambles against maniacal opponents at games – such as Escape the Abandoned Building, Old Maid, and Hangman – to ultimately “out-gamble” and control the neutral organization of Kakerou itself.",
        "genres": ['Thriller', 'Action', 'Psychological', 'Martial Arts', 'Mafia', 'Drama', 'Mystery']
    },
    #Description from outside Mangadex
    {
        "description": "The mysterious bug medicine ‘Jungle juice’ made him an insect human. Jang Su-chan, a college student who used to hide his teeth, one day he looked like that. You’ll be seen by everyone, and you’ll be in the insect world. You fight against the weak-kneed logic…",
        "genres": ['Thriller', 'Sci-Fi', 'Action', 'Superhero', 'Horror', 'Fantasy', 'Supernatural']
    },
    {
        "description": "The Murim Warring States period, is where only the strong survives. This is the war slave Cheong Gwang and Myeong-Wol's Murim Survival Story.",
        "genres": ['Action', 'Martial Arts', 'Adventure', 'Drama', 'Fantasy', 'Tragedy']
    }
]


for idx, val in enumerate(test) :
    valVec = vectorizer.transform([clean_text(val['description'])])

    pred = model.predict(valVec)

    print(f"TEST {idx + 1} : ")

    print("Genre Expected :")
    for genre in sorted(val['genres']) :
        print('\t' + genre)

    print("Genre Predicted :")
    for idx, genre in enumerate(y.columns):
        if pred[0][idx] == 1 :
            print('\t' + genre)
    print("---------------------------")


TEST 1 : 
Genre Expected :
	Comedy
	Romance
	School Life
	Slice of Life
Genre Predicted :
	Comedy
	Romance
---------------------------
TEST 2 : 
Genre Expected :
	Action
	Adventure
	Drama
	Fantasy
	Horror
	Philosophical
	Psychological
	Supernatural
	Tragedy
Genre Predicted :
	Action
	Adventure
	Drama
---------------------------
TEST 3 : 
Genre Expected :
	Action
	Adventure
	Comedy
	Crime
	Drama
	Fantasy
	Sci-Fi
	Supernatural
Genre Predicted :
	Action
	Adventure
	Comedy
	Drama
	Fantasy
---------------------------
TEST 4 : 
Genre Expected :
	Action
	Drama
	Mafia
	Martial Arts
	Mystery
	Psychological
	Thriller
Genre Predicted :
	Action
	Drama
---------------------------
TEST 5 : 
Genre Expected :
	Action
	Fantasy
	Horror
	Sci-Fi
	Superhero
	Supernatural
	Thriller
Genre Predicted :
	Comedy
---------------------------
TEST 6 : 
Genre Expected :
	Action
	Adventure
	Drama
	Fantasy
	Martial Arts
	Tragedy
Genre Predicted :
	Action
	Drama
---------------------------


In [21]:
with open('modelPredictGenre - TFIDF.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('tfidfVectorizer - TFIDF.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)