In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [55]:
df=pd.read_csv("/kaggle/input/mangaverse/manga.csv")
df=df.drop(columns=["Unnamed: 0","id","sub_title","poster_path","authors"])

In [56]:
df.sample()

Unnamed: 0,title,status,summary,type,genres,nsfw,total_chapter
416,MookHyang - Dark Lady,ongoing,As the master of the School of Unholy Arts and...,korea,"['Action', 'Fantasy', 'Martial Arts', 'Adventu...",True,211


In [57]:
df.nunique()

title            685
status             5
summary          674
type               3
genres           529
nsfw               2
total_chapter    223
dtype: int64

In [58]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [59]:
def data_preprocessing(text):
    if isinstance(text,str):
        da=text.lower()
        da_re=re.sub(r'\W+',' ',da)
        da_re_tokenize=word_tokenize(da_re)
        tokens=[word for word in da_re_tokenize if word not in stopwords.words('english')]
        token=" ".join(tokens)
        return token
    else:
        
        return " "

In [60]:
df["genres"]=df["genres"].apply(data_preprocessing)

In [61]:
df["title"]=df["title"].apply(data_preprocessing)

In [62]:
df["summary"]=df["summary"].apply(data_preprocessing)

In [63]:
df.dtypes

title            object
status           object
summary          object
type             object
genres           object
nsfw               bool
total_chapter     int64
dtype: object

In [64]:
one_hot=OneHotEncoder(sparse_output=False)
one=one_hot.fit_transform(df[["status","nsfw"]])
one_df=pd.DataFrame(one,columns=one_hot.get_feature_names_out())
one_df["total_chapter"]=df["total_chapter"]

In [65]:
# one_df

In [66]:
df['nsfw'].value_counts()

nsfw
False    538
True     162
Name: count, dtype: int64

In [67]:
df["type"].value_counts()

type
korea    273
japan    260
china    167
Name: count, dtype: int64

In [68]:
df

Unnamed: 0,title,status,summary,type,genres,nsfw,total_chapter
0,spirit sword master,ongoing,spirit sword roars within heaven earth three w...,china,action fantasy martial arts,False,455
1,release witch,ongoing,male engineer transmigrated another world beca...,china,action fantasy harem drama supernatural advent...,False,539
2,ultimate scheming system,ongoing,scamming day every day often venturing alone b...,china,action fantasy martial arts adventure isekai,False,638
3,martial god asura,ongoing,one night mysterious unexplained phenomenon oc...,china,action harem romance martial arts adventure,False,808
4,dragon body,ongoing,introducing generation celestial masters retur...,china,action school life martial arts,False,636
...,...,...,...,...,...,...,...
695,isekai kenja tensei musou geemu chishiki de is...,ongoing,young man unfortunately died modern era reinca...,japan,action fantasy harem drama comedy adventure is...,False,0
696,kaiko sareta ankoku heishi 30 dai slow na seco...,ongoing,dark soldier dariel 30 years old use magic dem...,japan,action fantasy romance comedy slice life adven...,False,0
697,legendary moonlight sculptor,,man forsaken world man slave money man known l...,korea,action fantasy romance drama comedy slice life...,False,0
698,fullmetal alchemist,Completed,humankind gain anything without first giving s...,japan,action fantasy manga shounen drama supernatura...,False,0


In [69]:
tf_df=[]
tf_feature_name=[]
for col in ["summary","genres"]:
    tac=df[col].astype(str).tolist()
    tfidf= TfidfVectorizer()
    tf_tf=tfidf.fit_transform(tac)
    tf_df.append(tf_tf.toarray())
    tf_feature_name.append(tfidf.get_feature_names_out())

In [70]:
# tf_df

In [71]:
tf_pd_df=[]
for i,values in enumerate(tf_df):
    tf_pd_df.append(pd.DataFrame(values,columns=tf_feature_name[i]))

In [72]:
concaty=pd.concat(tf_pd_df,axis=1).reset_index(drop=True)

In [73]:
concaty_2=pd.concat([concaty,one_df],axis=1)

In [74]:
label_encod=LabelEncoder()
Y=label_encod.fit_transform(df[["type"]])
# Y=

  y = column_or_1d(y, warn=True)


In [75]:
X_train,X_test,Y_train,Y_test=train_test_split(concaty_2,Y,test_size=0.2,random_state=42)

In [76]:
random_classif=RandomForestClassifier(n_estimators=50)
random_classif.fit(X_train,Y_train)
random_classif.score(X_test,Y_test)

0.7357142857142858

In [77]:
grad_classif=GradientBoostingClassifier()
grad_classif.fit(X_train,Y_train)
grad_classif.score(X_test,Y_test)

0.8571428571428571

In [78]:
svc_classif=SVC()
svc_classif.fit(X_train,Y_train)
svc_classif.score(X_test,Y_test)

0.5428571428571428

In [79]:
ada_classif=AdaBoostClassifier()
ada_classif.fit(X_train,Y_train)
ada_classif.score(X_test,Y_test)

0.7071428571428572

In [80]:
bagg_classif=BaggingClassifier()
bagg_classif.fit(X_train,Y_train)
bagg_classif.score(X_test,Y_test)

0.8071428571428572

# # Deep learning

In [96]:
from keras.layers import Dense,Input,Dropout,BatchNormalization
from keras.initializers import HeNormal
from keras.regularizers import l1_l2
from keras.models import Model
from keras.callbacks import EarlyStopping

In [82]:
X_train.shape

(560, 7225)

In [97]:
early_stopping=EarlyStopping(
monitor='val_loss',
patience=5,
verbose=1)

In [108]:
inputs=Input(shape=(7225,))
d=Dense(32,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(inputs)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(32,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)

d=Dense(128,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(128,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)
d=Dense(502,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(502,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(32,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(256,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(256,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)



d=Dense(128,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(128,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(32,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(32,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(502,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(502,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(128,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(128,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(256,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(256,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(32,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(502,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(502,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(32,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(32,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(128,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(128,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(502,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(502,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(256,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(256,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(1024,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dropout(0.5)(d)
d=BatchNormalization()(d)

d=Dense(32,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(64,activation="relu",kernel_initializer=HeNormal,kernel_regularizer=l1_l2(0.01))(d)
d=Dense(5,activation="softmax")(d)
outputs=Dropout(0.5)(d)

modelD15=Model(inputs,outputs)
modelD15.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=["accuracy"])

In [109]:
modelD15.fit(X_train,Y_train,epochs=50,batch_size=32,validation_data=(X_test,Y_test),callbacks=[early_stopping])

Epoch 1/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 215ms/step - accuracy: 0.1245 - loss: 3354.5344 - val_accuracy: 0.0000e+00 - val_loss: 3232.8518
Epoch 2/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 90ms/step - accuracy: 0.1305 - loss: 3207.6504 - val_accuracy: 0.0000e+00 - val_loss: 3101.1897
Epoch 3/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 89ms/step - accuracy: 0.1622 - loss: 3074.2502 - val_accuracy: 0.0000e+00 - val_loss: 2970.4426
Epoch 4/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 89ms/step - accuracy: 0.2249 - loss: 2944.6975 - val_accuracy: 0.4000 - val_loss: 2844.4465
Epoch 5/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 97ms/step - accuracy: 0.2444 - loss: 2821.2844 - val_accuracy: 0.4000 - val_loss: 2727.5732
Epoch 5: early stopping


<keras.src.callbacks.history.History at 0x79ff14ea25f0>

In [110]:
loss,accuracy=modelD15.evaluate(X_test,Y_test)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.4085 - loss: 2727.5518
