In [1]:
import pandas as pd
import numpy as np

import pandas as pd
from collections import Counter

from nltk.tokenize import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer

from peft import get_peft_model, LoraConfig, TaskType

import nltk
nltk.download('punkt')

import nltk
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lexil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lexil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text_description):
    text_description = text_description.lower()
    text_description = text_description.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text_description)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [3]:
allbooksprocessed = pd.read_csv('../data/allbooksprocessed.csv')
allbooksprocessed

Unnamed: 0,title,description,genre,published_date,authors,processed_description
0,the silver chair,two english children undergo hairraising adven...,fantasy,1998,clive staples lewis,two english child undergo hairraising adventur...
1,a game of thrones,fantasyroman,fantasy,2011,george r r martin,fantasyroman
2,fablehaven,when kendra and seth go to stay at their grand...,fantasy,2007,brandon mull,kendra seth go stay grandparent estate discove...
3,a wizard of earthsea,originally published in 1968 ursula k le guins...,fantasy,2012,ursula k le guin,originally published 1968 ursula k le guins wi...
4,lodestar,betrayed by one of their closest allies sophie...,fantasy,2017,shannon messenger,betrayed one closest ally sophies whole world ...
...,...,...,...,...,...,...
776,out of the everywhere,topics include astronomy humanity radiation ma...,science fiction,1990,isaac asimov,topic include astronomy humanity radiation mag...
777,quantum shorts,this book presents winning and shortlisted sto...,science fiction,2019,michael brooks jenny hogan puah xin yi,book present winning shortlisted story past ed...
778,novel science,novel science is the first indepth study of th...,science fiction,2013,adelene buckland,novel science first indepth study shocking gro...
779,fantastic voyages,by revealing the facts behind the fiction of s...,science fiction,2006,leroy w dubeck suzanne e moshier judith e boss,revealing fact behind fiction finest film scif...


In [4]:
X = allbooksprocessed['processed_description']
y = allbooksprocessed['genre']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [5]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [6]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

In [7]:
y_pred = model.predict(X_test_tfidf)

print("Classification Report:\n", classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
                     precision    recall  f1-score   support

           fantasy       0.85      0.63      0.72        27
historical fiction       0.78      0.28      0.41        25
           mystery       0.72      0.70      0.71        33
           romance       0.52      0.81      0.63        37
   science fiction       0.84      0.91      0.88        35

          accuracy                           0.69       157
         macro avg       0.74      0.67      0.67       157
      weighted avg       0.73      0.69      0.68       157

Accuracy: 0.6942675159235668
Confusion Matrix:
 [[17  1  1  7  1]
 [ 2  7  1 14  1]
 [ 0  0 23  6  4]
 [ 0  1  6 30  0]
 [ 1  0  1  1 32]]


In [8]:
feature_names = vectorizer.get_feature_names_out()
coefs = model.coef_

for i, genre in enumerate(model.classes_):
    top_features = np.argsort(coefs[i])[-10:]  # Top 10 features
    print(f"Top words in {genre}: {[feature_names[j] for j in top_features]}")

Top words in fantasy: ['world', 'narnia', 'six', 'edition', 'moomins', 'prince', 'magical', 'wizard', 'oz', 'adventure']
Top words in historical fiction: ['soon', 'tribe', 'love', 'author', 'woman', 'london', 'young', 'life', 'family', 'war']
Top words in mystery: ['miss', 'wife', 'nancy', 'crime', 'marple', 'killer', 'death', 'fantasyroman', 'mystery', 'murder']
Top words in romance: ['town', 'beach', 'shes', 'arrangement', 'york', 'price', 'romantic', 'bestselling', 'heart', 'love']
Top words in science fiction: ['form', 'explores', 'literature', 'space', 'literary', 'study', 'work', 'future', 'fiction', 'science']


In [9]:
vect = CountVectorizer(max_features=5000)
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

In [10]:
nb = MultinomialNB().fit(X_train_count, y_train)

y_pred = nb.predict(X_test_count)

In [11]:
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6369426751592356
[[11  0  1 11  4]
 [ 1  2  0 19  3]
 [ 0  0 22  7  4]
 [ 0  1  3 32  1]
 [ 1  0  1  0 33]]


In [12]:
feature_names = vectorizer.get_feature_names_out()
coefs = model.coef_

for i, genre in enumerate(model.classes_):
    top_features = np.argsort(coefs[i])[-10:]  # Top 10 features
    print(f"Top words in {genre}: {[feature_names[j] for j in top_features]}")

Top words in fantasy: ['world', 'narnia', 'six', 'edition', 'moomins', 'prince', 'magical', 'wizard', 'oz', 'adventure']
Top words in historical fiction: ['soon', 'tribe', 'love', 'author', 'woman', 'london', 'young', 'life', 'family', 'war']
Top words in mystery: ['miss', 'wife', 'nancy', 'crime', 'marple', 'killer', 'death', 'fantasyroman', 'mystery', 'murder']
Top words in romance: ['town', 'beach', 'shes', 'arrangement', 'york', 'price', 'romantic', 'bestselling', 'heart', 'love']
Top words in science fiction: ['form', 'explores', 'literature', 'space', 'literary', 'study', 'work', 'future', 'fiction', 'science']


In [13]:
allbooksprocessed

Unnamed: 0,title,description,genre,published_date,authors,processed_description
0,the silver chair,two english children undergo hairraising adven...,fantasy,1998,clive staples lewis,two english child undergo hairraising adventur...
1,a game of thrones,fantasyroman,fantasy,2011,george r r martin,fantasyroman
2,fablehaven,when kendra and seth go to stay at their grand...,fantasy,2007,brandon mull,kendra seth go stay grandparent estate discove...
3,a wizard of earthsea,originally published in 1968 ursula k le guins...,fantasy,2012,ursula k le guin,originally published 1968 ursula k le guins wi...
4,lodestar,betrayed by one of their closest allies sophie...,fantasy,2017,shannon messenger,betrayed one closest ally sophies whole world ...
...,...,...,...,...,...,...
776,out of the everywhere,topics include astronomy humanity radiation ma...,science fiction,1990,isaac asimov,topic include astronomy humanity radiation mag...
777,quantum shorts,this book presents winning and shortlisted sto...,science fiction,2019,michael brooks jenny hogan puah xin yi,book present winning shortlisted story past ed...
778,novel science,novel science is the first indepth study of th...,science fiction,2013,adelene buckland,novel science first indepth study shocking gro...
779,fantastic voyages,by revealing the facts behind the fiction of s...,science fiction,2006,leroy w dubeck suzanne e moshier judith e boss,revealing fact behind fiction finest film scif...


In [14]:
allbooksprocessed = allbooksprocessed.rename(columns={"processed_description": "text", "genre": "label"})
allbooksprocessed['label'] = LabelEncoder().fit_transform(allbooksprocessed['label'])
allbooksprocessed


Unnamed: 0,title,description,label,published_date,authors,text
0,the silver chair,two english children undergo hairraising adven...,0,1998,clive staples lewis,two english child undergo hairraising adventur...
1,a game of thrones,fantasyroman,0,2011,george r r martin,fantasyroman
2,fablehaven,when kendra and seth go to stay at their grand...,0,2007,brandon mull,kendra seth go stay grandparent estate discove...
3,a wizard of earthsea,originally published in 1968 ursula k le guins...,0,2012,ursula k le guin,originally published 1968 ursula k le guins wi...
4,lodestar,betrayed by one of their closest allies sophie...,0,2017,shannon messenger,betrayed one closest ally sophies whole world ...
...,...,...,...,...,...,...
776,out of the everywhere,topics include astronomy humanity radiation ma...,4,1990,isaac asimov,topic include astronomy humanity radiation mag...
777,quantum shorts,this book presents winning and shortlisted sto...,4,2019,michael brooks jenny hogan puah xin yi,book present winning shortlisted story past ed...
778,novel science,novel science is the first indepth study of th...,4,2013,adelene buckland,novel science first indepth study shocking gro...
779,fantastic voyages,by revealing the facts behind the fiction of s...,4,2006,leroy w dubeck suzanne e moshier judith e boss,revealing fact behind fiction finest film scif...


In [15]:
num_labels = allbooksprocessed['label'].nunique()
num_labels


5

In [16]:
dataset = Dataset.from_pandas(allbooksprocessed)

In [17]:
dataset = dataset.train_test_split(test_size=0.2)

In [18]:
dataset['train']

Dataset({
    features: ['title', 'description', 'label', 'published_date', 'authors', 'text'],
    num_rows: 624
})

In [19]:
train_dataset = dataset['train']
test_dataset = dataset['test']

In [20]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

In [21]:
train_dataset = train_dataset.map(lambda df: tokenizer(df['text'], padding="max_length", truncation=True), batched=True)
test_dataset = test_dataset.map(lambda df: tokenizer(df['text'], padding="max_length", truncation=True), batched=True)

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/157 [00:00<?, ? examples/s]

In [22]:
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [23]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
training = TrainingArguments(num_train_epochs = 5,
                            weight_decay = 0.01,
                            report_to = 'none')

In [25]:
training_object = Trainer(
    model = model,
    args = training,
    train_dataset = train_dataset
)

In [None]:
training_object.train()



Step,Training Loss
