In [1]:
import pandas as pd
import numpy as np
import re

import pandas as pd
from collections import Counter

from nltk.tokenize import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer

from peft import get_peft_model, LoraConfig, TaskType
import gensim
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis.gensim_models
import pyLDAvis

import nltk

# Fetch every NLTK resource the notebook relies on:
#   punkt     -> word_tokenize
#   stopwords -> stopwords.words('english') (previously never downloaded,
#                which raises LookupError on a machine without the corpus)
#   wordnet   -> WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lexil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lexil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Shared NLP resources used by preprocess(). Building them requires the NLTK
# 'wordnet' and 'stopwords' corpora to be available locally.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess(text_description):
    """Normalise a raw description into a single space-joined string of lemmas.

    The text is lowercased, stripped of ASCII punctuation, tokenized with
    NLTK's word_tokenize, filtered against the English stop-word set and
    finally lemmatized.

    Parameters
    ----------
    text_description : str
        Raw description text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    # `string` is not imported by the notebook's import cell, so the original
    # body raised NameError on its first call; import it locally.
    import string

    text_description = text_description.lower()
    # A translation table with a deletion set removes every punctuation character.
    text_description = text_description.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text_description)
    # Stop words are dropped before lemmatizing, so the check is on the surface form.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [3]:
# Read the already-cleaned book table (path is relative to the notebook) and
# show it as the cell output.
allbooksprocessed = pd.read_csv(
    filepath_or_buffer='../data/allbooksprocessed.csv',
)
allbooksprocessed

Unnamed: 0,title,description,genre,published_date,authors,processed_description
0,the silver chair,two english children undergo hairraising adven...,fantasy,1998,clive staples lewis,two english child undergo hairraising adventur...
1,a game of thrones,fantasyroman,fantasy,2011,george r r martin,fantasyroman
2,fablehaven,when kendra and seth go to stay at their grand...,fantasy,2007,brandon mull,kendra seth go stay grandparent estate discove...
3,a wizard of earthsea,originally published in 1968 ursula k le guins...,fantasy,2012,ursula k le guin,originally published 1968 ursula k le guins wi...
4,lodestar,betrayed by one of their closest allies sophie...,fantasy,2017,shannon messenger,betrayed one closest ally sophies whole world ...
...,...,...,...,...,...,...
776,out of the everywhere,topics include astronomy humanity radiation ma...,science fiction,1990,isaac asimov,topic include astronomy humanity radiation mag...
777,quantum shorts,this book presents winning and shortlisted sto...,science fiction,2019,michael brooks jenny hogan puah xin yi,book present winning shortlisted story past ed...
778,novel science,novel science is the first indepth study of th...,science fiction,2013,adelene buckland,novel science first indepth study shocking gro...
779,fantastic voyages,by revealing the facts behind the fiction of s...,science fiction,2006,leroy w dubeck suzanne e moshier judith e boss,revealing fact behind fiction finest film scif...


In [4]:
# Features are the cleaned descriptions; the target is the genre name.
X, y = allbooksprocessed['processed_description'], allbooksprocessed['genre']

# 80/20 split, stratified on genre and seeded so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [5]:
# Bag-of-words baselines: the same logistic-regression classifier is trained
# first on unigram-only counts and then on bigram-only counts.
unigram_count_vectorizer = CountVectorizer()
bigram_count_vectorizer = CountVectorizer(ngram_range=(2, 2))

count_runs = []
for heading, count_vec in (
    ("Unigram Count Summary:\n", unigram_count_vectorizer),
    ("Bigram Count Summary:\n", bigram_count_vectorizer),
):
    fitted = make_pipeline(count_vec, LogisticRegression(max_iter=1000))
    fitted.fit(X_train, y_train)
    guesses = fitted.predict(X_test)
    print(heading, classification_report(y_test, guesses))
    print(f'Accuracy: {accuracy_score(y_test, guesses)}')
    print("Confusion Matrix:\n", confusion_matrix(y_test, guesses))
    count_runs.append((fitted, guesses))

# Expose each fitted pipeline and its predictions under notebook-level names.
(unigram_model, y_pred_uni), (bigram_model, y_pred_bi) = count_runs

Unigram Count Summary:
                     precision    recall  f1-score   support

           fantasy       0.55      0.63      0.59        27
historical fiction       0.64      0.36      0.46        25
           mystery       0.70      0.64      0.67        33
           romance       0.57      0.70      0.63        37
   science fiction       0.78      0.80      0.79        35

          accuracy                           0.64       157
         macro avg       0.65      0.63      0.63       157
      weighted avg       0.65      0.64      0.64       157

Accuracy: 0.643312101910828
Confusion Matrix:
 [[17  3  1  5  1]
 [ 5  9  2  6  3]
 [ 2  0 21  6  4]
 [ 4  2  5 26  0]
 [ 3  0  1  3 28]]
Bigram Count Summary:
                     precision    recall  f1-score   support

           fantasy       0.41      0.33      0.37        27
historical fiction       0.50      0.04      0.07        25
           mystery       0.28      0.70      0.40        33
           romance       0.46  

In [None]:
# Count features over unigrams and bigrams together, feeding logistic regression.
count_pipeline = make_pipeline(
    CountVectorizer(ngram_range=(1, 2)),
    LogisticRegression(max_iter=1000),
)

# Candidate values for the classifier step; make_pipeline names the step
# 'logisticregression', hence the key prefix.
param_grid = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__max_iter': [500, 1000, 1500],
}

# 5-fold cross-validated search scored on accuracy, using every available core.
grid_search = GridSearchCV(
    estimator=count_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# The winning pipeline, kept for the feature inspection that follows.
best_count_pipeline = grid_search.best_estimator_

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Evaluate on the held-out split.
y_pred = grid_search.predict(X_test)
print("Test set classification report:\n", classification_report(y_test, y_pred))
print("Test set accuracy:", accuracy_score(y_test, y_pred))
print("Test set confusion matrix:\n", confusion_matrix(y_test, y_pred))

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best parameters: {'logisticregression__C': 0.1, 'logisticregression__max_iter': 500}
Best cross-validation accuracy: 0.6521806451612903
Test set classification report:
                     precision    recall  f1-score   support

           fantasy       0.57      0.63      0.60        27
historical fiction       0.73      0.32      0.44        25
           mystery       0.68      0.70      0.69        33
           romance       0.54      0.73      0.62        37
   science fiction       0.88      0.80      0.84        35

          accuracy                           0.66       157
         macro avg       0.68      0.64      0.64       157
      weighted avg       0.68      0.66      0.65       157

Test set accuracy: 0.6560509554140127
Test set confusion matrix:
 [[17  2  1  6  1]
 [ 4  8  3  9  1]
 [ 2  0 23  6  2]
 [ 3  1  6 27  0]
 [ 4  0  1  2 28]]


In [7]:
# Pull the fitted steps out of the tuned count pipeline.
classifier = best_count_pipeline.named_steps['logisticregression']
vectorizer = best_count_pipeline.named_steps['countvectorizer']

feature_names = vectorizer.get_feature_names_out()
class_labels = classifier.classes_
num_top_features = 10

# For each genre, list the n-grams with the largest positive coefficients.
for row, class_label in enumerate(class_labels):
    weights = classifier.coef_[row]
    # argsort is ascending, so take the tail and flip it to get largest-first.
    strongest = np.argsort(weights)[-num_top_features:][::-1]
    print(f"\nTop features for class '{class_label}':")
    for col in strongest:
        print(f"  {feature_names[col]}: {weights[col]:.4f}")


Top features for class 'fantasy':
  adventure: 0.3813
  world: 0.2247
  prince: 0.1969
  magical: 0.1843
  edition: 0.1827
  wizard: 0.1801
  moomins: 0.1661
  oz: 0.1615
  six: 0.1608
  evil: 0.1502

Top features for class 'historical fiction':
  war: 0.2825
  young: 0.2186
  life: 0.2179
  soon: 0.1902
  author: 0.1870
  london: 0.1841
  love: 0.1837
  family: 0.1704
  tribe: 0.1575
  marriage: 0.1397

Top features for class 'mystery':
  murder: 0.5240
  mystery: 0.3848
  death: 0.2427
  wife: 0.2132
  killer: 0.2107
  old: 0.1992
  nancy: 0.1942
  marple: 0.1811
  miss: 0.1770
  crime: 0.1726

Top features for class 'romance':
  love: 0.4142
  heart: 0.2794
  bestselling: 0.2039
  text: 0.1806
  relationship: 0.1679
  romantic: 0.1549
  mother: 0.1533
  price: 0.1518
  time bestselling: 0.1516
  sister: 0.1458

Top features for class 'science fiction':
  fiction: 0.6083
  science: 0.5391
  work: 0.2735
  book: 0.2320
  study: 0.2126
  literary: 0.2100
  future: 0.2082
  explores: 0

In [8]:
# TF-IDF baselines: same classifier as the count models, trained on
# unigram-only and then bigram-only TF-IDF features.
unigram_tfidf_vectorizer = TfidfVectorizer()
bigram_tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))

tfidf_runs = []
for banner, tfidf_vec in (
    ("Unigram TFIDF Summary:\n", unigram_tfidf_vectorizer),
    ("Bigram TFIDF Summary:\n", bigram_tfidf_vectorizer),
):
    trained = make_pipeline(tfidf_vec, LogisticRegression(max_iter=1000))
    trained.fit(X_train, y_train)
    predicted_genres = trained.predict(X_test)
    print(banner, classification_report(y_test, predicted_genres))
    print(f'Accuracy: {accuracy_score(y_test, predicted_genres)}')
    print("Confusion Matrix:\n", confusion_matrix(y_test, predicted_genres))
    tfidf_runs.append((trained, predicted_genres))

# These names now refer to the TF-IDF models and their predictions.
(unigram_model, y_pred_uni), (bigram_model, y_pred_bi) = tfidf_runs


Unigram TFIDF Summary:
                     precision    recall  f1-score   support

           fantasy       0.84      0.59      0.70        27
historical fiction       0.67      0.16      0.26        25
           mystery       0.68      0.70      0.69        33
           romance       0.50      0.81      0.62        37
   science fiction       0.84      0.91      0.88        35

          accuracy                           0.67       157
         macro avg       0.71      0.63      0.63       157
      weighted avg       0.70      0.67      0.65       157

Accuracy: 0.6687898089171974
Confusion Matrix:
 [[16  1  1  8  1]
 [ 2  4  3 15  1]
 [ 0  0 23  6  4]
 [ 0  1  6 30  0]
 [ 1  0  1  1 32]]
Bigram TFIDF Summary:
                     precision    recall  f1-score   support

           fantasy       0.80      0.15      0.25        27
historical fiction       0.50      0.04      0.07        25
           mystery       0.60      0.27      0.38        33
           romance       0.35 

In [None]:
# TF-IDF features over unigrams and bigrams together, feeding logistic regression.
tfidf_pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    LogisticRegression(max_iter=1000),
)

# Candidate values for the 'logisticregression' step of the pipeline.
param_grid = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__max_iter': [500, 1000, 1500],
}

# 5-fold cross-validated search scored on accuracy, using every available core.
grid_search = GridSearchCV(
    estimator=tfidf_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# NOTE(review): despite its name, `best_count_pipeline` holds the tuned TF-IDF
# pipeline from here on, replacing the count-based one. The feature-inspection
# cell that follows reads this name, so it is kept; consider renaming both
# together (e.g. `best_tfidf_pipeline`).
best_count_pipeline = grid_search.best_estimator_

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Evaluate on the held-out split.
y_pred = grid_search.predict(X_test)
print("Test set classification report:\n", classification_report(y_test, y_pred))
print("Test set accuracy:", accuracy_score(y_test, y_pred))
print("Test set confusion matrix:\n", confusion_matrix(y_test, y_pred))

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best parameters: {'logisticregression__C': 10, 'logisticregression__max_iter': 500}
Best cross-validation accuracy: 0.6939096774193547
Test set classification report:
                     precision    recall  f1-score   support

           fantasy       0.77      0.63      0.69        27
historical fiction       0.68      0.52      0.59        25
           mystery       0.72      0.70      0.71        33
           romance       0.60      0.73      0.66        37
   science fiction       0.82      0.91      0.86        35

          accuracy                           0.71       157
         macro avg       0.72      0.70      0.70       157
      weighted avg       0.72      0.71      0.71       157

Test set accuracy: 0.7133757961783439
Test set confusion matrix:
 [[17  3  1  5  1]
 [ 2 13  2  6  2]
 [ 0  0 23  6  4]
 [ 2  3  5 27  0]
 [ 1  0  1  1 32]]


In [10]:
# Fitted steps of the tuned TF-IDF pipeline (stored under `best_count_pipeline`).
fitted_steps = best_count_pipeline.named_steps
vectorizer = fitted_steps['tfidfvectorizer']
classifier = fitted_steps['logisticregression']

feature_names = vectorizer.get_feature_names_out()

num_top_features = 10
class_labels = classifier.classes_

# Print, per genre, the n-grams carrying the largest positive coefficients.
for position, class_label in enumerate(class_labels):
    genre_coefs = classifier.coef_[position]
    ascending = np.argsort(genre_coefs)
    print(f"\nTop features for class '{class_label}':")
    # Walk the last `num_top_features` entries backwards: largest weight first.
    for feature_idx in ascending[-num_top_features:][::-1]:
        print(f"  {feature_names[feature_idx]}: {genre_coefs[feature_idx]:.4f}")


Top features for class 'fantasy':
  adventure: 3.0033
  oz: 1.9245
  wizard: 1.8062
  eighteen science: 1.7420
  prince: 1.7226
  magical: 1.6951
  moomins: 1.6166
  eighteen: 1.6002
  publisher description: 1.5910
  edition: 1.5572

Top features for class 'historical fiction':
  war: 2.2590
  family: 1.6721
  life: 1.6308
  young: 1.4651
  london: 1.4537
  woman: 1.3761
  love: 1.3640
  tribe: 1.2780
  author: 1.2405
  make: 1.1650

Top features for class 'mystery':
  murder: 3.8512
  mystery: 2.7435
  fantasyroman: 2.4254
  death: 1.8694
  killer: 1.7524
  marple: 1.6772
  nancy: 1.5984
  book: 1.5567
  wife: 1.4905
  miss marple: 1.4818

Top features for class 'romance':
  love: 3.1383
  heart: 2.0364
  bestselling: 1.5710
  romantic: 1.5150
  shes: 1.3683
  time bestselling: 1.3146
  sister: 1.2902
  bestselling author: 1.2578
  york time: 1.2398
  beach: 1.2282

Top features for class 'science fiction':
  science: 5.4166
  fiction: 5.1131
  future: 2.0265
  work: 1.9857
  study: 

In [11]:
# Re-display the table to check its columns before the transformer section.
allbooksprocessed

Unnamed: 0,title,description,genre,published_date,authors,processed_description
0,the silver chair,two english children undergo hairraising adven...,fantasy,1998,clive staples lewis,two english child undergo hairraising adventur...
1,a game of thrones,fantasyroman,fantasy,2011,george r r martin,fantasyroman
2,fablehaven,when kendra and seth go to stay at their grand...,fantasy,2007,brandon mull,kendra seth go stay grandparent estate discove...
3,a wizard of earthsea,originally published in 1968 ursula k le guins...,fantasy,2012,ursula k le guin,originally published 1968 ursula k le guins wi...
4,lodestar,betrayed by one of their closest allies sophie...,fantasy,2017,shannon messenger,betrayed one closest ally sophies whole world ...
...,...,...,...,...,...,...
776,out of the everywhere,topics include astronomy humanity radiation ma...,science fiction,1990,isaac asimov,topic include astronomy humanity radiation mag...
777,quantum shorts,this book presents winning and shortlisted sto...,science fiction,2019,michael brooks jenny hogan puah xin yi,book present winning shortlisted story past ed...
778,novel science,novel science is the first indepth study of th...,science fiction,2013,adelene buckland,novel science first indepth study shocking gro...
779,fantastic voyages,by revealing the facts behind the fiction of s...,science fiction,2006,leroy w dubeck suzanne e moshier judith e boss,revealing fact behind fiction finest film scif...


In [12]:
# Rename the model input/target columns to "text" and "label".
# NOTE(review): this rebinds `allbooksprocessed`, so earlier cells that read
# 'processed_description' / 'genre' will fail if re-run after this one.
allbooksprocessed = allbooksprocessed.rename(columns={"processed_description": "text", "genre": "label"})

# Keep the fitted encoder instead of discarding it: `label_encoder.classes_`
# and `label_encoder.inverse_transform(...)` map the integer ids back to genre
# names when reading the transformer's reports.
label_encoder = LabelEncoder()
allbooksprocessed['label'] = label_encoder.fit_transform(allbooksprocessed['label'])
allbooksprocessed


Unnamed: 0,title,description,label,published_date,authors,text
0,the silver chair,two english children undergo hairraising adven...,0,1998,clive staples lewis,two english child undergo hairraising adventur...
1,a game of thrones,fantasyroman,0,2011,george r r martin,fantasyroman
2,fablehaven,when kendra and seth go to stay at their grand...,0,2007,brandon mull,kendra seth go stay grandparent estate discove...
3,a wizard of earthsea,originally published in 1968 ursula k le guins...,0,2012,ursula k le guin,originally published 1968 ursula k le guins wi...
4,lodestar,betrayed by one of their closest allies sophie...,0,2017,shannon messenger,betrayed one closest ally sophies whole world ...
...,...,...,...,...,...,...
776,out of the everywhere,topics include astronomy humanity radiation ma...,4,1990,isaac asimov,topic include astronomy humanity radiation mag...
777,quantum shorts,this book presents winning and shortlisted sto...,4,2019,michael brooks jenny hogan puah xin yi,book present winning shortlisted story past ed...
778,novel science,novel science is the first indepth study of th...,4,2013,adelene buckland,novel science first indepth study shocking gro...
779,fantastic voyages,by revealing the facts behind the fiction of s...,4,2006,leroy w dubeck suzanne e moshier judith e boss,revealing fact behind fiction finest film scif...


In [13]:
# Count the distinct encoded genres (missing values, if any, are not counted).
num_labels = allbooksprocessed['label'].nunique(dropna=True)
num_labels


5

In [14]:
# Convert the pandas table into a `datasets.Dataset`.
dataset = Dataset.from_pandas(allbooksprocessed)

In [15]:
# Hold out 20% of the rows for evaluation. The seed fixes the shuffle so the
# split (and every metric computed from it) is the same on each run, matching
# the seeded scikit-learn split used for the classical models.
# NOTE(review): re-running this cell alone fails, because `dataset` is then
# already the split result rather than a single Dataset.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [16]:
# Inspect the training split (its columns and row count).
dataset['train']

Dataset({
    features: ['title', 'description', 'label', 'published_date', 'authors', 'text'],
    num_rows: 624
})

In [17]:
# Bind the two splits to their own names for the tokenization and training steps.
train_dataset, test_dataset = dataset['train'], dataset['test']

In [18]:
# Load the pretrained fast tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

In [19]:
def _tokenize_batch(batch):
    """Tokenize the 'text' entries of a batch with max-length padding and truncation."""
    return tokenizer(batch['text'], padding="max_length", truncation=True)


# Apply the same tokenization to both splits, one batch at a time.
train_dataset = train_dataset.map(_tokenize_batch, batched=True)
test_dataset = test_dataset.map(_tokenize_batch, batched=True)

  obj.co_lnotab,  # for < python 3.10 [not counted in args]


Map:   0%|          | 0/624 [00:00<?, ? examples/s]

  obj.co_lnotab,  # for < python 3.10 [not counted in args]


Map:   0%|          | 0/157 [00:00<?, ? examples/s]

In [20]:
# Return only the model inputs and the target, as torch tensors, for both splits.
torch_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, test_dataset):
    split.set_format("torch", columns=torch_columns)

In [21]:
# Load pretrained DistilBERT for sequence classification, configured for `num_labels` classes.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Fine-tuning settings: 5 epochs, weight decay 0.01, no experiment-tracker reporting.
# NOTE(review): no output_dir is passed; some transformers releases require
# one - confirm against the installed version.
training = TrainingArguments(
    report_to='none',
    num_train_epochs=5,
    weight_decay=0.01,
)

In [23]:
# Bundle the model, the training settings and the tokenized training split.
# No eval dataset or metric function is attached; evaluation is done by hand
# further down.
training_object = Trainer(model=model, args=training, train_dataset=train_dataset)

In [None]:
# Fine-tune the model on the training split using the settings in `training`.
training_object.train()



Step,Training Loss


In [None]:
# Run the trained model over the held-out split; the result's `.predictions`
# and `.label_ids` are read in the following cells.
predicted = training_object.predict(test_dataset)

In [None]:
# Raw model outputs and the ground-truth ids for the test split.
# NOTE(review): `prediction_labels` holds the raw prediction scores (they are
# argmax-ed later), not label ids.
prediction_labels, true_labels = predicted.predictions, predicted.label_ids

In [None]:
# Predicted class id = index of the highest score along the last axis.
raw_scores = predicted.predictions
preds = np.argmax(raw_scores, axis=-1)

In [None]:
# Summarise DistilBERT's test performance; classes appear as integer label ids.
bert_report = classification_report(true_labels, preds)
bert_accuracy = accuracy_score(true_labels, preds)
bert_confusion = confusion_matrix(true_labels, preds)

print("Test set classification report:\n", bert_report)
print("Test set accuracy:", bert_accuracy)
print("Test set confusion matrix:\n", bert_confusion)

In [None]:
# Same metrics again, printed without headings (confusion matrix first).
for metric in (confusion_matrix, classification_report):
    print(metric(true_labels, preds))

Fantasy - pretty strong, with fairly distinct words.
Historical fiction - worst performance at 52%; possible crossover with other genres through words like love, war, family.
Mystery - good, with some possible crossover (crime/love terms could be getting mixed up with romance or historical fiction).
Romance - good.
Sci-Fi - best, likely because the vocabulary is pretty distinct.

Next steps - try getting more books? 
attempt to look at bi-grams 
topic modeling -- LDA, visualised with pyLDAvis
LoRA ?
Jaccard similarity between genres?

In [None]:
def preprocess(text):
    """Lowercase and tokenize `text`, returning the kept tokens as a list.

    A token is kept only if it is purely alphabetic and is not in `stop_words`.

    NOTE(review): this rebinds the name `preprocess`, which an earlier cell
    defines with a different return type (a joined string); consider a
    distinct name for this list-returning variant.
    """
    kept = []
    for candidate in word_tokenize(text.lower()):
        if not candidate.isalpha():
            continue
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept


# Token lists for topic modelling, stored alongside the cleaned text.
allbooksprocessed['processed_text'] = allbooksprocessed['text'].apply(preprocess)

In [None]:
# One token list per book.
texts = allbooksprocessed['processed_text'].tolist()

# Map each token to an integer id, then encode every book as a bag of words.
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# Five topics, matching the number of genre labels in the dataset.
num_topics = 5

# random_state seeds the model so repeated runs start from the same state;
# without it the topics change on every execution. With several workers some
# run-to-run variation may remain.
lda_model = LdaMulticore(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=10,
    workers=2,
    random_state=42,
)

# Show the leading weighted terms of every topic.
for idx, topic in lda_model.print_topics(num_topics):
    print(f"Topic {idx}: {topic}")

fantasy
historical fiction
mystery
romance
science fiction

In [None]:
# Build the pyLDAvis data for the fitted LDA model from the corpus and dictionary.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Render the prepared visualisation as the cell output.
pyLDAvis.display(vis)

In [None]:
# Print the 'Category' column of the prepared topic_info table as a plain list.
print(vis.topic_info['Category'].tolist())

