In [1]:
import pandas as pd
import pathlib2 as pathlib

In [2]:
cwd=pathlib.Path.cwd()
datapath=cwd.joinpath('data')
datawreviewsfile=datapath.joinpath('processed/innerJoinData.csv')
dataworeviewsfile=datapath.joinpath('raw/collaborative_book_metadata_with_genredummies.csv')

In [3]:
datadf=pd.read_csv(datawreviewsfile,sep=';')
fulldatadf=pd.read_csv(dataworeviewsfile,sep=';')

In [4]:
# datadf.head()

In [5]:
fulldatadf.head()

Unnamed: 0,book_id,title,image_url,url,num_pages,ratings_count,description,genre,name,book_id_mapping,...,graphic,historicalfiction,history,mystery,nonfiction,paranormal,poetry,romance,thriller,youngadult
0,5899779,Pride and Prejudice and Zombies Pride and Prej...,https://images.gr-assets.com/books/1320449653m...,https://www.goodreads.com/book/show/5899779-pr...,320,105537,The New York Times Best Seller is now a major ...,"['fantasy', 'paranormal', 'romance', 'fiction'...",Jane Austen,808,...,0,1,1,1,0,1,0,1,1,1
1,872333,Blue Bloods Blue Bloods 1,https://images.gr-assets.com/books/1322281515m...,https://www.goodreads.com/book/show/872333.Blu...,302,117633,"When the Mayflower set sail in 1620, it carrie...","['youngadult', 'fantasy', 'paranormal', 'roman...",Melissa de la Cruz,217,...,0,0,0,1,0,1,0,1,1,1
2,15507958,Me Before You Me Before You 1,https://images.gr-assets.com/books/1357108762m...,https://www.goodreads.com/book/show/15507958-m...,369,609327,Louisa Clark is an ordinary young woman living...,"['romance', 'fiction']",Jojo Moyes,385,...,0,0,0,0,0,0,0,1,0,0
3,66559,Sharp Objects,https://images.gr-assets.com/books/1423241485m...,https://www.goodreads.com/book/show/66559.Shar...,254,208394,"Fresh from a brief stay at a psych hospital, r...","['mystery', 'thriller', 'crime', 'fiction']",Gillian Flynn,192,...,0,0,0,1,0,0,0,0,1,0
4,7235533,The Way of Kings The Stormlight Archive 1,https://images.gr-assets.com/books/1507307887m...,https://www.goodreads.com/book/show/7235533-th...,1007,151473,"Speak again the ancient oaths,\nLife before de...","['fantasy', 'paranormal', 'fiction']",Brandon Sanderson,873,...,0,0,0,0,0,1,0,0,0,0


## group books via genres
- vectorize genre-list, countvectorizer vs tfidf (pos: fiction genre very prevalent, thus less relevant in distinction of books)
    - cluster genres (uneven size clusters? not very distinct? fuzzy clustering instead?)
    - topic modeling (lda)

## group books via blurb content
- vectorize, topics via topic modeling
-  tfidf -> lda
    - evaluate via topic coherence
- https://github.com/kapadias/medium-articles/blob/master/natural-language-processing/topic-modeling/Evaluate%20Topic%20Models.ipynb
- https://medium.com/@walter_sperat/using-optuna-with-sklearn-the-right-way-part-1-6b4ad0ab2451
- https://learn-scikit.oneoffcoder.com/optuna.html
- https://learn-scikit.oneoffcoder.com/gensim.html
- https://stackoverflow.com/questions/60613532/how-do-i-calculate-the-coherence-score-of-an-sklearn-lda-model
-


In [6]:
from optuna import Trial, create_study
from optuna.pruners import SuccessiveHalvingPruner
# import sklearn
from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import make_scorer

# from sklearn.model_selection import cross_val_score, KFold

from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
from gensim.parsing.preprocessing import STOPWORDS

import numpy as np
import pandas as pd

import joblib
randomstate=313


### description
pipeline: tfidf, lda

- no crossvalidation in optuna when evaluating topics based on gensim coherence c_v, as scikitlearns make_scorer() only works with y_true to evaluate agains (only supervised clustering), this is not ideal, different 

In [7]:
#pipe = Pipeline([('tfidf',TfidfVectorizer()),('lda',LatentDirichletAllocation())])

In [8]:
def topiccoherencescorer(pipe,X):
    n_top_words = 15 #higher value for higher coherence, i.e. more word to make connections for coherence
    topics=pipe.named_steps.lda.components_
    texts=[[word for word in doc.split()] for doc in X]
    dictionary=corpora.Dictionary(texts)
    corpus=[dictionary.doc2bow(text) for text in texts]
    feature_names = [dictionary[i] for i in range(len(dictionary))]
    top_words = []
    for topic in topics:
        top_words.append([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
    cm = CoherenceModel(topics=top_words, texts=texts, dictionary=dictionary, coherence='c_v')
    return cm.get_coherence()


In [9]:
def inst_tfidf(trial:Trial)->TfidfVectorizer:
    params={
        'norm':trial.suggest_categorical('norm',['l1','l2', None]),
        'smooth_idf':trial.suggest_categorical('smooth_idf',[True,False]),
        'sublinear_tf':trial.suggest_categorical('sublinear_tf',[True,False]),
        'stop_words':trial.suggest_categorical('stop_words',[None,'english',list(STOPWORDS)]),
        # 'max_df':trial.suggest_float('max_df',0,1),
        # 'min_df':trial.suggest_float('min_df',0,1),
        'max_features':trial.suggest_categorical('max_features',[None,300,150,100,50]) #can't use int, because of None
        }
    return TfidfVectorizer(**params)
def inst_lda(trial:Trial)->LatentDirichletAllocation:
    params={
        'learning_method':trial.suggest_categorical('learning_method',['batch','online']),
        'learning_decay':trial.suggest_float('learning_decay',0.5,0.9),
        'learning_offset':trial.suggest_float('learning_offset',2,20),
        'max_iter':trial.suggest_int('max_iter',5,20),
        'batch_size':trial.suggest_int('batch_size',5,128),
        'max_doc_update_iter':trial.suggest_int('max_doc_update_iter',0.001,0.1),
        'n_jobs':-1,
        'random_state':randomstate
    }
    return LatentDirichletAllocation(**params)
def inst_pipeTFLDA(trial:Trial)->Pipeline:
    pipeline=Pipeline([
        ('tfidf',inst_tfidf(trial)),
        ('lda',inst_lda(trial))
    ])
    return pipeline

def objective(trial:Trial,x:pd.DataFrame)->float:
    model=inst_pipeTFLDA(trial)
    
    pipe=model.fit(x)
    score=topiccoherencescorer(pipe,x)
    
    

    return score

In [10]:
study=create_study(study_name='description_tfidflda_study',direction='maximize',storage='sqlite:///description_tfidflda_study.db',load_if_exists=True) #TPESampler used as default, no pruning

[I 2024-08-27 19:54:01,221] A new study created in RDB with name: description_tfidflda_study


In [11]:
study.optimize(lambda trial: objective(trial,fulldatadf.description),n_trials=100,n_jobs=-1,show_progress_bar=True)

  0%|          | 0/100 [00:00<?, ?it/s]



[W 2024-08-27 19:54:03,444] Trial 1 failed with parameters: {'norm': 'l1', 'smooth_idf': False, 'sublinear_tf': False, 'stop_words': None, 'max_df': 0.3607755861867905, 'min_df': 0.8797550353659859, 'max_features': 150, 'learning_method': 'online', 'learning_decay': 0.8028828039153562, 'learning_offset': 19.97501518218227, 'max_iter': 20, 'batch_size': 98, 'max_doc_update_iter': 0} because of the following error: ValueError('max_df corresponds to < documents than min_df').
Traceback (most recent call last):
  File "c:\Users\lekle\anaconda3\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\lekle\AppData\Local\Temp\ipykernel_8584\968308381.py", line 1, in <lambda>
    study.optimize(lambda trial: objective(trial,fulldatadf.description),n_trials=100,n_jobs=-1,show_progress_bar=True)
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\lekle\AppData\

ValueError: max_df corresponds to < documents than min_df

In [None]:
joblib.dump(study,cwd.joinpath(f'study_{study.study_name}'))