In [2]:
import pathlib2 as pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

from nltk.corpus import stopwords
# import nltk
# nltk.download('stopwords')

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import tokenize

from langdetect import detect, DetectorFactory

import joblib
from optuna import Trial, create_study


random_state=313
DetectorFactory.seed = random_state

In [3]:
datapath=pathlib.Path.cwd().joinpath('lens_org_data_files')
datafiles=[x for x in datapath.iterdir()]
models_path=pathlib.Path.cwd().joinpath('models')

In [4]:
def clean_text(text):
    clean = re.sub('<jats:[^>]*>', '', text)  # Entfernt alle jats Tags
    return clean

In [5]:
dfs=[]
for file in datafiles:
    df=pd.read_csv(file)
    dfs.append(df)

data=pd.concat(dfs,ignore_index=True)
data=data.dropna(subset=['Title', 'Abstract'])

  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)


In [6]:
data.columns

Index(['Lens ID', 'Title', 'Date Published', 'Publication Year',
       'Publication Type', 'Source Title', 'ISSNs', 'Publisher',
       'Source Country', 'Author/s', 'Abstract', 'Volume', 'Issue Number',
       'Start Page', 'End Page', 'Fields of Study', 'Keywords', 'MeSH Terms',
       'Chemicals', 'Funding', 'Source URLs', 'External URL', 'PMID', 'DOI',
       'Microsoft Academic ID', 'PMCID', 'Citing Patents Count', 'References',
       'Citing Works Count', 'Is Open Access', 'Open Access License',
       'Open Access Colour'],
      dtype='object')

In [7]:
print(data.Title.isna().sum())
print(data.Abstract.isna().sum())

0
0


In [8]:
data.Abstract[0]

'<jats:p>Day-ahead solar irradiance forecasting is carried out using data from a tropical environment, Singapore. The performance of the weather research and forecasting (WRF) model is evaluated. We explore various combinations of physics configuration setups in the WRF model and propose a setup for the tropical regions. The WRF model is benchmarked using persistence and two seasonal time series models, namely, the exponential smoothing (ETS) and seasonal autoregressive integrated moving average (SARIMA) models. It is shown that the WRF model outperforms the SARIMA model and achieves accuracies comparable with persistence and ETS models. Persistence, ETS, and WRF models have relative root mean square errors (rRMSE) of about 55–57%. Furthermore, we find that by combining the forecasting outputs of WRF and ETS models, errors can be reduced to 49%.</jats:p>'

In [9]:
languages=[]
for x in data.Title:
    try:
        l=detect(x)
    except:
        l=False
    languages.append(l)
data['title_languages']=languages

languages=[]
for x in data.Abstract:
    try:
        l=detect(x)
    except:
        l=False
    languages.append(l)
data['abstract_languages']=languages


In [10]:
# data.title_languages.value_counts()

In [11]:
data=data[(data['title_languages']=='en') & (data['abstract_languages']=='en')]
len(data)

261492

In [12]:
data.Title.head(5)

0     Day-Ahead Solar Irradiance Forecasting in a Tr...
2     Assessment of a falling solid particle receive...
4     Boron-Doped Silicon Diatom Frustules as a Phot...
5     Renewable energy management through microgrid ...
11    Thermal and electrical analysis of a linear pa...
Name: Title, dtype: object

In [13]:
print(len(data))
print(len(data['Source Country'].unique()))
# data_sourceCountry_valueCounts=data['Source Country'].value_counts()
data_sourceCountry_valueCounts=data[data.groupby('Source Country')['Source Country'].transform('count')>1000]
print(len(data_sourceCountry_valueCounts['Source Country'].value_counts()))
data_sourceCountry_valueCounts['Source Country'].value_counts()

261492
81
8


Source Country
United Kingdom    44716
United States     35457
Germany           18078
Switzerland       12517
Netherlands       11864
China              1796
Egypt              1337
India              1116
Name: count, dtype: int64

In [14]:
data['Abstract']=data.Abstract.apply(clean_text)
data.to_csv("cleaned_data/cleaned_data.csv",sep=';')

In [15]:
data.Abstract.head()

0     Day-ahead solar irradiance forecasting is carr...
2     An advanced computational fluid dynamics (CFD)...
4     An effective solar-powered silicon device for ...
5     Abstract In this study, an isolated microgrid ...
11    Thermal and electrical analysis of a linear pa...
Name: Abstract, dtype: object

In [16]:
data=pd.read_csv("cleaned_data/cleaned_data.csv",sep=';')
data.head()

  data=pd.read_csv("cleaned_data/cleaned_data.csv",sep=';')


Unnamed: 0.1,Unnamed: 0,Lens ID,Title,Date Published,Publication Year,Publication Type,Source Title,ISSNs,Publisher,Source Country,...,Microsoft Academic ID,PMCID,Citing Patents Count,References,Citing Works Count,Is Open Access,Open Access License,Open Access Colour,title_languages,abstract_languages
0,0,000-008-268-360-004,Day-Ahead Solar Irradiance Forecasting in a Tr...,2015-07-27,2015,journal article,Journal of Solar Energy Engineering,01996231; 15288986,ASME International,United States,...,2131632891,,0,001-815-086-488-477; 007-447-979-011-061; 025-...,33,False,,,en,en
1,2,000-042-629-788-452,Assessment of a falling solid particle receive...,,2015,journal article,Solar Energy,0038092x,Elsevier BV,United Kingdom,...,2067658121,,0,002-994-869-371-657; 005-360-904-860-363; 008-...,62,True,,green,en,en
2,4,000-062-946-383-801,Boron-Doped Silicon Diatom Frustules as a Phot...,2015-07-30,2015,journal article,ACS applied materials & interfaces,19448252; 19448244,American Chemical Society (ACS),United States,...,2410807451,,0,004-049-858-705-806; 007-090-004-757-278; 011-...,26,False,,,en,en
3,5,000-063-768-273-718,Renewable energy management through microgrid ...,,2015,journal article,Energy Reports,23524847,Elsevier BV,United Kingdom,...,1193286937,,0,004-077-674-704-399; 005-081-834-054-355; 011-...,79,True,"CC BY, CC BY-NC-ND",gold,en,en
4,11,000-138-869-752-834,Thermal and electrical analysis of a linear pa...,2015-12-01,2015,,,,,,...,3210564331,,0,,0,False,,,en,en


In [17]:
stopwords=stopwords.words('english')
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords, min_df=5, max_df=0.7)
tfidf_vectors = tfidf_vectorizer.fit_transform(data['Title'])
tfidf_vectors.shape

(261492, 18843)

In [18]:
nmf_text_model = NMF(n_components=50, random_state=random_state) # n_components: number of topics
W_text_matrix = nmf_text_model.fit_transform(tfidf_vectors)
H_text_matrix = nmf_text_model.components_
np.info(W_text_matrix)

class:  ndarray
shape:  (261492, 50)
strides:  (400, 8)
itemsize:  8
aligned:  True
contiguous:  True
fortran:  False
data pointer: 0x29b6d331040
byteorder:  little
byteswap:  False
type: float64


In [19]:
np.info(H_text_matrix) 

class:  ndarray
shape:  (50, 18843)
strides:  (8, 400)
itemsize:  8
aligned:  True
contiguous:  False
fortran:  True
data pointer: 0x29b0ed9b040
byteorder:  little
byteswap:  False
type: float64


In [20]:
def display_topics(model, features, no_top_words=5):
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1] # invert sort order
        print("\nTopic %02d" % topic)
        for i in range(0, no_top_words):
            print("  %s (%2.2f)" % (features[largest[i]], abs(words[largest[i]]*100.0/total)))
            
display_topics(nmf_text_model, tfidf_vectorizer.get_feature_names_out())


Topic 00
  system (34.67)
  integrated (2.39)
  tracking (1.94)
  monitoring (1.79)
  heating (1.64)

Topic 01
  perovskite (14.46)
  cells (9.63)
  stability (1.81)
  stable (1.76)
  inverted (1.29)

Topic 02
  sub (35.23)
  co (2.06)
  tio (1.49)
  cu (1.07)
  se (0.96)

Topic 03
  energy (17.77)
  renewable (8.40)
  sources (2.29)
  harvesting (1.33)
  resources (0.94)

Topic 04
  heat (11.86)
  transfer (4.41)
  pump (2.38)
  air (1.97)
  experimental (1.66)

Topic 05
  power (26.96)
  plant (4.84)
  plants (3.10)
  maximum (2.73)
  point (2.42)

Topic 06
  based (28.61)
  iot (1.21)
  method (1.02)
  fuzzy (0.98)
  algorithm (0.97)

Topic 07
  photovoltaic (29.68)
  modules (2.49)
  module (1.86)
  panels (1.61)
  maximum (1.10)

Topic 08
  storage (26.52)
  energy (5.50)
  battery (5.45)
  conversion (1.36)
  optimal (1.35)

Topic 09
  efficient (18.11)
  highly (6.73)
  stable (4.77)
  polymer (1.04)
  engineering (0.78)

Topic 10
  grid (23.82)
  connected (11.70)
  inverter (

In [21]:
# Find out how “big” the topics are, i.e., how many documents could be assigned mainly to each topic. 
# This can be calculated using the document-topic matrix and 
# summing the individual topic contributions over all documents.
# Normalizing them with the total sum and multiplying by 100 gives a percentage value:

print(W_text_matrix.sum(axis=0)/W_text_matrix.sum()*100.0)

[0.87095357 2.62678808 2.23239244 5.36652753 1.07878288 3.16514028
 1.47759932 2.5604315  1.69553754 1.24198854 1.36224818 1.90235875
 2.59216159 1.72222155 1.29801461 1.69612975 1.07877677 1.7063228
 1.70342245 1.3692708  0.7727497  1.30684719 1.53211043 2.09078475
 1.09640296 1.23488685 2.57981434 1.47800615 1.29636349 1.61959611
 1.14839945 1.73870892 1.62590339 1.51848556 1.66359807 5.35555311
 2.21201659 1.17579536 3.12923051 1.65554742 2.93536575 2.67188662
 2.17743481 1.85612594 2.37852597 3.53116011 2.31022786 2.14771681
 2.48885462 2.52483222]


In [22]:
feature_names = tfidf_vectorizer.get_feature_names_out()
n_top_words = 5

# Dokument-Topic-Zuordnung anzeigen
# for doc_idx, topic_dist in enumerate(W_text_matrix):
#     print(f"Dokument {doc_idx}:")
#     print(" ".join([f"Topic {i}: {topic_dist[i]:.2f}" for i in np.argsort(topic_dist)[::-1]]))

In [23]:
# Gensim Dictionary und Corpus erstellen
texts = [doc.split() for doc in data.Title.to_list()]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Topics in Gensim-Format umwandeln
topics = [[feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] for topic in H_text_matrix]

# Kohärenzmodell erstellen
coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Topic Kohärenz: {coherence_score:.2f}")

Topic Kohärenz: 0.73


In [24]:
# X = tfidf_vectors
# # Elbow-Methode
# errors = []
# for n_topics in range(1, 51):
#     nmf = NMF(n_components=n_topics, random_state=random_state)
#     W = nmf.fit_transform(X)
#     H = nmf.components_
#     reconstruction_error = nmf.reconstruction_err_
#     errors.append(reconstruction_error)

# plt.plot(range(1, 51), errors, marker='o')
# plt.xlabel('Anzahl der Topics')
# plt.ylabel('Rekonstruktionsfehler')
# plt.title('Elbow-Methode zur Bestimmung der optimalen Anzahl der Topics')
# plt.show()

In [None]:
# stopwords2=stopwords.words('english')
# stopwords3=list(STOPWORDS)

AttributeError: 'list' object has no attribute 'words'

In [26]:

studyname='topicmodeling_nmf_titles'
thismodelpath=models_path.joinpath(f'{studyname}')
try: 
    thismodelpath.mkdir(exist_ok=False)
except FileExistsError:
    print('Directory already exists')


def inst_tfidf(trial:Trial)->TfidfVectorizer:
    params={
        'stop_words':'english',
        'tokenizer':tokenize,
        # 'max_features':trial.suggest_categorical('max_features',[None,1000,5000,10000])
        'max_df':trial.suggest_float('max_df',0.5,1),
        'min_df':trial.suggest_int('min_df',50,100)
    }
    return TfidfVectorizer(**params)

def inst_nmf(trial:Trial)->NMF:
    params={
        'random_state':random_state,
        'n_components':trial.suggest_categorical('n_components',[25,50,100])
    }
    return NMF(**params)
def inst_pipe_nmf(trial:Trial)->Pipeline:
    pipeline=Pipeline([
        ('tfidf',inst_tfidf(trial)),
        ('nmf',inst_nmf(trial))
    ])
    return pipeline
def objective(trial:Trial,x:pd.DataFrame)->float:
    model=inst_pipe_nmf(trial)
    # print(type(model))
    fitmodel=model.fit(x)
    W_text_matrix = fitmodel.transform(x)
    H_text_matrix = fitmodel.named_steps['nmf'].components_
    feature_names = fitmodel.named_steps['tfidf'].get_feature_names_out()
    n_top_words = 5
    texts=[doc.split() for doc in x.to_list()]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    topics = [[feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] for topic in H_text_matrix]

    coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    return coherence_score


storage=thismodelpath.joinpath(f"{studyname}.db")
if storage.exists():
    storage.unlink()
else:
    print("No sqlite db found")

study=create_study(study_name=studyname,direction='maximize',storage=f'sqlite:///{storage.as_posix()}',load_if_exists=False) #TPESampler used as default, no pruning
study.optimize(lambda trial: objective(trial,data['Title']),n_trials=5,n_jobs=1,show_progress_bar=True)
joblib.dump(study,thismodelpath.joinpath(f'study_{study.study_name}.pkl').as_posix())

model=inst_pipe_nmf(study.best_trial)
fitpipe=model.fit(data['Title'])
joblib.dump(fitpipe,thismodelpath.joinpath(f'{studyname}.pkl').as_posix())

No sqlite db found


[I 2025-03-13 14:02:09,944] A new study created in RDB with name: topicmodeling_nmf_titles


  0%|          | 0/5 [00:00<?, ?it/s]



[I 2025-03-13 14:06:00,917] Trial 0 finished with value: 0.6397847363938021 and parameters: {'max_df': 0.650455351094064, 'min_df': 77, 'n_components': 100}. Best is trial 0 with value: 0.6397847363938021.




[I 2025-03-13 14:07:14,484] Trial 1 finished with value: 0.7192827811680982 and parameters: {'max_df': 0.766598237957762, 'min_df': 65, 'n_components': 25}. Best is trial 1 with value: 0.7192827811680982.




[I 2025-03-13 14:08:40,444] Trial 2 finished with value: 0.7325877899803318 and parameters: {'max_df': 0.5979761112443638, 'min_df': 93, 'n_components': 25}. Best is trial 2 with value: 0.7325877899803318.




[I 2025-03-13 14:11:27,259] Trial 3 finished with value: 0.6586225875876702 and parameters: {'max_df': 0.6182213394056657, 'min_df': 99, 'n_components': 100}. Best is trial 2 with value: 0.7325877899803318.




[I 2025-03-13 14:12:27,924] Trial 4 finished with value: 0.7253924349232928 and parameters: {'max_df': 0.7245133175751064, 'min_df': 76, 'n_components': 25}. Best is trial 2 with value: 0.7325877899803318.




['c:/Users/lekle/Projects_Code/HA_TextAnalytics/models/topicmodeling_nmf_titles/topicmodeling_nmf_titles.pkl']

In [27]:
studyname='topicmodeling_nmf_abstracts'
thismodelpath=models_path.joinpath(f'{studyname}')
try: 
    thismodelpath.mkdir(exist_ok=False)
except FileExistsError:
    print('Directory already exists')


def inst_tfidf(trial:Trial)->TfidfVectorizer:
    params={
        'stop_words': 'english',#trial.suggest_categorical('stop_words',['english',stopwords.words('english'),list(STOPWORDS)]),
        'tokenizer':tokenize,
        # 'max_features':trial.suggest_categorical('max_features',[None,1000,5000,10000])
        'max_df':trial.suggest_float('max_df',0.5,1),
        'min_df':trial.suggest_int('min_df',50,100)
    }
    return TfidfVectorizer(**params)

def inst_nmf(trial:Trial)->NMF:
    params={
        'random_state':random_state,
        'n_components':trial.suggest_categorical('n_components',[25,50,100])
    }
    return NMF(**params)
def inst_pipe_nmf(trial:Trial)->Pipeline:
    pipeline=Pipeline([
        ('tfidf',inst_tfidf(trial)),
        ('nmf',inst_nmf(trial))
    ])
    return pipeline

def objective(trial:Trial,x:pd.DataFrame)->float:
    model=inst_pipe_nmf(trial)
    fitmodel=model.fit(x)
    W_text_matrix = fitmodel.transform(x)
    H_text_matrix = fitmodel.named_steps['nmf'].components_
    feature_names = fitmodel.named_steps['tfidf'].get_feature_names_out()
    n_top_words = 5
    texts=[doc.split() for doc in x.to_list()]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    topics = [[feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] for topic in H_text_matrix]

    coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    return coherence_score


storage=thismodelpath.joinpath(f"{studyname}.db")
if storage.exists():
    storage.unlink()
else:
    print("No sqlite db found")

study=create_study(study_name=studyname,direction='maximize',storage=f'sqlite:///{storage.as_posix()}',load_if_exists=False) #TPESampler used as default, no pruning
study.optimize(lambda trial: objective(trial,data['Abstract']),n_trials=5,n_jobs=1,show_progress_bar=True)
joblib.dump(study,thismodelpath.joinpath(f'study_{study.study_name}.pkl').as_posix())

model=inst_pipe_nmf(study.best_trial)
fitpipe=model.fit(data['Abstract'])
joblib.dump(fitpipe,thismodelpath.joinpath(f'{studyname}.pkl').as_posix())

[I 2025-03-13 14:13:15,379] A new study created in RDB with name: topicmodeling_nmf_abstracts


No sqlite db found


  0%|          | 0/5 [00:00<?, ?it/s]



[I 2025-03-13 14:20:16,283] Trial 0 finished with value: 0.6354691778653848 and parameters: {'max_df': 0.6891507577945581, 'min_df': 95, 'n_components': 25}. Best is trial 0 with value: 0.6354691778653848.




[I 2025-03-13 14:26:01,819] Trial 1 finished with value: 0.6345680231546599 and parameters: {'max_df': 0.6734960742597685, 'min_df': 50, 'n_components': 25}. Best is trial 0 with value: 0.6354691778653848.




[I 2025-03-13 14:35:40,515] Trial 2 finished with value: 0.6891083840554937 and parameters: {'max_df': 0.7187419836776672, 'min_df': 69, 'n_components': 50}. Best is trial 2 with value: 0.6891083840554937.




[I 2025-03-13 14:43:19,241] Trial 3 finished with value: 0.6730490971147014 and parameters: {'max_df': 0.7910111282722723, 'min_df': 92, 'n_components': 50}. Best is trial 2 with value: 0.6891083840554937.




[I 2025-03-13 14:49:36,653] Trial 4 finished with value: 0.6451135532928111 and parameters: {'max_df': 0.5613341821704063, 'min_df': 88, 'n_components': 25}. Best is trial 2 with value: 0.6891083840554937.




['c:/Users/lekle/Projects_Code/HA_TextAnalytics/models/topicmodeling_nmf_abstracts/topicmodeling_nmf_abstracts.pkl']

# LSA

In [28]:

studyname='topicmodeling_lsa_titles'
thismodelpath=models_path.joinpath(f'{studyname}')
try: 
    thismodelpath.mkdir(exist_ok=False)
except FileExistsError:
    print('Directory already exists')


def inst_tfidf(trial:Trial)->TfidfVectorizer:
    params={
        'stop_words':'english',
        'tokenizer':tokenize,
        # 'max_features':trial.suggest_categorical('max_features',[None,1000,5000,10000])
        'max_df':trial.suggest_float('max_df',0.5,1),
        'min_df':trial.suggest_int('min_df',50,100)
    }
    return TfidfVectorizer(**params)

def inst_lsa(trial:Trial)->TruncatedSVD:
    params={
        'random_state':random_state,
        'n_components':trial.suggest_categorical('n_components',[25,50,100])
    }
    return TruncatedSVD(**params)
def inst_pipe_lsa(trial:Trial)->Pipeline:
    pipeline=Pipeline([
        ('tfidf',inst_tfidf(trial)),
        ('lsa',inst_lsa(trial))
    ])
    return pipeline
def objective(trial:Trial,x:pd.DataFrame)->float:
    model=inst_pipe_lsa(trial)
    # print(type(model))
    fitmodel=model.fit(x)
    W_text_matrix = fitmodel.transform(x)
    H_text_matrix = fitmodel.named_steps['lsa'].components_
    feature_names = fitmodel.named_steps['tfidf'].get_feature_names_out()
    n_top_words = 5
    texts=[doc.split() for doc in x.to_list()]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    topics = [[feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] for topic in H_text_matrix]

    coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    return coherence_score


storage=thismodelpath.joinpath(f"{studyname}.db")
if storage.exists():
    storage.unlink()
else:
    print("No sqlite db found")

study=create_study(study_name=studyname,direction='maximize',storage=f'sqlite:///{storage.as_posix()}',load_if_exists=False) #TPESampler used as default, no pruning
study.optimize(lambda trial: objective(trial,data['Title']),n_trials=5,n_jobs=1,show_progress_bar=True)
joblib.dump(study,thismodelpath.joinpath(f'study_{study.study_name}.pkl').as_posix())

model=inst_pipe_lsa(study.best_trial)
fitpipe=model.fit(data['Title'])
joblib.dump(fitpipe,thismodelpath.joinpath(f'{studyname}.pkl').as_posix())


studyname='topicmodeling_lsa_abstracts'
thismodelpath=models_path.joinpath(f'{studyname}')
try: 
    thismodelpath.mkdir(exist_ok=False)
except FileExistsError:
    print('Directory already exists')


def inst_tfidf(trial:Trial)->TfidfVectorizer:
    params={
        'stop_words': 'english',#trial.suggest_categorical('stop_words',['english',stopwords.words('english'),list(STOPWORDS)]),
        'tokenizer':tokenize,
        # 'max_features':trial.suggest_categorical('max_features',[None,1000,5000,10000])
        'max_df':trial.suggest_float('max_df',0.5,1),
        'min_df':trial.suggest_int('min_df',50,100)
    }
    return TfidfVectorizer(**params)

def inst_lsa(trial:Trial)->TruncatedSVD:
    params={
        'random_state':random_state,
        'n_components':trial.suggest_categorical('n_components',[25,50,100])
    }
    return TruncatedSVD(**params)
def inst_pipe_lsa(trial:Trial)->Pipeline:
    pipeline=Pipeline([
        ('tfidf',inst_tfidf(trial)),
        ('lsa',inst_lsa(trial))
    ])
    return pipeline

def objective(trial:Trial,x:pd.DataFrame)->float:
    model=inst_pipe_lsa(trial)
    fitmodel=model.fit(x)
    W_text_matrix = fitmodel.transform(x)
    H_text_matrix = fitmodel.named_steps['lsa'].components_
    feature_names = fitmodel.named_steps['tfidf'].get_feature_names_out()
    n_top_words = 5
    texts=[doc.split() for doc in x.to_list()]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    topics = [[feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] for topic in H_text_matrix]

    coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    return coherence_score


storage=thismodelpath.joinpath(f"{studyname}.db")
if storage.exists():
    storage.unlink()
else:
    print("No sqlite db found")

study=create_study(study_name=studyname,direction='maximize',storage=f'sqlite:///{storage.as_posix()}',load_if_exists=False) #TPESampler used as default, no pruning
study.optimize(lambda trial: objective(trial,data['Abstract']),n_trials=5,n_jobs=1,show_progress_bar=True)
joblib.dump(study,thismodelpath.joinpath(f'study_{study.study_name}.pkl').as_posix())

model=inst_pipe_lsa(study.best_trial)
fitpipe=model.fit(data['Abstract'])
joblib.dump(fitpipe,thismodelpath.joinpath(f'{studyname}.pkl').as_posix())

[I 2025-03-13 14:55:27,453] A new study created in RDB with name: topicmodeling_lsa_titles


No sqlite db found


  0%|          | 0/5 [00:00<?, ?it/s]



[I 2025-03-13 14:56:14,566] Trial 0 finished with value: 0.5047244113156633 and parameters: {'max_df': 0.6783536855114765, 'min_df': 96, 'n_components': 100}. Best is trial 0 with value: 0.5047244113156633.
[I 2025-03-13 14:56:58,195] Trial 1 finished with value: 0.5515840099930707 and parameters: {'max_df': 0.6463086974169041, 'min_df': 62, 'n_components': 50}. Best is trial 1 with value: 0.5515840099930707.
[I 2025-03-13 14:57:42,345] Trial 2 finished with value: 0.5449357231243231 and parameters: {'max_df': 0.5387949524522034, 'min_df': 52, 'n_components': 50}. Best is trial 1 with value: 0.5515840099930707.
[I 2025-03-13 14:58:26,605] Trial 3 finished with value: 0.5480249769383991 and parameters: {'max_df': 0.8508103793346755, 'min_df': 63, 'n_components': 50}. Best is trial 1 with value: 0.5515840099930707.
[I 2025-03-13 14:59:09,963] Trial 4 finished with value: 0.6100544446426033 and parameters: {'max_df': 0.934718896433256, 'min_df': 96, 'n_components': 25}. Best is trial 4 wi

[I 2025-03-13 14:59:16,577] A new study created in RDB with name: topicmodeling_lsa_abstracts


No sqlite db found


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-03-13 15:03:55,193] Trial 0 finished with value: 0.4119163491540576 and parameters: {'max_df': 0.8747774290530379, 'min_df': 74, 'n_components': 100}. Best is trial 0 with value: 0.4119163491540576.
[I 2025-03-13 15:08:32,764] Trial 1 finished with value: 0.4273542777048634 and parameters: {'max_df': 0.6940911864504777, 'min_df': 74, 'n_components': 100}. Best is trial 1 with value: 0.4273542777048634.
[I 2025-03-13 15:13:03,952] Trial 2 finished with value: 0.4214871589124367 and parameters: {'max_df': 0.9197644916836246, 'min_df': 54, 'n_components': 50}. Best is trial 1 with value: 0.4273542777048634.
[I 2025-03-13 15:17:37,991] Trial 3 finished with value: 0.43069358848556516 and parameters: {'max_df': 0.8348404247092509, 'min_df': 92, 'n_components': 50}. Best is trial 3 with value: 0.43069358848556516.
[I 2025-03-13 15:22:04,816] Trial 4 finished with value: 0.44802853094494954 and parameters: {'max_df': 0.9992828269461216, 'min_df': 76, 'n_components': 25}. Best is trial

['c:/Users/lekle/Projects_Code/HA_TextAnalytics/models/topicmodeling_lsa_abstracts/topicmodeling_lsa_abstracts.pkl']

# LDA

In [29]:
studyname='topicmodeling_lda_titles'
thismodelpath=models_path.joinpath(f'{studyname}')
try: 
    thismodelpath.mkdir(exist_ok=False)
except FileExistsError:
    print('Directory already exists')


def inst_tfidf(trial:Trial)->TfidfVectorizer:
    params={
        'stop_words':'english',
        'tokenizer':tokenize,
        # 'max_features':trial.suggest_categorical('max_features',[None,1000,5000,10000])
        'max_df':trial.suggest_float('max_df',0.5,1),
        'min_df':trial.suggest_int('min_df',50,100)
    }
    return TfidfVectorizer(**params)

def inst_lda(trial:Trial)->LatentDirichletAllocation:
    params={
        'random_state':random_state,
        'n_components':trial.suggest_categorical('n_components',[25,50,100])
    }
    return LatentDirichletAllocation(**params)
def inst_pipe_lda(trial:Trial)->Pipeline:
    pipeline=Pipeline([
        ('tfidf',inst_tfidf(trial)),
        ('lda',inst_lda(trial))
    ])
    return pipeline
def objective(trial:Trial,x:pd.DataFrame)->float:
    model=inst_pipe_lda(trial)
    # print(type(model))
    fitmodel=model.fit(x)
    W_text_matrix = fitmodel.transform(x)
    H_text_matrix = fitmodel.named_steps['lda'].components_
    feature_names = fitmodel.named_steps['tfidf'].get_feature_names_out()
    n_top_words = 5
    texts=[doc.split() for doc in x.to_list()]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    topics = [[feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] for topic in H_text_matrix]

    coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    return coherence_score


storage=thismodelpath.joinpath(f"{studyname}.db")
if storage.exists():
    storage.unlink()
else:
    print("No sqlite db found")

study=create_study(study_name=studyname,direction='maximize',storage=f'sqlite:///{storage.as_posix()}',load_if_exists=False) #TPESampler used as default, no pruning
study.optimize(lambda trial: objective(trial,data['Title']),n_trials=5,n_jobs=1,show_progress_bar=True)
joblib.dump(study,thismodelpath.joinpath(f'study_{study.study_name}.pkl').as_posix())

model=inst_pipe_lda(study.best_trial)
fitpipe=model.fit(data['Title'])
joblib.dump(fitpipe,thismodelpath.joinpath(f'{studyname}.pkl').as_posix())


studyname='topicmodeling_lda_abstracts'
thismodelpath=models_path.joinpath(f'{studyname}')
try: 
    thismodelpath.mkdir(exist_ok=False)
except FileExistsError:
    print('Directory already exists')


def inst_tfidf(trial:Trial)->TfidfVectorizer:
    params={
        'stop_words': 'english',#trial.suggest_categorical('stop_words',['english',stopwords.words('english'),list(STOPWORDS)]),
        'tokenizer':tokenize,
        # 'max_features':trial.suggest_categorical('max_features',[None,1000,5000,10000])
        'max_df':trial.suggest_float('max_df',0.5,1),
        'min_df':trial.suggest_int('min_df',50,100)
    }
    return TfidfVectorizer(**params)

def inst_lda(trial:Trial)->LatentDirichletAllocation:
    params={
        'random_state':random_state,
        'n_components':trial.suggest_categorical('n_components',[5,10,25,50]) #100 runs into errors
    }
    return LatentDirichletAllocation(**params)
def inst_pipe_lda(trial:Trial)->Pipeline:
    pipeline=Pipeline([
        ('tfidf',inst_tfidf(trial)),
        ('lda',inst_lda(trial))
    ])
    return pipeline

def objective(trial:Trial,x:pd.DataFrame)->float:
    model=inst_pipe_lda(trial)
    fitmodel=model.fit(x)
    W_text_matrix = fitmodel.transform(x)
    H_text_matrix = fitmodel.named_steps['lda'].components_
    feature_names = fitmodel.named_steps['tfidf'].get_feature_names_out()
    n_top_words = 5
    texts=[doc.split() for doc in x.to_list()]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    topics = [[feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] for topic in H_text_matrix]

    coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    return coherence_score


storage=thismodelpath.joinpath(f"{studyname}.db")
if storage.exists():
    storage.unlink()
else:
    print("No sqlite db found")

study=create_study(study_name=studyname,direction='maximize',storage=f'sqlite:///{storage.as_posix()}',load_if_exists=False) #TPESampler used as default, no pruning
study.optimize(lambda trial: objective(trial,data['Abstract']),n_trials=5,n_jobs=1,show_progress_bar=True)
joblib.dump(study,thismodelpath.joinpath(f'study_{study.study_name}.pkl').as_posix())

model=inst_pipe_lda(study.best_trial)
fitpipe=model.fit(data['Abstract'])
joblib.dump(fitpipe,thismodelpath.joinpath(f'{studyname}.pkl').as_posix())

[I 2025-03-13 15:23:09,106] A new study created in RDB with name: topicmodeling_lda_titles


No sqlite db found


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-03-13 15:27:38,347] Trial 0 finished with value: 0.5987675005606843 and parameters: {'max_df': 0.7851732899444197, 'min_df': 52, 'n_components': 50}. Best is trial 0 with value: 0.5987675005606843.
[I 2025-03-13 15:32:24,967] Trial 1 finished with value: 0.4652117167236875 and parameters: {'max_df': 0.7404360734215389, 'min_df': 50, 'n_components': 100}. Best is trial 0 with value: 0.5987675005606843.
[I 2025-03-13 15:36:50,202] Trial 2 finished with value: 0.609262706609892 and parameters: {'max_df': 0.6731187135180453, 'min_df': 99, 'n_components': 50}. Best is trial 2 with value: 0.609262706609892.
[I 2025-03-13 15:41:35,635] Trial 3 finished with value: 0.48539497830996614 and parameters: {'max_df': 0.6335546893597586, 'min_df': 65, 'n_components': 100}. Best is trial 2 with value: 0.609262706609892.
[I 2025-03-13 15:46:14,389] Trial 4 finished with value: 0.4750799382484194 and parameters: {'max_df': 0.8832113098747669, 'min_df': 100, 'n_components': 100}. Best is trial 2 

[I 2025-03-13 15:49:43,206] A new study created in RDB with name: topicmodeling_lda_abstracts


No sqlite db found


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-03-13 16:01:54,583] Trial 0 finished with value: 0.6322281547317193 and parameters: {'max_df': 0.7006981379853587, 'min_df': 79, 'n_components': 25}. Best is trial 0 with value: 0.6322281547317193.
[I 2025-03-13 16:14:11,162] Trial 1 finished with value: 0.6361343498235509 and parameters: {'max_df': 0.6467333497441827, 'min_df': 80, 'n_components': 25}. Best is trial 1 with value: 0.6361343498235509.
[I 2025-03-13 16:28:22,359] Trial 2 finished with value: 0.6340936609497568 and parameters: {'max_df': 0.5772748838644186, 'min_df': 65, 'n_components': 50}. Best is trial 1 with value: 0.6361343498235509.
[I 2025-03-13 16:40:39,228] Trial 3 finished with value: 0.5684046594785218 and parameters: {'max_df': 0.765422897691731, 'min_df': 51, 'n_components': 25}. Best is trial 1 with value: 0.6361343498235509.
[I 2025-03-13 16:52:18,397] Trial 4 finished with value: 0.6417774208657946 and parameters: {'max_df': 0.9926183102962507, 'min_df': 73, 'n_components': 10}. Best is trial 4 wit

['c:/Users/lekle/Projects_Code/HA_TextAnalytics/models/topicmodeling_lda_abstracts/topicmodeling_lda_abstracts.pkl']