In [4]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
import unicodedata

import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import itertools
import tqdm

from pprint import pprint

import pyLDAvis
import pyLDAvis.gensim

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

  and should_run_async(code)
[nltk_data] Downloading package punkt to /Users/mavin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mavin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mavin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
df = pd.read_csv("./web_of_sicence_dataset/wof_ds_papers.csv")
df.head(5)

Unnamed: 0,Article Title,Abstract,Publication Date,Publication Year,DOI
0,Determinants of digital well-being,How can people lead fulfilling lives both than...,2024 SEP 20,2024,10.1007/s00146-024-02071-2
1,AI-based lumbar central canal stenosis classif...,ObjectivesThe assessment of lumbar central can...,2024 SEP 20,2024,10.1007/s00330-024-11080-0
2,"Analysis of Depression, Anxiety, Stress Scale ...",This study employs advanced data mining techni...,2024 SEP 19,2024,10.1111/ejed.12778
3,An algorithm for cattle counting in rangeland ...,To effectively address common issues such as c...,2024 SEP 19,2024,10.1049/ipr2.13240
4,Game Theoretic Approach Toward Detection of In...,Numerous researchers have scrutinized evasive ...,2024 SEP 19,2024,10.1002/spy2.467


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7842 entries, 0 to 7841
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Article Title     7842 non-null   object
 1   Abstract          7823 non-null   object
 2   Publication Date  7748 non-null   object
 3   Publication Year  7842 non-null   int64 
 4   DOI               7841 non-null   object
dtypes: int64(1), object(4)
memory usage: 306.5+ KB


### Text Processing

In [7]:
df = df.dropna()
df = df.drop_duplicates()

In [8]:
def normalize_text_for_bow(text, custom_stopwords=None):
    # Convert to lowercase
    text = text.lower()
    
    # Unicode normalization
    text = unicodedata.normalize('NFKD', text)
    
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    if custom_stopwords:
        stop_words.update(custom_stopwords)
    tokens = [token for token in tokens if token not in stop_words]
    
    # Remove short words (optional, adjust as needed)
    tokens = [token for token in tokens if len(token) > 2]
    
    return tokens

In [9]:
custom_stopwords = [
    # General research terms
    'study', 'research', 'paper', 'article', 'journal',
    'analysis', 'method', 'methodology', 'approach',
    'result', 'results', 'finding', 'findings',
    'conclusion', 'conclusions', 'discussion',
    'hypothesis', 'hypotheses', 'theory', 'theoretical',
    'experiment', 'experimental', 'observation', 'observations',

    # Data and statistics
    'data', 'dataset', 'sample', 'population',
    'variable', 'variables', 'factor', 'factors',
    'significant', 'significance', 'correlation',
    'analysis', 'regression', 'model', 'modeling',

    # Publication-related
    'abstract', 'introduction', 'background', 'literature',
    'review', 'method', 'methods', 'materials',
    'procedure', 'procedures', 'protocol', 'protocols',
    'published', 'unpublished', 'preprint',
    'journal', 'publication', 'doi', 'isbn',

    # Common academic phrases
    'et', 'al', 'ie', 'eg', 'cf',
    'respectively', 'thus', 'hence', 'therefore',
    'however', 'moreover', 'furthermore', 'additionally',

    # Time-related
    'year', 'month', 'week', 'day',
    'annual', 'quarterly', 'monthly', 'daily',

    # Measurement and quantity
    'measure', 'measurement', 'quantity', 'amount',
    'level', 'levels', 'degree', 'degrees',
    'rate', 'rates', 'percentage', 'percentages',

    # Generic descriptors
    'high', 'low', 'medium', 'average',
    'large', 'small', 'significant', 'insignificant',

    # Common verbs
    'show', 'shows', 'shown', 'indicate', 'indicates', 'indicated',
    'suggest', 'suggests', 'suggested', 'report', 'reports', 'reported',
    'observe', 'observes', 'observed', 'find', 'finds', 'found',
    'note', 'notes', 'noted', 'present', 'presents', 'presented',

    # Research process
    'collect', 'collected', 'analyze', 'analyzed',
    'investigate', 'investigated', 'examine', 'examined',
    'evaluate', 'evaluated', 'assess', 'assessed',

    # Miscellaneous
    'based', 'using', 'used', 'via', 'through',
    'within', 'between', 'among', 'across',
    'figure', 'table', 'equation', 'section'
]

In [10]:
# Combine Title and Abstract
df['combined_text'] = df['Article Title'] + ' ' + df['Abstract'].fillna('')



# Apply preprocessing to combined text (pass custom stopwords if needed)
df['normalized_tokens'] = df['combined_text'].apply(lambda x: normalize_text_for_bow(x, custom_stopwords))

### Create Bag of Words Representation

In [11]:
texts = df['normalized_tokens']

# Step 1: Create a dictionary representation of the documents
dictionary = Dictionary(texts)

In [12]:
# Step 2: Create a Bag of Words (BoW) corpus
corpus = [dictionary.doc2bow(text) for text in texts]

### LDA

In [13]:
# Build LDA model
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=20,
                     random_state=0,
                     chunksize=100,
                     alpha='asymmetric',
                     eta=0.1,
                     passes=20,
                     per_word_topics=True)

pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.055*"language" + 0.031*"text" + 0.018*"examination" + 0.014*"bias" + '
  '0.013*"emotional" + 0.013*"structured" + 0.012*"llm" + 0.010*"worker" + '
  '0.010*"semantic" + 0.009*"writing"'),
 (1,
  '0.028*"learning" + 0.016*"algorithm" + 0.016*"network" + 0.015*"machine" + '
  '0.012*"feature" + 0.012*"image" + 0.012*"performance" + 0.012*"deep" + '
  '0.010*"proposed" + 0.010*"accuracy"'),
 (2,
  '0.025*"gene" + 0.022*"cell" + 0.012*"drug" + 0.011*"expression" + '
  '0.011*"layer" + 0.010*"protein" + 0.010*"annotation" + 0.009*"breast" + '
  '0.009*"bayesian" + 0.009*"differential"'),
 (3,
  '0.020*"change" + 0.018*"climate" + 0.017*"specie" + 0.017*"mental" + '
  '0.011*"area" + 0.011*"environmental" + 0.010*"global" + 0.009*"nurse" + '
  '0.009*"might" + 0.009*"region"'),
 (4,
  '0.039*"patient" + 0.017*"disease" + 0.014*"risk" + 0.013*"treatment" + '
  '0.012*"clinical" + 0.011*"group" + 0.011*"diagnosis" + 0.010*"age" + '
  '0.010*"outcome" + 0.009*"health"'),
 (5,
  '0.02

In [14]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('nCoherence Score: ', coherence_lda)

nCoherence Score:  0.46715761228385977


In [15]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
p = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
p

  default_term_info = default_term_info.sort_values(
