In [1]:
import pandas as pd
import re

In [2]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
import glob
import PyPDF2

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Latifa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def extract_text_from_pdf(pdf_path):
    """
    Extract full text from a PDF file using PyPDF2.
    Returns a single string with all the text.
    """
    text_chunks = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text_chunks.append(page_text)
    return "\n".join(text_chunks)

In [5]:
# 1. Set the folder containing your PDFs
pdf_folder = "C:/Users/Latifa/Documents/Um6p/Survey/Survey/sci-scraper/PDFs"
    
# 2. Gather all PDF file paths
pdf_paths = glob.glob(os.path.join(pdf_folder, "*.pdf"))
    
# 3. Prepare a list to store data for the DataFrame
data = []
    
for pdf_path in pdf_paths:
    print(f"Reading: {pdf_path}")
    pdf_text = extract_text_from_pdf(pdf_path)
        
    # Create a record (dictionary) for each PDF
    data.append({
        "filename": os.path.basename(pdf_path),
        "full_text": pdf_text
    })
    
# 4. Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(data)

Reading: C:/Users/Latifa/Documents/Um6p/Survey/Survey/sci-scraper/PDFs\0019.pdf
Reading: C:/Users/Latifa/Documents/Um6p/Survey/Survey/sci-scraper/PDFs\1-s2.0-S0167404822003972-main.pdf
Reading: C:/Users/Latifa/Documents/Um6p/Survey/Survey/sci-scraper/PDFs\1-s2.0-S016740482400172X-main.pdf
Reading: C:/Users/Latifa/Documents/Um6p/Survey/Survey/sci-scraper/PDFs\1-s2.0-S0306261922009850-main.pdf
Reading: C:/Users/Latifa/Documents/Um6p/Survey/Survey/sci-scraper/PDFs\1-s2.0-S0957417421003377-main.pdf
Reading: C:/Users/Latifa/Documents/Um6p/Survey/Survey/sci-scraper/PDFs\1-s2.0-S0957417422015044-main.pdf
Reading: C:/Users/Latifa/Documents/Um6p/Survey/Survey/sci-scraper/PDFs\17348-Article Text-20842-1-2-20210518.pdf
Reading: C:/Users/Latifa/Documents/Um6p/Survey/Survey/sci-scraper/PDFs\2108.03803v2.pdf
Reading: C:/Users/Latifa/Documents/Um6p/Survey/Survey/sci-scraper/PDFs\2205.09362v2.pdf
Reading: C:/Users/Latifa/Documents/Um6p/Survey/Survey/sci-scraper/PDFs\2212.02705v5.pdf
Reading: C:/Users/

In [6]:
print(f"DataFrame shape: {df.shape}")
df.head()

DataFrame shape: (30, 2)


Unnamed: 0,filename,full_text
0,0019.pdf,Decentralized Anomaly Detection in Cooperative...
1,1-s2.0-S0167404822003972-main.pdf,Computers & Security 124 (2023) 103005 \nCont...
2,1-s2.0-S016740482400172X-main.pdf,Computers & Security 142 (2024) 103871\nAvaila...
3,1-s2.0-S0306261922009850-main.pdf,Applied Energy 324 (2022) 119688\nAvailable on...
4,1-s2.0-S0957417421003377-main.pdf,Expert Systems With Applications 176 (2021) 11...


In [7]:
df['full_text'][1]

'Computers  & Security 124 (2023) 103005 \nContents  lists available  at ScienceDirect  \nComputers  & Security  \njournal  homepage:  www.elsevier.com/locate/cose  \nOne4All:  Manipulate  one agent  to poison  the cooperative  multi-agent  \nreinforcement  learning  \nHaibin  Zheng  a , b , Xiaohao  Li b , Jinyin  Chen a , b , Jianfeng  Dong  c , d , Yan Zhang  e , \nChangting  Lin e , f , ∗\na Institute of Cyberspace  Security, Zhejiang University  of Technology,  Hangzhou  310023, China \nb College of Information  Engineering,  Zhejiang University  of Technology,  Hangzhou  310023, China \nc College of Computer  and Information  Engineering,  Zhejiang Gongshang  University,  Hangzhou  310018, China \nd State Key Laboratory  of Information  Security, Institute of Information  Engineering,  Chinese Academy  of Sciences, Beijing 10 0 093, China \ne Binjiang Institute of Zhejiang University,  Hangzhou  310053, China \nf Zhejiang University,  Hangzhou  310 0 014, China \na r t i c l e i 

In [8]:
# Download stopwords
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

In [9]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize and remove stopwords
    words = [word for word in text.split() if word not in stop_words]
    # Remove short words
    words = [word for word in words if len(word) > 2]
    # Lemmatize words
    words = [lemmatizer.lemmatize(word) for word in words]
    # Join words back into a string
    return ' '.join(words)

In [10]:
texts = df['full_text'].tolist()

preprocessed_text = [preprocess_text(text) for text in texts]

In [11]:
from sentence_transformers import SentenceTransformer

# Embed the preprocessed texts
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(preprocessed_text, show_progress_bar=True)

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.95s/it]


In [12]:
from umap import UMAP

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

In [13]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='leaf', prediction_data=True)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(stop_words="english") # Vectorize the sentences
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [15]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

# KeyBERT
keybert_model = KeyBERTInspired()

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
}

In [16]:
topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,
  ctfidf_model=ctfidf_model,

  # Hyperparameters
  nr_topics="auto",
  min_topic_size=10,
  top_n_words=10,
  verbose=True
)

topics, probs = topic_model.fit_transform(preprocessed_text, embeddings)

2025-01-07 19:48:14,559 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-01-07 19:48:19,763 - BERTopic - Dimensionality - Completed ✓
2025-01-07 19:48:19,763 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-01-07 19:48:19,765 - BERTopic - Cluster - Completed ✓
2025-01-07 19:48:19,766 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-01-07 19:48:23,099 - BERTopic - Representation - Completed ✓
2025-01-07 19:48:23,101 - BERTopic - Topic reduction - Reducing number of topics
2025-01-07 19:48:26,497 - BERTopic - Topic reduction - Reduced number of topics from 7 to 7


In [17]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,8,-1_cmarl_reward_attack_policy,"[cmarl, reward, attack, policy, robust, rmaac,...","[reinforcement, multiagent, agent, adversarial...","[cmarl, rmaac, agent, adversary, trigger, rein...",[computer security content list available scie...
1,0,7,0_message_communication_vector_neural,"[message, communication, vector, neural, outpu...","[multiagent, reinforcement, cooperative, agent...","[message, communication, agent, cooperative, l...",[emergence adversarial communication multiagen...
2,1,5,1_agent_set_time_problem,"[agent, set, time, problem, algorithm, model, ...","[multiagent, strategy, agent, learning, oppone...","[agent, algorithm, task, node, value, fig, pre...",[entropy article improved approach towards mul...
3,2,3,2_defense_defender_network_attacker,"[defense, defender, network, attacker, securit...","[attackdefense, reinforcement, multiagent, str...","[defender, security, game, strategy, ddpg, int...",[expert system application available online ma...
4,3,3,3_adversarial_attack_energy_subplay,"[adversarial, attack, energy, subplay, victim,...","[adversarial, attack, multiagent, reinforcemen...","[adversarial, attack, training, defence, adver...",[applied energy available online august elsevi...
5,4,2,4_ami_victim_influence_timestep,"[ami, victim, influence, timestep, attack, tra...","[adversarial, attacking, reinforcement, attack...","[ami, timestep, attack, adversarial, adversary...",[model method trainingtime attack cooperative ...
6,5,2,5_attack_ihs_attacker_poisoning,"[attack, ihs, attacker, poisoning, reward, lea...","[reinforcement, agent, strategy, reward, bandi...","[attack, ihs, poisoning, reward, target, strat...",[rewardpoisoning attack offline multiagent rei...


In [18]:
topic_labels = topic_model.generate_topic_labels(nr_words=5,
                                                 topic_prefix=False,
                                                 word_length=30,
                                                 separator=", ")

topic_model.set_topic_labels(topic_labels)

In [19]:
topic_df = topic_model.get_topic_info()
topic_df 

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,MMR,Representative_Docs
0,-1,8,-1_cmarl_reward_attack_policy,"cmarl, reward, attack, policy, robust","[cmarl, reward, attack, policy, robust, rmaac,...","[reinforcement, multiagent, agent, adversarial...","[cmarl, rmaac, agent, adversary, trigger, rein...",[computer security content list available scie...
1,0,7,0_message_communication_vector_neural,"message, communication, vector, neural, output","[message, communication, vector, neural, outpu...","[multiagent, reinforcement, cooperative, agent...","[message, communication, agent, cooperative, l...",[emergence adversarial communication multiagen...
2,1,5,1_agent_set_time_problem,"agent, set, time, problem, algorithm","[agent, set, time, problem, algorithm, model, ...","[multiagent, strategy, agent, learning, oppone...","[agent, algorithm, task, node, value, fig, pre...",[entropy article improved approach towards mul...
3,2,3,2_defense_defender_network_attacker,"defense, defender, network, attacker, security","[defense, defender, network, attacker, securit...","[attackdefense, reinforcement, multiagent, str...","[defender, security, game, strategy, ddpg, int...",[expert system application available online ma...
4,3,3,3_adversarial_attack_energy_subplay,"adversarial, attack, energy, subplay, victim","[adversarial, attack, energy, subplay, victim,...","[adversarial, attack, multiagent, reinforcemen...","[adversarial, attack, training, defence, adver...",[applied energy available online august elsevi...
5,4,2,4_ami_victim_influence_timestep,"ami, victim, influence, timestep, attack","[ami, victim, influence, timestep, attack, tra...","[adversarial, attacking, reinforcement, attack...","[ami, timestep, attack, adversarial, adversary...",[model method trainingtime attack cooperative ...
6,5,2,5_attack_ihs_attacker_poisoning,"attack, ihs, attacker, poisoning, reward","[attack, ihs, attacker, poisoning, reward, lea...","[reinforcement, agent, strategy, reward, bandi...","[attack, ihs, poisoning, reward, target, strat...",[rewardpoisoning attack offline multiagent rei...


In [20]:
topic_model.get_topic(2, full=True)

{'Main': [('defense', np.float64(0.6020999085855888)),
  ('defender', np.float64(0.49594668246594026)),
  ('network', np.float64(0.4734844034813787)),
  ('attacker', np.float64(0.45157013123164974)),
  ('security', np.float64(0.40680688727886816)),
  ('game', np.float64(0.4000879563415393)),
  ('strategy', np.float64(0.38674017251154263)),
  ('mobile', np.float64(0.3809066185318129)),
  ('ddpg', np.float64(0.37136804015942737)),
  ('learning', np.float64(0.37039295858061166))],
 'KeyBERT': [('attackdefense', np.float32(0.40934637)),
  ('reinforcement', np.float32(0.35304734)),
  ('multiagent', np.float32(0.35108668)),
  ('strategy', np.float32(0.3316946)),
  ('defender', np.float32(0.31340653)),
  ('security', np.float32(0.3043015)),
  ('ddpg', np.float32(0.289494)),
  ('attack', np.float32(0.28894868)),
  ('hypergame', np.float32(0.28670627)),
  ('agent', np.float32(0.2699737))],
 'MMR': [('defender', np.float64(0.49594668246594026)),
  ('security', np.float64(0.40680688727886816)),
 