**Data Preprocessing**

In [None]:
# import the necessary libraries

import pandas as pd
import numpy as np
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score
import re
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the publication table and discard the 'period' column stored in the CSV.
publication = pd.read_csv('eu_pub_publication.csv').drop(columns=['period'])

# Load the institution table and reduce it to unique, complete
# (pubid, eu_nuts_id) pairs: unneeded columns are dropped, exact duplicate
# rows are removed, and rows with a missing value in either column are discarded.
institution = (
    pd.read_csv('eu_pub_institution.csv')
    .drop(columns=['pubyear', 'quant_label', 'period'])
    .loc[:, ['pubid', 'eu_nuts_id']]
    .drop_duplicates()
    .dropna()
)

# Join the two tables on 'pubid' (pandas' default inner join), so a publication
# row is repeated for every region it is linked to.
df = publication.merge(institution, on='pubid')
df.head()

  publication = pd.read_csv('eu_pub_publication.csv')
  institution = pd.read_csv('eu_pub_institution.csv')


Unnamed: 0,pubid,pubyear,quant_label,abstract,itemtitle,eu_nuts_id
0,2010.0,1998,,\r\n<p>1. The histamine H-2 receptor antagonis...,"Pharmacology of JB-9315, a new selective hista...",ES415
1,2012.0,1998,,\r\n<p>1. We have investigated the ability of ...,Focal cerebral ischemia in the mouse: Hypother...,NL327
2,2013.0,1998,,\r\n<p>1. For several years we have been worki...,Pyridazine derivatives XIV. Study of the vasor...,ES111
3,2015.0,1998,,\r\n<p>1. The structural and ionic requirement...,Structural requirements and ionic mechanism of...,UKJ36
4,2019.0,1998,,\r\n<p>1. The effect of taxol on selected lyso...,Activity of lysosomal system in mouse liver af...,PL331


In [4]:
# Keep only publications from 2010 through 2021 (both bounds inclusive).
df = df.loc[df['pubyear'].between(2010, 2021)]
# Evaluated as a quick check of the remaining years; not displayed because it
# is not the last expression of the cell.
df['pubyear'].unique()

# create 'period' column  
def assign_period(pubyear):
    """Map a publication year to its two-year period index.

    Periods are 1 (2010-2011), 2 (2012-2013), 3 (2014-2015), 4 (2016-2017),
    5 (2018-2019) and 6 (2020-2021), with both bounds inclusive.

    Returns the period index, or None when `pubyear` falls in none of the ranges.
    """
    period_bounds = (
        (2010, 2011),
        (2012, 2013),
        (2014, 2015),
        (2016, 2017),
        (2018, 2019),
        (2020, 2021),
    )
    for period, (first_year, last_year) in enumerate(period_bounds, start=1):
        if first_year <= pubyear <= last_year:
            return period
    return None

# Attach the period index as a new column. `df` is a boolean-filtered selection
# of the merged table at this point, and assigning a column into such a
# selection can raise pandas' SettingWithCopyWarning. `DataFrame.assign` builds
# a new frame instead of writing into the selection.
df = df.assign(period=df['pubyear'].apply(assign_period))
df.head()

Unnamed: 0,pubid,pubyear,quant_label,abstract,itemtitle,eu_nuts_id,period
5706114,7256411.0,2010,,"\r\n<p>Background: In the United Kingdom, heal...",The Formal Support Experiences of Mothers of A...,UKM25,1
5706115,7256419.0,2010,,\r\n<p>Background: Understanding how male nurs...,The Influence of Personality Traits and Social...,UKF14,1
5706116,7256433.0,2010,,\r\n<p>Background: Evidence suggests that olde...,The Experience of Applying a Narrative Researc...,UKN04,1
5706117,7256459.0,2010,,\r\n<p>Background: Sex work is receiving incre...,Sex work and the 2010 FIFA World Cup: time for...,BE234,1
5706118,7256460.0,2010,,\r\n<p>Sickle Cell Disorder is a global health...,Psychosocial impact of sickle cell disorder: p...,UKI72,1


In [5]:
# Divide the dataset into quant and non-quant publications.
# Rows whose 'quant_label' is missing compare unequal to 'quant' and therefore
# end up in `non_quant`.
# `.copy()` makes each subset an independent frame, so the column assignments
# below do not write into a selection of `df` (SettingWithCopyWarning).
quant = df[df['quant_label'] == 'quant'].copy()
non_quant = df[df['quant_label'] != 'quant'].copy()

# Remove the literal '\r\n<p>' marker from the abstracts.
# regex=False states explicitly that the pattern is plain text, independent of
# the pandas version's default for `regex`.
quant['abstract'] = quant['abstract'].str.replace('\r\n<p>', '', regex=False)
non_quant['abstract'] = non_quant['abstract'].str.replace('\r\n<p>', '', regex=False)

In [6]:
# Count the distinct publications ('pubid') per NUTS region and rank the
# regions from most to fewest publications.
quant_grouped = (
    quant.groupby('eu_nuts_id')['pubid']
    .nunique()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Take the 100 highest-ranked regions and keep only the quant publications
# that belong to one of them.
top_100 = quant_grouped['eu_nuts_id'].head(100).tolist()
quant_top_100 = quant[quant['eu_nuts_id'].isin(top_100)]
quant_top_100

Unnamed: 0,pubid,pubyear,quant_label,abstract,itemtitle,eu_nuts_id,period
5707039,7280031.0,2010,quant,In this paper we present a new formalism for q...,A new approach to modelling quantum distributi...,UKM34,1
5708613,7308386.0,2010,quant,In the paper it is shown that every physically...,Unification of Two Approaches to Quantum Logic...,PL633,1
5708693,7308665.0,2010,quant,This paper introduces a short survey on recent...,Notes on the Essential System to Acquire Infor...,ITI43,1
5708810,7319734.0,2010,quant,We experimentally demonstrate a detection sche...,Highly Efficient State-Selective Submicrosecon...,DE212,1
5708811,7319734.0,2010,quant,We experimentally demonstrate a detection sche...,Highly Efficient State-Selective Submicrosecon...,DE21H,1
...,...,...,...,...,...,...,...
17570341,46606846.0,2021,quant,Europium sulfide (EuS) thin films are appealin...,Ferromagnetic Europium Sulfide Thin Films: Inf...,DEA51,6
17570398,46606985.0,2021,quant,The discovery of superconductivity in the heav...,Magnetic reshuffling and feedback on supercond...,FR714,6
17570399,46606985.0,2021,quant,The discovery of superconductivity in the heav...,Magnetic reshuffling and feedback on supercond...,CZ010,6
17570400,46606985.0,2021,quant,The discovery of superconductivity in the heav...,Magnetic reshuffling and feedback on supercond...,FR623,6


In [7]:
# Restrict the non-quant publications to the same top-100 regions as the quant
# publications. `top_100` and `quant_top_100` are already built at the end of
# the previous cell from `quant_grouped`, so they are not recomputed here.
non_quant_top_100 = non_quant[non_quant['eu_nuts_id'].isin(top_100)]

**Topic Modeling**

In [None]:
# Custom stop words: generic research vocabulary plus short numeric/chemical/
# unit-like tokens that should not appear in topic representations.
# The raw list below is kept as originally collected (it contains repeats).
custom_stop_words = ["quantum", "determining", "method", "includes", "based", "second", "using", 'showed', 'problem',
                     "study", "non", "entanglement", "qubit", # quantum-related words
                     "12", "14", "15", "2014", "iii", "ii", "10", "1a", "11", "cm", "60", '28', '250', '17',
                     "25", "40", "model", "2005", "ar", "rh", "kv", "sb", "ac", "www", "nm", 'srr', 'xy', 'sigma',
                     "mo", "nm", "18", "ee", "pb", "gev", "mu", "rh", "ln", "a15", "13", "ag", '55', 'effect','nd3',
                     "nm", "qds", "iv", "ag", "rms", "le", "p53", "pl", "fe", "ir", "4abn", "nmr", "ni", '05', '74',
                     "ta", "tio2", "thz", "db", "hz", "iso", "pss", "1310", "nu", "tf", "tf", '33', '350',
                     "km", "tev", "gev", "fb", "tl", "t2dm", "nlc", "dot", "si", "cb7", "mc", "soi", '100',
                     "dots", "pi", "qd", "la", "cdw", "iqp", "ccl4", "no2", "mm", "p1", "bi", "ho", "rs", '00',
                     "qh", "tio2", "nu", "nir", "time", "bm12", "ssi", "u6", "current", "nmr", 'hs', 'sp',
                     "gev", "tot", "1h", "hh", "hi", "qds", "pss", "no2", "zb", "wz", "al", "ge", 'ff', 'der',
                     "ect", "mdd", "95", "girls", "uc", "cpe", "di", "uc", "er3", "new", 's190', 's208', '44',
                     "od", "ch3", "yb", "eu", "tb", "nr", "ope3", "4tp", "mu", "center", "la", '3b', 'eag1',
                     "number", "su", "ch4", "fe", "cr", "ni", "eat", "ws", "spp", "mn2", "mm", 'ss', 'oc',
                     # The original line ended with a stray '' (no comma) that was implicitly
                     # concatenated with 'o3' on the next line; it has been removed.
                     "rf", "li", "2016", "16", "sic", "si", "von", "sept3", "0d", "36", 'igf1r', 'ma', 'mnc',
                     'o3', 'correctly', 'tb', 'te', 'il', 'cd39', 'cd73', 'au', 'ou', 'ml', 'mg', 'puo2', '298',
                     'sec', 'eta', 'ba122', 'al', 'cgm', 'h2s', 'ir', 'sni', '45', '75', 'tio2', '60', 'sp', '4000',
                     'ge', 'si', '5p', '120', '2d', 'random', 'theory', 'li', 'qds', 'qd', 'p3ht', 'mu', '69', '71',
                     'p2', 'gw', '511', '3885', 'yb3', '25', 'sir', '240', 'ro', 't34', '10', '6p', 'bbb', 'c1', 'c3',
                     'nno', 'nlo', 'ag', 'sb', 'naf', 'ga', 'se', 'cu', 'gd', 'pf6', 'dots', 'dat', '4f',
                     'ch3', 'center', 'centre', 'mc3t3', 'e1', '1064', 'jc', 'ga', 'cf3', 'bu', 'zn', 'near',
                     'np', 'zn', 'cp', 'fr', 'nmc', 'nca', 'oco', 'dsb', 'ssb', 'oe', 'mn', '300', '80', 'b12x12',
                     'nh', 'br', 'mw', '77', 'ig', '24', 'sigma', '700', 'ij', '1h', 'approach', '4b', '343',
                     's6', 'ce3', 'advantages', 'children', 'phase', '001', 'ad', 'dy3', '111', '2000', 'matter', '99',
                     'results', 'used', 'methods', 'analysis', 'used', 'associated', 'different', 'argue', 'article',
                     'let', 'research', 'data', 'paper', 'examines', 'prove', 'proposed', 'samples', 'study', 'similar',
                     'compared', 'significantly', 'high', 'publication', 'significant', 'use', 'studies', 'category',
                     'use', 'field', 'group', 'years', '000', '2024', 'field', 'risk', 'non', 'age'
                     ]

# Collapse repeated entries while keeping first-seen order, so the list holds
# each stop word exactly once.
custom_stop_words = list(dict.fromkeys(custom_stop_words))

# Start from scikit-learn's built-in English stop-word collection
default_stop_words = text.ENGLISH_STOP_WORDS

# Merge the built-in collection with the custom words into a single list
all_stop_words = [*default_stop_words.union(custom_stop_words)]

In [None]:
import pandas as pd
from tqdm import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from bertopic.vectorizers import ClassTfidfTransformer

# Create an empty list to store the results (one topic-info frame per region/period)
all_results = []

# Loop-invariant settings, set up once rather than on every iteration.
# Loading the sentence-embedding model does not depend on the documents, so it
# is loaded a single time instead of once per (region, period) pair.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
n_gram_ranges = (1, 1)
nr_topics_options = 10  # value passed to BERTopic's nr_topics (alternatives tried: 5, 10, 15, 20)

# Iterate over each 'eu_nuts_id' and 'period', showing progress with tqdm
for nuts in tqdm(quant_top_100['eu_nuts_id'].unique(), desc="Processing NUTS regions"):
    # Select the region's rows once; the period filter below then works on this
    # smaller frame instead of rescanning the whole dataset for every period.
    region_quant = quant_top_100[quant_top_100['eu_nuts_id'] == nuts]

    for i in range(1, 7):
        filtered_quant = region_quant[region_quant['period'] == i]

        # Abstracts for BERTopic: drop missing values and force every entry to str
        docs = filtered_quant['abstract'].dropna().astype(str).tolist()

        if len(docs) == 0:
            print(f"No valid documents for NUTS region {nuts} and period {i}")
            continue

        # Cap UMAP's n_neighbors at 5 and keep it below the number of documents.
        # Checked before any embedding work so undersized groups are skipped cheaply.
        n_neighbors = min(5, len(docs) - 1)
        if n_neighbors <= 1:  # UMAP requires n_neighbors to be at least 2
            print(f"Skipping NUTS region {nuts} and period {i} due to insufficient documents.")
            continue

        try:
            # Fresh vectorizer and c-TF-IDF objects for every fit; embeddings are
            # precomputed and handed to BERTopic below.
            vectorizer_model = CountVectorizer(stop_words=all_stop_words)
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            embeddings = embedding_model.encode(docs, show_progress_bar=True)

            # NOTE(review): no random_state is passed to UMAP, so topic assignments
            # may differ between runs — confirm whether reproducibility is needed.
            umap_model = UMAP(n_neighbors=n_neighbors, min_dist=0.0, metric='cosine')

            # Minimum number of documents per cluster: 1.5% of the documents, never
            # fewer than 2 (alternatives tried: 1, 1.5, 2, 3%).
            min_topic_sizes = max(2, int(len(docs) * 0.015))

            topic_model = BERTopic(
                n_gram_range=n_gram_ranges,
                min_topic_size=min_topic_sizes,
                nr_topics=nr_topics_options,
                embedding_model=embedding_model,
                vectorizer_model=vectorizer_model,
                calculate_probabilities=True,
                ctfidf_model=ctfidf_model,
                umap_model=umap_model
            )

            # Per-document topic assignments and probabilities
            topics, probs = topic_model.fit_transform(docs, embeddings)

            freq = topic_model.get_topic_info()

            # Add 'eu_nuts_id' and 'period' columns to the topic info DataFrame
            freq['eu_nuts_id'] = nuts
            freq['period'] = i

            # Append the result to the list
            all_results.append(freq)

            print(f"Processed topic information for NUTS region {nuts} and period {i}")

        except Exception as e:
            # Best effort: report the failure for this group and continue with the next one.
            print(f"Error processing NUTS region {nuts} and period {i}: {e}")

# Combine all the results into a single DataFrame.
# pd.concat raises an uninformative ValueError on an empty list, so fail with a
# message that says what actually happened.
if not all_results:
    raise ValueError("No topic results were produced for any NUTS region/period; nothing to combine.")
quantum_results = pd.concat(all_results, ignore_index=True)

# Define a function to extract only the numbered labels
def extract_labels(text):
    """Return only the numbered-label lines of `text`, joined with newlines.

    A match starts at the beginning of a line with one or more digits, a
    period, one whitespace character and at least one further character
    (e.g. "1. Some label"). Returns an empty string when nothing matches.
    """
    numbered_line = re.compile(r'^\d+\.\s.+', re.MULTILINE)
    # Collect every full match in order of appearance and glue them back together
    return "\n".join(match.group(0) for match in numbered_line.finditer(text))

# `quantum_results` is assembled from BERTopic's `get_topic_info()` output plus
# the 'eu_nuts_id' and 'period' columns added in the loop, so a 'content' column
# exists only if something else supplied it. Clean it when present instead of
# failing with a KeyError right before the results are written to disk.
if 'content' in quantum_results.columns:
    quantum_results['content'] = quantum_results['content'].apply(extract_labels)
else:
    print("Column 'content' not found in quantum_results; skipping label extraction.")

# Save the final combined DataFrame to a single CSV file
quantum_results.to_csv('quantum_pub_bertopic.csv', index=False)

In [None]:
# Create an empty list to store the results (one topic-info frame per region/period)
all_results = []

# Loop-invariant settings, set up once rather than on every iteration.
# Loading the sentence-embedding model does not depend on the documents, so it
# is loaded a single time instead of once per (region, period) pair.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
n_gram_ranges = (1, 1)
nr_topics_options = 40  # value passed to BERTopic's nr_topics

# Iterate over each 'eu_nuts_id' and 'period', showing progress with tqdm
for nuts in tqdm(non_quant_top_100['eu_nuts_id'].unique(), desc="Processing NUTS regions"):
    # Select the region's rows once; the period filter below then works on this
    # smaller frame instead of rescanning the whole dataset for every period.
    region_non_quant = non_quant_top_100[non_quant_top_100['eu_nuts_id'] == nuts]

    for i in range(1, 7):
        # The name `filtered_quant` is kept from the original cell; it holds
        # non-quant rows here.
        filtered_quant = region_non_quant[region_non_quant['period'] == i]

        # Abstracts for BERTopic: drop missing values and force every entry to str
        docs = filtered_quant['abstract'].dropna().astype(str).tolist()

        if len(docs) == 0:
            print(f"No valid documents for NUTS region {nuts} and period {i}")
            continue

        # Cap UMAP's n_neighbors at 10 and keep it below the number of documents.
        # Checked before any embedding work so undersized groups are skipped cheaply.
        n_neighbors = min(10, len(docs) - 1)
        if n_neighbors <= 1:  # UMAP requires n_neighbors to be at least 2
            print(f"Skipping NUTS region {nuts} and period {i} due to insufficient documents.")
            continue

        try:
            # Fresh vectorizer and c-TF-IDF objects for every fit; embeddings are
            # precomputed and handed to BERTopic below.
            vectorizer_model = CountVectorizer(stop_words=all_stop_words)
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            embeddings = embedding_model.encode(docs, show_progress_bar=True)

            # min_dist in 0.0-0.3: a lower value makes the clusters tighter,
            # whereas a higher value makes them more dispersed.
            # NOTE(review): no random_state is passed to UMAP, so topic assignments
            # may differ between runs — confirm whether reproducibility is needed.
            umap_model = UMAP(n_neighbors=n_neighbors, min_dist=0.0, metric='cosine')

            # Minimum number of documents per cluster: 1% of the documents, never
            # fewer than 2 (alternatives tried: 1, 1.5, 2, 3%).
            min_topic_sizes = max(2, int(len(docs) * 0.01))

            topic_model = BERTopic(
                n_gram_range=n_gram_ranges,
                min_topic_size=min_topic_sizes,
                nr_topics=nr_topics_options,
                embedding_model=embedding_model,
                vectorizer_model=vectorizer_model,
                calculate_probabilities=True,
                ctfidf_model=ctfidf_model,
                umap_model=umap_model
            )

            # Per-document topic assignments (`topics`) and probabilities (`probs`)
            topics, probs = topic_model.fit_transform(docs, embeddings)

            # Collect the topic overview for this group
            freq = topic_model.get_topic_info()

            # Add 'eu_nuts_id' and 'period' columns to the topic info DataFrame
            freq['eu_nuts_id'] = nuts
            freq['period'] = i

            # Append the result to the list
            all_results.append(freq)

            print(f"Processed topic information for NUTS region {nuts} and period {i}")

        except Exception as e:
            # Best effort: report the failure for this group and continue with the next one.
            print(f"Error processing NUTS region {nuts} and period {i}: {e}")

# Combine all the results into a single DataFrame.
# pd.concat raises an uninformative ValueError on an empty list, so fail with a
# message that says what actually happened.
if not all_results:
    raise ValueError("No topic results were produced for any NUTS region/period; nothing to combine.")
non_quant_results = pd.concat(all_results, ignore_index=True)

# Define a function to extract only the numbered labels
def extract_labels(text):
    """Return only the numbered-label lines of `text`, joined with newlines.

    A match starts at the beginning of a line with one or more digits, a
    period, one whitespace character and at least one further character
    (e.g. "1. Some label"). Returns an empty string when nothing matches.
    """
    numbered_line = re.compile(r'^\d+\.\s.+', re.MULTILINE)
    # Collect every full match in order of appearance and glue them back together
    return "\n".join(match.group(0) for match in numbered_line.finditer(text))

# `non_quant_results` is assembled from BERTopic's `get_topic_info()` output
# plus the 'eu_nuts_id' and 'period' columns added in the loop, so a 'content'
# column exists only if something else supplied it. Clean it when present
# instead of failing with a KeyError right before the results are written to disk.
if 'content' in non_quant_results.columns:
    non_quant_results['content'] = non_quant_results['content'].apply(extract_labels)
else:
    print("Column 'content' not found in non_quant_results; skipping label extraction.")

# Save the final DataFrame to a single CSV file
non_quant_results.to_csv('pub_bertopic.csv', index=False)