### Semantic Proximity Research
- Made by: Jisoo Hur (Ph.D.) & Keungoui Kim (Ph.D.)
- Goal: 05. BERT-based Similarity Measurement
- Data set: WoS

#### Data Import & Preparation

In [4]:
# Base directory for the project's CSV files. Later cells build file paths by
# concatenating a file name directly onto this string (dir+'...'), so the
# trailing "/" is required.
# NOTE(review): this name shadows Python's built-in dir() for the whole kernel
# session, and the absolute local path only resolves on the author's machine.
dir = "H:/GD_awekimm/[HGU]/[Research]/12_허지수/00_SemanticProximity/SemanticProximity_research/"

In [5]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.parsing.preprocessing import remove_stopwords

# Load the pretrained 'bert-base-uncased' tokenizer and encoder.
# Both are read as globals by get_bert_embedding() further down.
# NOTE: a later cell rebinds the name `model` to a SentenceTransformer, so
# get_bert_embedding() only works while `model` still refers to this BertModel.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Load the two datasets that are compared per region and period.
# Both frames are filtered on 'eu_nuts_id' and 'period' and read through their
# 'content' column in the cells below.
pub_data = pd.read_csv(dir+'pub_bertopic_chatgpt.csv')
quantum_data = pd.read_csv(dir+'quantum_pub_bertopic_chatgpt.csv')

In [7]:
# Preview the first rows to confirm the expected columns
# (eu_nuts_id, period, keyword, content) were loaded.
pub_data.head()

Unnamed: 0,eu_nuts_id,period,keyword,content
0,UKM25,1,"['nomenclature', 'names', 'botanical', 'intern...",International Botanical Nomenclature and Class...
1,UKM25,1,"['galaxies', 'star', 'dust', 'galaxy', 'redshi...",Infrared Emission and Stellar Mass in Distant ...
2,UKM25,1,"['wireless', 'channel', 'modulation', 'mimo', ...",Advanced Wireless Communication Techniques for...
3,UKM25,1,"['patients', 'cells', 'cell', 'species', 'expr...",Impact of Increased Cell Expression in Disease...
4,UKM25,1,"['bar', 'detector', 'gamma', 'decays', 'gt', '...",Gamma Decay Detection and Branching Ratios in ...


#### BERT Similarity

In [6]:
# Function to get BERT embeddings
def get_bert_embedding(text):
    """Encode ``text`` with the global BERT model and return its first-token vector.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens, returned as PyTorch tensors) and passed through the
    module-level ``model`` with gradient tracking disabled.

    Args:
        text: Input passed straight to the tokenizer.

    Returns:
        The slice ``last_hidden_state[:, 0, :]`` of the model output, i.e. the
        hidden state at the first token position (the [CLS] token) of every
        sequence in the batch.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Position 0 along the sequence axis holds the [CLS] token.
    return hidden_states[:, 0, :]

# Function to calculate BERT-based similarity
def calculate_bert_similarity(df1, df2, region, period):
    """Cosine similarity between the BERT embeddings of two region/period documents.

    For each frame, the rows matching ``region`` and ``period`` are selected,
    their 'content' values are joined with spaces into one lower-cased
    document, stopwords are removed, and the document is embedded with
    ``get_bert_embedding``.

    Caveat: ``get_bert_embedding`` truncates its input to 512 tokens, so for
    long joined documents only the leading part contributes to the score.

    Args:
        df1: DataFrame with 'eu_nuts_id', 'period' and 'content' columns.
        df2: DataFrame with the same columns, compared against ``df1``.
        region: Value matched against 'eu_nuts_id'.
        period: Value matched against 'period'.

    Returns:
        The similarity score (element [0][0] of the cosine-similarity matrix),
        or None when either frame has no rows for the given region and period.
    """
    # Select the rows of each frame that belong to this region and period.
    subsets = [
        frame[(frame['eu_nuts_id'] == region) & (frame['period'] == period)]
        for frame in (df1, df2)
    ]

    # Without data on both sides there is nothing to compare.
    if any(subset.empty for subset in subsets):
        return None

    # One lower-cased, stopword-free document per side.
    documents = [
        remove_stopwords(' '.join(subset['content'].astype(str)).lower())
        for subset in subsets
    ]

    first_vector = get_bert_embedding(documents[0])
    second_vector = get_bert_embedding(documents[1])

    # cosine_similarity returns a matrix; the single score sits at [0][0].
    return cosine_similarity(first_vector, second_vector)[0][0]

# Apply similarity calculation for each unique combination of region and period.
# Regions and periods are taken from pub_data only, and the loop walks their
# full cross product, so some (region, period) pairs may have no rows in one
# or both frames; calculate_bert_similarity returns None for those.
regions = pub_data['eu_nuts_id'].unique()
periods = pub_data['period'].unique()

# One record per (region, period); collected as dicts and converted once below.
results = []

for region in regions:
    for period in periods:
        bert_score = calculate_bert_similarity(pub_data, quantum_data, region, period)
        results.append({'eu_nuts_id': region, 'period': period, 'bert_similarity': bert_score})

# Convert results to DataFrame
similarity_df = pd.DataFrame(results)

# Display the results
print(similarity_df) 

# Save the results.
# NOTE(review): this path is relative to the kernel's working directory,
# whereas the later export of 'similarity_bert_ed.csv' is written under `dir`
# — confirm that is intended.
similarity_df.to_csv('similarity_bert.csv', index=False)

In [57]:
import re
from sentence_transformers import SentenceTransformer, util

# Sentence-level alternative to the single-document BERT score: every content
# string is embedded separately and the score for a (region, period) pair is
# the mean cosine similarity over all pub_data x quantum_data content pairs.
#
# NOTE: this rebinds the global name `model` from the BertModel loaded earlier
# to a SentenceTransformer. get_bert_embedding() reads that global, so it must
# not be called again after this cell without reloading the BERT model.
model = SentenceTransformer('all-MiniLM-L6-v2')

regions = pub_data['eu_nuts_id'].unique()
periods = pub_data['period'].unique()


def _clean_contents(frame):
    """Return the frame's 'content' strings with <...> unwrapped and double quotes removed.

    The regex keeps the text inside angle brackets and drops only the brackets.
    Original author's note: this cleanup has little effect on this data.
    """
    return [
        re.sub(r"<(.*?)>", r"\1", item).replace('"', '')
        for item in frame['content'].to_list()
    ]


results = []
for region in regions:
    for period in periods:

        df1_filtered = pub_data[(pub_data['eu_nuts_id'] == region) & (pub_data['period'] == period)]
        df2_filtered = quantum_data[(quantum_data['eu_nuts_id'] == region) & (quantum_data['period'] == period)]

        if df1_filtered.empty or df2_filtered.empty:
            # regions x periods is a full cross product, so a pair may have no
            # rows on one side. There are then no content pairs to average:
            # record NaN explicitly (the value the mean of an empty list would
            # give) without encoding empty inputs or triggering numpy's
            # empty-mean warning.
            average_similarity = np.nan
        else:
            doc1 = _clean_contents(df1_filtered)
            doc2 = _clean_contents(df2_filtered)

            embeddings_a = model.encode(doc1, convert_to_tensor=True)
            embeddings_b = model.encode(doc2, convert_to_tensor=True)

            # A single call yields the full len(doc1) x len(doc2) similarity
            # matrix; its mean equals the average over all pairs. Averaging in
            # float64 mirrors the precision of averaging Python floats.
            pairwise = util.cos_sim(embeddings_a, embeddings_b)
            average_similarity = pairwise.double().mean().item()

        results.append({'eu_nuts_id': region, 'period': period, 'bert_similarity': average_similarity})

similarity_df = pd.DataFrame(results)

similarity_df.to_csv(dir+'similarity_bert_ed.csv', index=False)

In [64]:
# Spot check for a single region/period pair: rebuilds the filtered frames,
# cleaned documents and embeddings that the loop in the previous cell computes
# per iteration. Relies on `re` and on `model` being the SentenceTransformer
# bound in that earlier cell (it is used through .encode()).
region='UKF14'
period=4
df1_filtered = pub_data[(pub_data['eu_nuts_id'] == region) & (pub_data['period'] == period)]
df2_filtered = quantum_data[(quantum_data['eu_nuts_id'] == region) & (quantum_data['period'] == period)]

# Unwrap <...> markup (keeping the inner text) and strip double quotes.
# Original author's note: this cleanup has little effect on this data.
doc1 = [re.sub(r"<(.*?)>", r"\1", item) for item in df1_filtered['content'].to_list()]
doc1 = [item.replace('"', '') for item in doc1]
doc2 = [re.sub(r"<(.*?)>", r"\1", item) for item in df2_filtered['content'].to_list()]
doc2 = [item.replace('"', '') for item in doc2]

embeddings_a = model.encode(doc1, convert_to_tensor=True)  # embeddings of A (pub_data contents)
embeddings_b = model.encode(doc2, convert_to_tensor=True)  # embeddings of B (quantum_data contents)
