### Semantic Proximity Research
- Made by: Dr. Jisoo Hur
- Goal: 05. BERT-based Similarity Measurement
- Data set: WoS

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.parsing.preprocessing import remove_stopwords

# Load the BERT tokenizer and encoder from one shared checkpoint name so the
# vocabulary and the weights always come from the same pretrained model
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Load the two CSV datasets whose text is compared
pub_data = pd.read_csv('pub_bertopic_chatgpt.csv')
quantum_data = pd.read_csv('quantum_pub_bertopic_chatgpt.csv')

# Function to get BERT embeddings
def get_bert_embedding(text):
    """Return BERT's final-layer [CLS] vector for *text*.

    The text is encoded with the module-level ``tokenizer`` and passed through
    the module-level ``model`` without gradient tracking.

    NOTE: ``truncation=True`` with ``max_length=512`` means that input longer
    than 512 tokens is cut off, so only the beginning of a long text
    contributes to the returned vector.

    Args:
        text: The text to embed.

    Returns:
        A torch tensor holding position 0 (the [CLS] token) of
        ``last_hidden_state``, with the leading batch axis kept.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only, so gradient bookkeeping is switched off
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Index 0 on the token axis is the [CLS] token (the first token)
    return hidden_states[:, 0, :]

# Function to calculate BERT-based similarity
def calculate_bert_similarity(df1, df2, region, period):
    """Return the BERT cosine similarity of two datasets for one region/period.

    Rows are selected from each frame where ``eu_nuts_id`` equals *region* and
    ``period`` equals *period*. The ``content`` values of the selected rows
    are joined into one lower-cased document per frame, stopwords are removed,
    and the cosine similarity between the two documents' BERT embeddings
    (from ``get_bert_embedding``) is returned.

    Args:
        df1: DataFrame with ``eu_nuts_id``, ``period`` and ``content`` columns.
        df2: DataFrame with the same three columns.
        region: Value matched against ``eu_nuts_id``.
        period: Value matched against ``period``.

    Returns:
        The similarity score, or ``None`` when either frame has no rows with
        non-missing ``content`` for the given region and period.
    """
    # Filter data by region and period
    rows1 = df1[(df1['eu_nuts_id'] == region) & (df1['period'] == period)]
    rows2 = df2[(df2['eu_nuts_id'] == region) & (df2['period'] == period)]

    # Nothing to compare if either side has no rows at all
    if rows1.empty or rows2.empty:
        return None

    # Drop missing values BEFORE converting to str: astype(str) would turn
    # NaN/None into the literal word "nan", which would then be embedded as
    # if it were real text
    texts1 = rows1['content'].dropna().astype(str)
    texts2 = rows2['content'].dropna().astype(str)

    # Rows exist but carry no usable text: treat the same as "no data"
    if texts1.empty or texts2.empty:
        return None

    # Build one document per dataset and strip stopwords
    doc1 = remove_stopwords(' '.join(texts1).lower())
    doc2 = remove_stopwords(' '.join(texts2).lower())

    # Embed both documents and compare them
    embedding1 = get_bert_embedding(doc1)
    embedding2 = get_bert_embedding(doc2)
    similarity = cosine_similarity(embedding1, embedding2)

    # cosine_similarity returns a 2-D matrix; with one embedding per side the
    # single score sits at [0][0]
    return similarity[0][0]

# Score every (region, period) pair built from the values present in pub_data
regions = pub_data['eu_nuts_id'].unique()
periods = pub_data['period'].unique()

# One record per pair, regions in the outer loop and periods in the inner one;
# bert_similarity is None when calculate_bert_similarity finds no data
results = [
    {
        'eu_nuts_id': region,
        'period': period,
        'bert_similarity': calculate_bert_similarity(pub_data, quantum_data, region, period),
    }
    for region in regions
    for period in periods
]

# Convert results to DataFrame
similarity_df = pd.DataFrame(results)

# Display or save the results
print(similarity_df)

In [None]:
# Write the similarity table to CSV; index=False leaves the DataFrame's row
# index out of the file
similarity_df.to_csv('similarity_bert.csv', index=False)