# **Similarity Measurement**

In [1]:
import torch
# Report the CUDA setup. The device-specific queries are only issued when a
# GPU is actually present, because torch.cuda.current_device() and
# torch.cuda.get_device_name() raise on a machine without a usable CUDA device.
cuda_available = torch.cuda.is_available()
print("Is CUDA available:", cuda_available)
print("CUDA device count:", torch.cuda.device_count())
if cuda_available:
    print("Current device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(0))  # name of the first GPU
else:
    print("No CUDA device detected; computations will run on the CPU.")

  from .autonotebook import tqdm as notebook_tqdm


Is CUDA available: True
CUDA device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 4070 Laptop GPU


**Data Import and Preprocessing**

In [4]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.parsing.preprocessing import remove_stopwords

# Load the BERT tokenizer and encoder.
# NOTE(review): `model` is rebound to a SentenceTransformer in the similarity
# cell further down, and neither `tokenizer` nor this BertModel is used by the
# cells visible in this notebook -- confirm whether this load is still needed.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Load the publication and patent topic tables.
pub_data = pd.read_csv('eu_pub_bertopic_nuts2_labels.csv')
patent_data = pd.read_csv('eu_patent_bertopic_nuts2_labels.csv')

# Fail fast with a readable message if a column the similarity cells index
# (`nuts`, `period`, `content`) is absent, instead of hitting a bare KeyError
# in the middle of the encoding loop.
REQUIRED_COLUMNS = {'nuts', 'period', 'content'}
for frame_name, frame in (('pub_data', pub_data), ('patent_data', patent_data)):
    missing_columns = REQUIRED_COLUMNS - set(frame.columns)
    if missing_columns:
        raise KeyError(
            f"{frame_name} is missing required column(s): {sorted(missing_columns)}"
        )

**BERT-based Similarity Measurement (Sentence-Transformers, `all-mpnet-base-v2`)**

In [7]:
import re
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Use the GPU when one is present; otherwise fall back to the CPU so the cell
# still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-mpnet-base-v2', device=device)
print(f"Model is loaded on device: {model.device}")

# Matches every <...> span and every double quote; compiled once because it is
# applied to every document of every (region, period) pair.
MARKUP_PATTERN = re.compile(r'<.*?>|["]')

# Column layout of the output table; passing it explicitly keeps the CSV header
# intact even when no (region, period) pair produced a result.
SIMILARITY_COLUMNS = ['eu_nuts_id', 'period', 'bert_similarity']


def _clean_documents(frame):
    """Return the non-null `content` values of `frame` with <...> spans and double quotes removed."""
    return [MARKUP_PATTERN.sub('', text) for text in frame['content'].dropna().tolist()]


def _mean_pairwise_similarity(docs_a, docs_b):
    """Return the mean cosine similarity over every (docs_a, docs_b) document pair.

    Both lists are encoded with the module-level `model`; the similarity matrix
    is moved to the CPU before being converted to NumPy and averaged.
    """
    embeddings_a = model.encode(docs_a, convert_to_tensor=True)
    embeddings_b = model.encode(docs_b, convert_to_tensor=True)
    return util.pytorch_cos_sim(embeddings_a, embeddings_b).cpu().numpy().mean()


results = []

# Regions and periods are both taken from the publication table, so a pair that
# only exists in the patent table is never visited.
periods = pub_data['period'].unique()
for region in pub_data['nuts'].unique():
    # Filter by region once instead of re-scanning both full tables per period.
    pub_region = pub_data[pub_data['nuts'] == region]
    patent_region = patent_data[patent_data['nuts'] == region]

    for period in periods:
        doc1 = _clean_documents(pub_region[pub_region['period'] == period])
        doc2 = _clean_documents(patent_region[patent_region['period'] == period])

        if not doc1 or not doc2:
            continue  # Skip if either set is empty

        results.append({
            'eu_nuts_id': region,
            'period': period,
            'bert_similarity': _mean_pairwise_similarity(doc1, doc2),
        })

# Convert results to DataFrame
similarity_df = pd.DataFrame(results, columns=SIMILARITY_COLUMNS)

# Save to CSV
# NOTE(review): a later cell reads '1-4_similarity_bert.csv', not this file --
# confirm whether the output is renamed outside the notebook.
similarity_df.to_csv('similarity_bert.csv', index=False)

Model is loaded on device: cuda:0


In [8]:
### Check
# Recompute the similarity of a single (region, period) pair by hand so it can
# be compared with the value stored for that pair.

region = 'FR71'
period = 3
df1_filtered = pub_data[(pub_data['nuts'] == region) & (pub_data['period'] == period)]
df2_filtered = patent_data[(patent_data['nuts'] == region) & (patent_data['period'] == period)]

# Use the same preprocessing as the main loop (drop missing contents, remove
# <...> spans and double quotes) so the check reproduces the pipeline; without
# dropna() a NaN content would make re.sub raise a TypeError.
# The cleaning has little effect for this particular pair.
doc1 = [re.sub(r'<.*?>|["]', '', item) for item in df1_filtered['content'].dropna().tolist()]
doc2 = [re.sub(r'<.*?>|["]', '', item) for item in df2_filtered['content'].dropna().tolist()]

if not doc1 or not doc2:
    # Mirrors the main loop, which skips pairs where either side is empty.
    print(f"No documents to compare for region={region!r}, period={period!r}")
else:
    embeddings_a = model.encode(doc1, convert_to_tensor=True)  # embeddings of A (publications)
    embeddings_b = model.encode(doc2, convert_to_tensor=True)  # embeddings of B (patents)

    # Mean over the full pairwise cosine-similarity matrix. An explicit double
    # loop calling util.cos_sim(embed_a, embed_b).item() for every pair and
    # averaging the values should give the same number (up to floating-point
    # rounding), only slower.
    similarities = util.pytorch_cos_sim(embeddings_a, embeddings_b).cpu().numpy()
    average_similarity = similarities.mean()

    print(doc1)
    print(doc2)
    print(average_similarity)

['Advanced 3D Photodetector Technologies for Pulsed and Continuous Wave Applications', 'Impact of Climate Change on Aquatic Ecosystem Dynamics and Biodiversity', 'Computational Control Systems in Networked Production and Radar Technologies', 'Nonlinear Dynamics in Biological Systems', 'Relativistic Astrophysical Plasma and Particle Physics', 'Superconducting Phase Transition and Magnetic Susceptibility Studies', 'Hydrogenation Catalysis Using Metal-Modified Alumina Supports', 'Synthesis and Characterization of Silica Aerogels']
['Advanced Textile Steam Treatment Technologies', 'Alpine Ski Boot Construction and Binding Mechanisms', 'Electronic Circuit Signal Processing', 'Catalytic Acid-Metal Reaction Processes in Chemical Synthesis', 'Fixed-Position Imaging Device Technology']
0.059860118


In [1]:
import pandas as pd 
# Load the stored similarity table and show it as the cell output.
# NOTE(review): the similarity cell above writes 'similarity_bert.csv' with
# index=False, whereas this file has a different name and its displayed table
# carries an 'Unnamed: 0' column -- presumably it was exported separately;
# confirm which file is the intended input.
SIMILARITY_CSV = '1-4_similarity_bert.csv'
similarity = pd.read_csv(SIMILARITY_CSV)
similarity

Unnamed: 0,eu_nuts_id,period,bert_similarity
0,FR71,3,0.059860
1,FR71,4,0.054019
2,FR71,5,0.026632
3,FR71,6,0.026299
4,FR71,7,0.059980
...,...,...,...
664,AT21,4,0.044976
665,AT21,5,0.011215
666,ITH1,5,0.000703
667,AT34,5,0.047280


In [None]:
# Rank the region/period pairs from most to least similar. sort_values returns
# a sorted copy, so `similarity` itself keeps its original row order.
similarity.sort_values('bert_similarity', ascending=False)

Unnamed: 0,eu_nuts_id,period,bert_similarity
316,NL22,5,0.145414
300,NL42,4,0.144541
331,NL34,5,0.139538
213,UKJ1,5,0.138198
274,BE23,4,0.136340
...,...,...,...
49,DE14,5,-0.013209
127,DE11,4,-0.016594
451,FI19,4,-0.023940
126,SE11,7,-0.030735
