<a href="https://colab.research.google.com/github/Hideyuki-Machida/ML_demos/blob/main/JSTS%E3%83%87%E3%83%BC%E3%82%BF%E3%82%BB%E3%83%83%E3%83%88%E3%81%A7%E6%97%A5%E6%9C%AC%E8%AA%9E%E6%96%87%E3%83%9A%E3%82%A2%E6%8E%A8%E8%AB%96%E8%A9%95%E4%BE%A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# JSTSデータセットで日本語文ペア推論評価

JGLUEのJSTSデータセットを使用して、各SentenceTransformersモデルの埋め込みベクトルが文ペアの意味的類似度をどの程度正確に捉えられるかを評価する。

https://github.com/yahoojapan/JGLUE

SentenceTransformers

https://www.sbert.net/docs/pretrained_models.html#semantic-search

こちらを参考にさせていただきました。

https://github.com/nyanta012/demo/blob/main/embedding_comparison.ipynb

In [1]:
# Print the NVIDIA driver and GPU status of the current runtime.
!nvidia-smi

Sun Jul 23 09:47:46 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P8    12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## インストール

In [2]:
!pip install sentence-transformers
!pip install datasets
!pip install fugashi
!pip install ipadic

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[

## データセットの設定

In [3]:
import json
import pandas as pd
from urllib.request import urlopen

# JSTS v1.1 validation split from the JGLUE repository; each line is parsed as one JSON record.
jsts_url = "https://raw.githubusercontent.com/yahoojapan/JGLUE/main/datasets/jsts-v1.1/valid-v1.1.json"

# Open the URL in a context manager so the HTTP response is closed after reading,
# and skip blank lines so an empty or trailing line cannot make json.loads fail.
with urlopen(jsts_url) as response:
    df = pd.DataFrame([json.loads(line) for line in response if line.strip()])

In [4]:
# Display the loaded DataFrame as the cell's output.
df

Unnamed: 0,sentence_pair_id,yjcaptions_id,sentence1,sentence2,label
0,0,100312_421853-104611-31624,レンガの建物の前を、乳母車を押した女性が歩いています。,厩舎で馬と女性とが寄り添っています。,0.0
1,1,100371-104675-104678,山の上に顔の白い牛が2頭います。,曇り空の山肌で、牛が２匹草を食んでいます。,2.4
2,2,100668-104946-104949,バナナを持った人が道路を通行しています。,道の上をバナナを背負った男性が歩いています。,3.6
3,3,100958-105177-105178,スケートボーダーが手すりを滑っています。,階段の手すりでスケートボードをする男性がいます。,4.0
4,4,101401-105530-105533,ダブルベッドの上で、女性が足を組み横たわっています。,ベッドの上に寝転んで、足を組んでいる人が映っています。,3.0
...,...,...,...,...,...
1452,1452,98940-103167-103171,男性が携帯電話を耳に当てて通話しています。,建物の前でシャツを着た男性が携帯電話を耳に当てて通話しています。,3.6
1453,1453,99222-103520-103521,カウンターに、バナナ、サニーレタス、マッシュルームなどが置かれています。,キッチンの机の上に野菜と果物がたくさん並べられています。,3.8
1454,1454,99421-103771-103773,棚に白い電子レンジが置いてあります。,レンジと調理家電が棚の上に乗っています。,2.6
1455,1455,99453-103814-103815,部屋の中に置かれた自転車のサドルに猫が座っています。,白い猫のしっぽが一匹自転車の荷台に乗っています。,2.8


## 文ペアのコサイン類似度を取得

文ペアそれぞれの埋め込みベクトルを取得し、そのコサイン類似度を計算する。

In [5]:
import torch
from torch.nn.functional import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

# Sentence-Transformers model to evaluate: exactly one MODEL_NAME assignment is active.
MODEL_NAME = "intfloat/multilingual-e5-large"
# Other candidates (uncomment one to evaluate it instead):
# MODEL_NAME = "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
# MODEL_NAME = "embaas/sentence-transformers-e5-large-v2"
# MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# MODEL_NAME = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"
# MODEL_NAME = "sentence-transformers/stsb-xlm-r-multilingual"
# MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
# MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# MODEL_NAME = "sentence-transformers/distiluse-base-multilingual-cased-v1"
# MODEL_NAME = "sentence-transformers/distiluse-base-multilingual-cased-v2"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the selected model and move it onto that device.
model = SentenceTransformer(MODEL_NAME).to(device)

# Cosine similarity of every sentence pair, in the same row order as df.
similarities = []

sentence_pairs = zip(df["sentence1"], df["sentence2"])
# The pair iterator has no length, so pass total explicitly; without it tqdm
# can only show a running counter instead of a real progress bar.
for sentence1, sentence2 in tqdm(sentence_pairs, total=len(df), desc="Encoding sentence pairs"):
    # The two sentences of the current pair.
    input_texts = [sentence1, sentence2]
    # Variant with the "query: " prefix on both sentences:
    # input_texts = ["query: " + sentence1, "query: " + sentence2]

    # Embed both sentences in one encode() call.
    batch_embeddings = model.encode(input_texts, show_progress_bar=False, device=device)

    # Cosine similarity between the two embeddings, stored as a plain float.
    similarity = util.cos_sim(batch_embeddings[0], batch_embeddings[1])
    similarities.append(similarity.item())


Downloading (…)f7a27/.gitattributes:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Downloading (…)06d35f7a27/README.md:   0%|          | 0.00/160k [00:00<?, ?B/s]

Downloading (…)d35f7a27/config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Downloading (…)a27/onnx/config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

Downloading model.onnx:   0%|          | 0.00/546k [00:00<?, ?B/s]

Downloading model.onnx_data:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Downloading (…)35f7a27/modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

0it [00:00, ?it/s]

## 文ペアのコサイン類似度を評価

In [6]:
# 評価する
from scipy.stats import pearsonr, spearmanr

def eval_corr(predicted=None, gold=None):
    """Print the Pearson and Spearman correlations between two score sequences.

    Args:
        predicted: Predicted similarity scores. When omitted, the notebook-level
            ``similarities`` list is used.
        gold: Reference scores. When omitted, ``df["label"]`` is used.

    Returns:
        None. Both coefficients are printed.

    Raises:
        ValueError: If ``predicted`` and ``gold`` have different lengths, for
            example when ``similarities`` was computed for a different ``df``.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if predicted is None:
        predicted = similarities
    if gold is None:
        gold = df["label"]

    # Fail early with an actionable message instead of an opaque SciPy error.
    if len(predicted) != len(gold):
        raise ValueError(
            f"predicted has {len(predicted)} scores but gold has {len(gold)}; "
            "recompute the similarities for the current dataset."
        )

    # Pearson correlation coefficient.
    pearson_corr, _ = pearsonr(predicted, gold)
    print(f'Pearson correlation: {pearson_corr}')

    # Spearman rank correlation coefficient.
    spearman_corr, _ = spearmanr(predicted, gold)
    print(f'Spearman correlation: {spearman_corr}')

eval_corr()

Pearson correlation: 0.8503602531223328
Spearman correlation: 0.8098707817267512


In [7]:
# intfloat/multilingual-e5-large
# Pearson correlation: 0.8503602431405118
# Spearman correlation: 0.8098725005711473

# intfloat/multilingual-e5-large (query:)
# Pearson correlation: 0.8620480405391278
# Spearman correlation: 0.8185366371041045

# embaas/sentence-transformers-e5-large-v2
# Pearson correlation: 0.6369473720329537
# Spearman correlation: 0.6347079905354115

# sonoisa/sentence-bert-base-ja-mean-tokens-v2
# Pearson correlation: 0.8616167407674791
# Spearman correlation: 0.8087245023635142



# sentence-transformers/all-MiniLM-L6-v2
# Pearson correlation: 0.6345837681661725
# Spearman correlation: 0.6264711567995093

# sentence-transformers/paraphrase-xlm-r-multilingual-v1
# Pearson correlation: 0.8376903792793898
# Spearman correlation: 0.7841414510056081

# sentence-transformers/stsb-xlm-r-multilingual
# Pearson correlation: 0.8320184606054799
# Spearman correlation: 0.7843004322494745

# sentence-transformers/paraphrase-multilingual-mpnet-base-v2
# Pearson correlation: 0.8395266654991814
# Spearman correlation: 0.7946156433517679

# sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
# Pearson correlation: 0.8323287371818228
# Spearman correlation: 0.7782890018007536

# sentence-transformers/distiluse-base-multilingual-cased-v1
# Pearson correlation: 0.780879250998077
# Spearman correlation: 0.7391699246312541

# sentence-transformers/distiluse-base-multilingual-cased-v2
# Pearson correlation: 0.8102770729216254
# Spearman correlation: 0.7624937447808708



# rinna/japanese-gpt2-medium
# Pearson correlation: 0.7586635501874704
# Spearman correlation: 0.7118068175773248

# rinna/japanese-gpt-neox-3.6b
# Pearson correlation: 0.7112765150150083
# Spearman correlation: 0.6868566506130712
