### Gemini APIの前準備

In [1]:
# パッケージのインストール
!pip install google-generativeai

[0m

In [3]:
# from google.colab import userdata
import google.generativeai as genai
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Read the Gemini API key from the environment and fail fast with a clear
# message when it is missing, instead of failing later on the first API call.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; define it in the environment or in a .env file."
    )
genai.configure(api_key=GOOGLE_API_KEY)

### 埋め込みモデル一覧の確認

In [4]:
import google.generativeai as genai

# Print the name of every model that supports the "embedContent" method
embedding_model_infos = (
    model_info
    for model_info in genai.list_models()
    if "embedContent" in model_info.supported_generation_methods
)
for model_info in embedding_model_infos:
    print(model_info.name)

models/embedding-001
models/text-embedding-004


### text-embedding-004の使い方

In [5]:
# Embedding helper backed by the Gemini API
def embedding(texts):
    """Return the embedding(s) of ``texts`` from ``models/text-embedding-004``.

    ``texts`` is forwarded unchanged as the ``content`` argument of
    ``genai.embed_content``; the value stored under the ``"embedding"`` key
    of the response is returned.
    """
    response = genai.embed_content(
        content=texts,
        model="models/text-embedding-004",
    )
    return response["embedding"]

In [6]:
# Prepare the text to embed
texts = ["This is a Test."]

# Convert the text into embedding vectors and print them
embeds = embedding(texts)
print(embeds)

[[0.021709729, -0.010004897, -0.07831449, 0.00021625146, 0.016366882, -0.0061589633, 0.056820635, 0.03478271, -0.0053757853, 0.035010446, -4.1074087e-05, 0.01616381, 0.046615493, -0.01919706, -0.0007648253, -0.019785572, 0.026104966, 0.067962535, -0.06785214, -0.031471316, 0.020794155, -0.04611739, 0.0019919872, -0.035202015, -0.026550822, -0.033206027, 0.008064024, 0.003174477, 0.025845889, -0.021457084, 0.012842088, 0.043763965, 0.031452913, -0.004718091, 0.012690874, -0.011095253, -0.012762027, 0.020994188, 0.020329844, -0.080613144, 0.0061794934, 0.08040166, -0.076733366, -0.00773825, -0.025886035, -0.041227743, 0.038715918, 0.0044989176, 0.003333499, -0.0023913558, 0.0407256, 0.051695146, -0.057483457, -0.0028400905, 0.015980821, -0.009783751, 0.013114645, -0.046905294, 0.052408155, 0.0075120507, 0.041234136, -0.02858687, -0.0052159308, 0.005603428, 0.013024177, -0.0059801494, 0.014997179, -0.019358763, -0.03909497, -0.029858405, -0.071830705, 0.018266415, 0.0065697925, -0.0085571

In [7]:
# Inspect the type of the first element of the first embedding vector
type(embeds[0][0])

float

In [8]:
# Check the length of the embedding vector
print(len(embeds[0]))

768


### text-embedding-004の近傍探索

In [9]:
# Faissのパッケージのインストール
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0
[0m

In [10]:
# Query text
in_texts = [
    "I'm glad it didn't rain today"
]

# Candidate texts to search over
target_texts = [
    "What is your favorite food?",
    "Where do you live?",
    "Morning trains are crowded.",
    "It's nice weather today.",
    "The economy is bad lately.",
]

# Convert both the query and the candidates into embedding vectors
in_embeds = embedding(in_texts)
target_embeds = embedding(target_texts)

In [11]:
import numpy as np

# Build float32 numpy arrays from the embeddings
in_embeds = np.array(in_embeds, dtype=np.float32)
target_embeds = np.array(target_embeds, dtype=np.float32)

In [12]:
# Inspect the element type after the numpy conversion
type(in_embeds[0][0])

numpy.float32

In [13]:
# Show the installed numpy version
np.__version__

'1.26.4'

In [16]:
# NOTE (original author's remark): an error attributed to Python 3.12 was hit here
# (presumably the `numpy.distutils` ModuleNotFoundError recorded later in this notebook; TODO confirm).

import faiss
# Create the Faiss index, sized to the length of one input embedding
index = faiss.IndexFlatL2(len(in_embeds[0]))

In [17]:
# Add the candidate-text embeddings to the index
index.add(target_embeds)

In [18]:
# Run the neighbor search (second argument 1 = number of results requested per query)
distances, indices = index.search(in_embeds, 1)

# Show the distances, the matched indices, and the matched candidate text
print(distances)
print(indices)
print(target_texts[indices[0][0]])

[[0.5306722]]
[[3]]
It's nice weather today.


### bge-m3の使い方

In [38]:
# bge-m3のパッケージのインストール
!pip install FlagEmbedding

Collecting FlagEmbedding
  Using cached FlagEmbedding-1.3.2-py3-none-any.whl
Collecting transformers==4.44.2 (from FlagEmbedding)
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting datasets==2.19.0 (from FlagEmbedding)
  Using cached datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate>=0.20.1 (from FlagEmbedding)
  Using cached accelerate-1.1.0-py3-none-any.whl.metadata (19 kB)
Collecting sentence-transformers (from FlagEmbedding)
  Using cached sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting peft (from FlagEmbedding)
  Using cached peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting ir-datasets (from FlagEmbedding)
  Using cached ir_datasets-0.5.8-py3-none-any.whl.metadata (12 kB)
Collecting pyarrow>=12.0.0 (from datasets==2.19.0->FlagEmbedding)
  Using cached pyarrow-18.0.0-cp312-cp312-manylinux_2_28_aarch64.whl.metadata (3.3 kB)
Collecting pyarrow-hotfix (from datasets==2.19.0->FlagEmbedding)
  Using cach

In [39]:
from FlagEmbedding import BGEM3FlagModel

# Prepare the embedding model: BAAI/bge-m3, constructed with use_fp16=True
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

# Embedding helper backed by the local bge-m3 model
def embedding(texts):
    """Return the ``"dense_vecs"`` entry of ``model.encode(texts)``.

    Note: this redefines the module-level ``embedding`` name, replacing the
    Gemini-backed helper defined earlier in this notebook.
    """
    encoded = model.encode(texts)
    return encoded["dense_vecs"]

Fetching 30 files: 100%|██████████| 30/30 [02:04<00:00,  4.15s/it]


In [40]:
# Prepare the text to embed
texts = ["これはテストです。"]

# Convert the text into embedding vectors and print them
embeds = embedding(texts)
print(embeds)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[[-0.02553461  0.01737304 -0.03744124 ... -0.02553592 -0.00239796
   0.00116605]]


In [41]:
# Check the length of the embedding vector
print(len(embeds[0]))

1024


### bge-m3の近傍探索

In [42]:
# Faissパッケージのインストール
!pip install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

In [43]:
# Query text ("I'm glad it didn't rain today").
# Fixed a kanji typo in the query: 振ら (shake) -> 降ら (rain falling).
in_texts = [
    "今日は雨降らなくてよかった"
]

# Candidate texts to search over
target_texts = [
    "好きな食べ物は何ですか?",
    "どこにお住まいですか?",
    "朝の電車は混みますね",
    "今日は良いお天気ですね",
    "最近景気悪いですね"
]

# Convert both the query and the candidates into embedding vectors
in_embeds = embedding(in_texts)
target_embeds = embedding(target_texts)

In [44]:
import numpy as np

# Build float32 numpy arrays from the embeddings
in_embeds = np.array(in_embeds, dtype=np.float32)
target_embeds = np.array(target_embeds, dtype=np.float32)

In [47]:
# Show the shape of the candidate embedding array
target_embeds.shape

(5, 1024)

In [46]:
import faiss

# Create the Faiss index, sized to the length of one input embedding
index = faiss.IndexFlatL2(len(in_embeds[0]))

ModuleNotFoundError: No module named 'numpy.distutils'

In [None]:
# Add the candidate-text embeddings to the index
index.add(target_embeds)

In [None]:
# Run the neighbor search (second argument 1 = number of results requested per query)
distances, indices = index.search(in_embeds, 1)

# Show the distances, the matched indices, and the matched candidate text
print(distances)
print(indices)
print(target_texts[indices[0][0]])