In [1]:
import numpy as np
import torch


#### 연습 문장

In [2]:
# Practice sentence pair: an English source sentence and its Korean translation.
input_str = "I love holiday"
output_str = "나는 연휴를 사랑해"
# The original cell spelled this name `ouput_str`; keep it as an alias so any
# existing reference to the misspelled name still resolves.
ouput_str = output_str

In [4]:
# Intentionally left commented out: np.dot cannot be applied to raw strings because the
# underlying ufunc (multiply) only works with numbers — not text.
# dot1 = np.dot("I", "나는")
# dot2 = np.dot("love", "나는")
# dot3 = np.dot("holiday", "나는")

## First of all, I have to tokenize and embed each word so that its meaning is represented as numbers.

[Gemini Embedding Reference URL](https://ai.google.dev/gemini-api/docs/embeddings?hl=ko)

`export GEMINI_API_KEY=<YOUR_API_KEY_HERE>`

In [None]:
import os

from google import genai

# Read the API key from the environment (set via `export GEMINI_API_KEY=...`)
# instead of relying on an undefined `GEMINI_API_KEY` Python name; a missing
# variable fails right here with a KeyError rather than a NameError.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

# Smoke test: embed one sentence and show the returned embedding objects.
result = client.models.embed_content(
        model="gemini-embedding-001",
        contents="What is the meaning of life?")

print(result.embeddings)

[ContentEmbedding(
  values=[
    -0.022374554,
    -0.004560777,
    0.013309286,
    -0.0545072,
    -0.02090443,
    <... 3067 more items ...>,
  ]
)]


In [8]:
# Ask a Gemini chat model to describe how the embedding model works.
# An earlier prompt, "Explain how AI works in a few words", returned:
#   'AI learns patterns from vast amounts of data to make predictions or perform tasks.'
response = client.models.generate_content(
    contents=(
        "Concretly explain how Gemini Embedding Model works "
        "effectively comparing to other sources"
    ),
    model="gemini-2.5-flash",
)
print(response.text)

The Gemini Embedding Model, like other powerful embedding models, translates textual information into numerical representations called "embeddings" or "vectors." These vectors capture the semantic meaning of the text such that texts with similar meanings are located closer together in a high-dimensional space, while dissimilar texts are further apart.

Let's break down how it works effectively and then compare it to other sources.

---

### 1. How the Gemini Embedding Model Works Concretely

At its core, the Gemini Embedding Model leverages a sophisticated deep learning architecture, primarily based on the **Transformer encoder** design, which has proven highly effective for natural language understanding.

Here's a step-by-step breakdown:

1.  **Tokenization:**
    *   **Input:** You provide a piece of text (e.g., "The quick brown fox jumps over the lazy dog.")
    *   **Process:** The model first breaks down this text into smaller units called "tokens." These can be words, sub-word u

## Extract Vectors with Gemini Embedding in earnest

In [48]:
# Embed the single Korean word that serves as the query, then inspect the response object.
em_query = client.models.embed_content(
    contents="나는",
    model="gemini-embedding-001",
)
print(type(em_query))
em_query

<class 'google.genai.types.EmbedContentResponse'>


EmbedContentResponse(
  embeddings=[
    ContentEmbedding(
      values=[
        -0.010612635,
        0.0059433444,
        0.005594396,
        -0.07262013,
        -0.0070909816,
        <... 3067 more items ...>,
      ]
    ),
  ],
  sdk_http_response=HttpResponse(
    headers=<dict len=10>
  )
)

In [43]:
# `embeddings` is a list; unpacking its single element gives the embedding object itself.
try1 = em_query.embeddings
(try2,) = em_query.embeddings

print(type(try1), type(try2), sep="\n")

<class 'list'>
<class 'google.genai.types.ContentEmbedding'>


In [49]:
# Pull the raw vector out of the one-element embeddings list and check its dimensionality.
(em_q,) = em_query.embeddings
em_Q = em_q.values
len(em_Q)

3072

> 문자열 목록으로 전달하여 한 번에 여러 청크의 임베딩을 생성할 수도 있습니다.

In [21]:
# Passing a list as `contents` embeds several strings in one request, which replaces
# three separate embed_content calls for "I", "love" and "holiday".
em_keys = client.models.embed_content(
    contents=["I", "love", "holiday"],
    model="gemini-embedding-001",
)
em_keys

EmbedContentResponse(
  embeddings=[
    ContentEmbedding(
      values=[
        -0.005278527,
        0.01615053,
        0.011982469,
        -0.074000314,
        0.0026567031,
        <... 3067 more items ...>,
      ]
    ),
    ContentEmbedding(
      values=[
        -0.0053034346,
        0.00989779,
        -0.0020667156,
        -0.05728348,
        0.0014906529,
        <... 3067 more items ...>,
      ]
    ),
    ContentEmbedding(
      values=[
        -0.022153739,
        -0.0012626448,
        -0.012944534,
        -0.067008846,
        -0.017269898,
        <... 3067 more items ...>,
      ]
    ),
  ],
  sdk_http_response=HttpResponse(
    headers=<dict len=10>
  )
)

In [22]:
# One embedding per input word, in the same order the words were sent.
[em_I, em_love, em_holiday] = em_keys.embeddings

In [26]:
# Each embedding exposes its vector as `.values`; display that vector's length.
len(em_I.values)

3072

In [51]:
# Dot product of each English key vector with the Korean query vector ("나는").
dot1, dot2, dot3 = (
    np.dot(key.values, em_Q) for key in (em_I, em_love, em_holiday)
)

In [53]:
# Report each query/key dot product, one labelled block per key word.
print(f'dot1 between "I" and "나는"\n: {dot1}\n')
print(f'dot2 "love" and "나는"\n: {dot2}\n')
print(f'dot3 "holiday" and "나는"\n: {dot3}\n')

dot1 between "I" and "나는"
: 0.6739418859925859

dot2 "love" and "나는"
: 0.6114853961970522

dot3 "holiday" and "나는"
: 0.5855955563786995



In [None]:
# Reminder of the naive attempt: np.dot does not work on strings, which is why the
# words were turned into embedding vectors before taking dot products.
# dot1 = np.dot("I", "나는")
# dot2 = np.dot("love", "나는")
# dot3 = np.dot("holiday", "나는")

### Dimension-reduction test

In [55]:
from google import genai
from google.genai import types

# Reduced embedding size to request; the default vectors obtained earlier have 3072 values.
OUTPUT_DIM = 512

# Embed the Korean query and the three English words in one request, asking for
# shorter vectors. `output_dimensionality` is the Python field name used in the
# Gemini embeddings documentation.
em_result = client.models.embed_content(
        model="gemini-embedding-001",
        contents=["나는", "I", "love", "holiday"],
        config=types.EmbedContentConfig(output_dimensionality=OUTPUT_DIM))

# Embeddings come back in input order: em1 is "나는", em2..em4 are "I", "love", "holiday".
em1, em2, em3, em4 = em_result.embeddings

In [56]:
# Raw (un-normalized) dot products between the reduced query vector and each key vector.
dot_1, dot_2, dot_3 = (
    np.dot(em1.values, key.values) for key in (em2, em3, em4)
)

print(dot_1, dot_2, dot_3, sep="\n")

0.12782337001717595
0.11915654657890391
0.11436667282861002


#### 작은 크기의 품질 보장

> 정규화된 임베딩은 크기가 아닌 벡터 방향을 비교하여 더 정확한 의미 유사성을 생성합니다. <br>768, 1536을 비롯한 다른 차원의 경우 다음과 같이 임베딩을 정규화해야 합니다.

In [58]:
import numpy as np
from numpy.linalg import norm

# Convert the reduced query embedding to an array and rescale it to unit L2 length.
em1_values_np = np.array(em1.values)
normed_em1 = em1_values_np / norm(em1_values_np)

print(f"Normed embedding length: {normed_em1.shape[0]}")
# The norm of the rescaled vector should be very close to 1.
print(f"Norm of normed embedding: {norm(normed_em1):.6f}")

Normed embedding length: 512
Norm of normed embedding: 1.000000


In [62]:
import numpy as np
from numpy.linalg import norm

def normalize(v):
  """Scale a vector to unit L2 length.

  Args:
    v: Sequence of numbers (for example an embedding's `values` list) or a
      numpy array.

  Returns:
    numpy.ndarray containing `v` divided by its L2 norm.

  Raises:
    ValueError: If the norm of `v` is zero (including an empty input), because
      the direction is undefined; plain division would only produce NaN values.
  """
  np_v = np.array(v)  # list to numpy array
  length = np.linalg.norm(np_v)
  if length == 0:
    raise ValueError("cannot normalize a vector whose norm is zero")

  return np_v / length

# Dot products of unit-length vectors: the normalized query against each normalized key.
sim1, sim2, sim3 = (
    np.dot(normed_em1, normalize(key.values)) for key in (em2, em3, em4)
)

print("  <After normailzation of the reduced vectors>\n")
for label, score in (("sim1", sim1), ("sim2", sim2), ("sim3", sim3)):
    print(f"{label}: {score}")

  <After normailzation of the reduced vectors>

sim1: 0.6160727695573414
sim2: 0.5639979865058795
sim3: 0.5348007357962068


> 이들은 성능이 임베딩 차원의 크기와 엄격하게 연결되어 있지 않으며, 낮은 차원이 높은 차원과 비슷한 점수를 달성할 수 있다고 함!