In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
from openai import OpenAI

# Take the API key from the environment (load_dotenv() above has already been
# called) and pass it to the client explicitly. Assigning to the OpenAI *class*
# attribute does not configure the instance the constructor builds; when the
# key is None the constructor falls back to the environment on its own.
openai_api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=openai_api_key)


import pandas as pd
import numpy as np
import time

In [2]:
# Default number of input strings sent per request by get_embedding_batch.
BATCH_SIZE = 1000
# Default value passed as the `dimensions` argument of the embeddings request.
# NOTE(review): "DIMENTION" is a misspelling of "DIMENSION"; the name is kept
# because other cells reference it.
DIMENTION_EMBEDDING = 100
# Default model name passed to client.embeddings.create.
EMBEDDING_MODEL = 'text-embedding-3-small'

In [3]:
# Embed text with the OpenAI embeddings API.
def get_embedding(input_text, dimention=DIMENTION_EMBEDDING, model=EMBEDDING_MODEL):
    """Embed one string, or a list of strings, in a single API request.

    Args:
        input_text: A single string or a list of strings.
        dimention: Value passed as the ``dimensions`` request argument.
        model: Name of the embedding model to use.

    Returns:
        The embedding of the only input when exactly one text was given
        (a bare string or a one-element list); otherwise a list with one
        embedding per item of the response. ``None`` if anything in the
        request or response handling raised.
    """
    if isinstance(input_text, str):
        input_text = [input_text]
    try:
        response = client.embeddings.create(
            model=model,
            input=input_text,
            dimensions=dimention
        )
        if len(input_text) == 1:
            # This value must be returned explicitly; without the `return`
            # the single-input case falls through and yields None.
            return response.data[0].embedding
        return [item.embedding for item in response.data]
    except Exception as e:
        # Best-effort by design: report the failure and signal it with None.
        print(f"Error: {e}")
        return None
    
def get_embedding_batch(input_text, dimention=DIMENTION_EMBEDDING, batch_size=BATCH_SIZE, model=EMBEDDING_MODEL):
    """Embed many texts, sending at most ``batch_size`` of them per request.

    Args:
        input_text: A single string or an iterable of strings.
        dimention: Value passed as the ``dimensions`` request argument.
        batch_size: Maximum number of texts per request; must be positive.
        model: Name of the embedding model to use.

    Returns:
        A list with one embedding per input text, accumulated batch by batch
        (an empty list when there are no inputs; no request is made then).
        ``None`` if any batch raised; embeddings from earlier batches are
        discarded in that case.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently return [].
        raise ValueError("batch_size must be a positive integer")
    if isinstance(input_text, str):
        texts = [input_text]
    else:
        # Materialise once so every batch handed to the client is a plain list.
        texts = list(input_text)
    embeddings = []
    for batch_start in range(0, len(texts), batch_size):
        batch = texts[batch_start:batch_start + batch_size]
        try:
            response = client.embeddings.create(
                model=model,
                input=batch,
                dimensions=dimention
            )
            embeddings.extend(item.embedding for item in response.data)
        except Exception as e:
            # Report the failure and signal it with None, so the caller never
            # receives a result that is shorter than its input.
            print(f"Error: {e}")
            return None
    return embeddings

In [5]:
# Current master list of idol group names.
idol_list = pd.read_csv('../../data/master/idolname_20241113.csv', encoding='utf-8')
# Previously saved embedding table. On a first run the file does not exist
# yet, so fall back to an empty table (the same fallback the later cell uses
# when it builds idol_data) instead of stopping the notebook here.
try:
    idol_list_before = pd.read_csv('../../data/master/idolname_embedding_data.csv', encoding='utf-8')
except FileNotFoundError:
    idol_list_before = pd.DataFrame([], columns=["idol_group_name"])


In [9]:
# Load the master name list and the embeddings saved by earlier runs.
idol_list = pd.read_csv('../../data/master/idolname_20241113.csv', encoding='utf-8')
try:
    idol_data = pd.read_csv('../../data/master/idolname_embedding_data.csv', encoding='utf-8')
    idol_list_old = list(idol_data["idol_group_name"])
except FileNotFoundError:
    # First run: nothing has been embedded yet.
    idol_data = pd.DataFrame([],columns=["idol_group_name"])
    idol_list_old = []

# Names that still need an embedding, in master-list order. The set gives a
# constant-time membership test instead of scanning the old list per name.
known_names = set(idol_list_old)
new_idol_list = [idol for idol in idol_list["idol_group_name"] if idol not in known_names]

# Fetch embeddings for the new names; get_embedding_batch splits the request
# into batches. It returns None if any batch failed.
embeddings = get_embedding_batch(new_idol_list)

# embeddings = []
# for text in new_idol_list:
#     embedding = get_embedding(text)
#     if embedding is not None:
#         embeddings.append(embedding)
#     else:
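#         embeddings.append([0]*1536)  # zero-vector placeholder sized to the embedding (NOTE: 1536 does not match DIMENTION_EMBEDDING = 100)
#     time.sleep(0.1)  # wait between requests to stay under the rate limit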

In [10]:
# get_embedding_batch returns None when a request failed. Writing in that
# state would store the new names with no vectors, and the next run would then
# treat those names as already embedded and skip them.
if embeddings is None or len(embeddings) != len(new_idol_list):
    raise RuntimeError(
        "Embedding request failed or returned an unexpected number of vectors; "
        "idolname_embedding_data.csv was left unchanged."
    )

# One row per new name: the name followed by its embedding components.
new_data = pd.concat(
    (pd.DataFrame(new_idol_list, columns=["idol_group_name"]), pd.DataFrame(embeddings)),
    axis=1,
)
# The embedding columns are labelled with integers, while the table loaded
# from CSV has string labels; convert so both frames share column names.
new_data.columns = [str(col) for col in new_data.columns]
pd.concat((idol_data, new_data)).to_csv('../../data/master/idolname_embedding_data.csv', encoding='utf-8', index=False)

In [21]:
def normalize_l2(X,dim):
    """Keep the first ``dim`` components of each row and scale rows to unit L2 norm.

    Args:
        X: 2-D array-like with one vector per row. An empty sequence is
            accepted and yields an empty 2-D array.
        dim: Number of leading components to keep from every row.

    Returns:
        A NumPy array with one row per input vector and at most ``dim``
        columns. Rows whose truncated norm is zero are returned unscaled.
    """
    X = np.array(X)
    if X.ndim == 1 and X.size == 0:
        # np.array([]) is 1-D, so the column slice below would raise
        # IndexError; represent "no vectors" as an empty 2-D array instead.
        return X.reshape(0, 0)
    X = X[:, :dim]
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    # Replace zero norms with 1 so the division is defined. Dividing such a
    # row by 1 leaves it unchanged, so no separate restore step is needed.
    norms[norms == 0] = 1
    return X / norms

# Normalize the fetched embeddings, keeping the configured number of
# dimensions rather than a separately hard-coded 100.
cut_dim = embeddings
norm_dim = normalize_l2(cut_dim, DIMENTION_EMBEDDING)

In [22]:
# Sanity check: (number of embedded names, number of kept dimensions).
norm_dim.shape

(6010, 100)

In [25]:
# norm_dim was computed from the embeddings requested for new_idol_list, so
# row i belongs to new_idol_list[i]; use that list as the index. (`texts` is
# not defined in the visible cells and fails on a fresh kernel.)
idolname_embedding_data = pd.DataFrame(norm_dim, index=new_idol_list)  # 100-dimensional vectors
idolname_embedding_data.index.name = "idol_group_name"
# NOTE(review): this replaces the whole master CSV with only the names
# embedded in this session; rows loaded earlier into idol_data are not
# included. Confirm that is intended before running after the merge cell.
idolname_embedding_data.to_csv('../../data/master/idolname_embedding_data.csv', encoding='utf-8')