# gpt-2を使ってembeddingを作る簡単なコード

In [1]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2Model
import os
from tqdm.auto import tqdm
import numpy as np

In [None]:
anime_info = pd.read_csv('anime.csv')
train_df = pd.read_csv('train/train.csv')

test_df = pd.read_csv('test/test.csv')

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
text_columns = ['genres', 'japanese_name', 'type', 'episodes', 'aired', 'producers', 'licensors', 'studios', 'source', 'duration', 'rating', 'members', 'watching', 'completed', 'on_hold', 'dropped', 'plan_to_watch']

def get_gpt2_embeds(text):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(input_ids)
        embeddings = outputs[0][0][0]
    return embeddings.detach().cpu().numpy()

def concat_with_column_names(row): # 前に列名を付けます
    return ' '.join([f'{col}: {row[col]}' for col in text_columns])


ids_list = []
embeddings = []
for i, row in tqdm(anime_info.iterrows(),total=2000):
    ids_list.append(row['anime_id'])
    text = concat_with_column_names(row[text_columns])
    embedding = get_gpt2_embeds(text)
    embeddings.append(embedding)
np.save('#15/train/train_ids.npy',np.array(ids_list))
np.save('#15/train/train_embeddings.npy',np.array(embeddings))

これで、サイズは768のembeddingが得られます

このようにデータを読み取ることができます

In [2]:
embeds = np.load("train/train_embeddings.npy")
ids = np.load("train/train_ids.npy")
embeds_list = []
for l in range(embeds.shape[0]):
    embeds_list.append(embeds[l,:])
anime = pd.DataFrame(data={"anime_id": ids, "embed" : embeds_list})


しかし、embeddingは全く同じようです

ファインチューニングが必要です

In [6]:
sum(anime.embed[0] == anime.embed[1])

768