In [1]:
import os

import pandas as pd
from openai import OpenAI
from tqdm import tqdm

In [5]:
# Build a one-column DataFrame holding the 53 top-level genre categories.

all_genre_list = [
    "Adventure",
    "Disaster",
    "Martial Arts",
    "Military Action",
    "Spy and Espionage",
    "Superhero",
    "Video game movies",
    "Action comedy",
    "Action crime",
    "Action drama",
    "Action-horror",
    "Action thriller",
    "Docudrama",
    "Melodrama",
    "Teen drama",
    "Medical drama",
    "Legal drama",
    "Religious drama",
    "Sports drama",
    "Political drama",
    "Anthropological drama",
    "Philosophical drama",
    "Contemporary and urban fantasy",
    "Epic Fantasy",
    "Fairy Tale",
    "Dark Fantasy",
    "Ghost",
    "Zombie",
    "Werewolf",
    "Vampire",
    "Monster",
    "Slasher",
    "Splatter and Gore",
    "Body Horror",
    "Folk Horror",
    "Occult",
    "Found Footage",
    "Outbreak",
    "Historical romance",
    "Regency romance",
    "Romantic drama",
    "Romantic comedy",
    "Chick Flick",
    "Fantasy romance",
    "Space Opera or epic sci-fi",
    "Utopia",
    "Dystopia",
    "Contemporary Sci-Fi",
    "Cyberpunk",
    "Steampunk",
    "Psychological thriller",
    "Mystery",
    "Film noir",
]

# One row per category name, stored in the `template_words` column.
data = pd.DataFrame(all_genre_list, columns=["template_words"])
data

Unnamed: 0,template_words
0,Adventure
1,Disaster
2,Martial Arts
3,Military Action
4,Spy and Espionage
5,Superhero
6,Video game movies
7,Action comedy
8,Action crime
9,Action drama


# Embedding

(단어->벡터화)

* reference
https://openai.com/blog/new-and-improved-embedding-model

In [7]:
# Set up for computing embeddings.

# Register `progress_apply` on pandas objects so applies can show a tqdm progress bar.
tqdm.pandas()


# Read the OpenAI API key from the OPENAI_API_KEY environment variable instead of
# writing it into the notebook as a string literal, so the secret is never saved or
# shared with the file. A missing variable fails immediately with a KeyError.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"]
)

# Embed text with OpenAI's text-embedding-ada-002 model (text -> vector of real numbers).

def get_embedding(text: str, model="text-embedding-ada-002") -> list[float]:
    """Return the embedding vector of ``text`` produced by ``model``.

    Sends one embeddings request through the module-level ``client`` with ``text``
    as the only input, and returns the ``embedding`` of the first item in the
    response's ``data``.
    """
    response = client.embeddings.create(model=model, input=[text])
    first_item = response.data[0]
    return first_item.embedding

# Embed every category name and store the vectors in a new column; `progress_apply`
# behaves like `apply` but reports progress through tqdm.
def _embed_category(word):
    return get_embedding(word, model='text-embedding-ada-002')

data['ada_embedding'] = data['template_words'].progress_apply(_embed_category)


100%|██████████| 53/53 [00:14<00:00,  3.63it/s]


In [8]:

# Adventure -> [0.013743538409471512, -0.013791930861771107, ... ]
# Embedding lets each word be handled as a vector of real numbers,
  # which in turn makes it possible to compute similarity between words.

data 

Unnamed: 0,template_words,ada_embedding
0,Adventure,"[0.013743538409471512, -0.013791930861771107, ..."
1,Disaster,"[-0.018168097361922264, -0.019965510815382004,..."
2,Martial Arts,"[-0.012883997522294521, 0.0027785184793174267,..."
3,Military Action,"[-0.03524612635374069, -0.01798408292233944, -..."
4,Spy and Espionage,"[-0.02443336509168148, -0.0044959532096982, 0...."
5,Superhero,"[-0.013685225509107113, -0.019091086462140083,..."
6,Video game movies,"[-0.003580923890694976, -0.03175680711865425, ..."
7,Action comedy,"[-0.0029455029871314764, -0.030893906950950623..."
8,Action crime,"[-0.011898484081029892, -0.022693533450365067,..."
9,Action drama,"[-0.009050272405147552, -0.0315282866358757, -..."


# Faiss
(유사도 계산 라이브러리 및 벡터 DB)

* 목적 : 임베딩 과정을 거친 단어는 실수 벡터로서 다룰 수 있게 된다.
Faiss 라이브러리를 통해 벡터로 바뀐 단어들 간의 유사도를 계산한다.


* reference
https://lsjsj92.tistory.com/605


In [9]:
# Load the template words that were generated with GPT beforehand; these are the
# metadata that get grouped under the 53 categories.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
template_pickle_path = r'E:\VOD추천 프로젝트 데이터\DATA\GPT_DATA\Embeddings\TemplateWords_Translated_EMB.pickle'
template = pd.read_pickle(template_pickle_path)
template

Unnamed: 0,template_words,ada_embedding,translated
0,frozenheart,"[-0.028733976185321808, -0.022937733680009842,...",프로즌하트
1,overseas,"[-0.00939271692186594, -0.019856899976730347, ...",해외
2,tourismmarket,"[-0.0048003788106143475, -0.011897329241037369...",투어리즘 마켓
3,internetstreaming,"[-0.022722924128174782, -0.021901289001107216,...",인터넷 스트리밍
4,meowmeow,"[-0.024937042966485023, -0.003026014193892479,...",야옹야옹
...,...,...,...
9161,tokyotribe,"[-0.031248198822140694, -0.005797793157398701,...",토쿄트라이브
9162,dorayaki,"[-0.01024614181369543, -0.008958597667515278, ...",도라야키
9163,bar,"[-0.0025195262860506773, -0.0167082492262125, ...",바
9164,deafness,"[-0.009098630398511887, 0.00014460322563536465...",청각 장애


In [10]:
import faiss
import numpy as np

# Collect the embedding column into a plain Python list ...
emb = template['ada_embedding'].to_list()
# ... and stack it into a 2-D float32 NumPy array (one row per template word).
emb_np = np.array(emb, dtype=np.float32)
# Inner-product index sized to the embedding dimension (1536 per the original note),
# wrapped in an IndexIDMap so every vector can be stored under an explicit ID.
index = faiss.IndexIDMap(faiss.IndexFlatIP(emb_np.shape[1]))

# Add the vectors, using each vector's row position in `template` as its ID.
# FAISS IDs are 64-bit integers; building them with np.array(range(...)) gives
# 32-bit integers on platforms where NumPy's default integer is 32-bit
# (e.g. Windows with NumPy 1.x), so the dtype is pinned explicitly.
index.add_with_ids(emb_np, np.arange(len(emb_np), dtype=np.int64))

In [11]:
def get_similar_words(ada_embedding: list[float], top_k: int) -> dict:
    """Return the template words most similar to an embedding vector.

    Searches the module-level FAISS ``index`` for the ``top_k`` nearest
    neighbours of ``ada_embedding`` and maps every hit back to its word in
    the module-level ``template`` DataFrame.

    Args:
        ada_embedding: Embedding vector (sequence of floats) to search with.
        top_k: Number of neighbours to request from the index.

    Returns:
        Dict mapping each matched ``template_words`` value to the score FAISS
        returned for it (the inner product, since ``index`` wraps an
        ``IndexFlatIP``). It can hold fewer than ``top_k`` entries when the
        index has fewer vectors than requested, or when the same word occurs
        in several rows of ``template`` (a later hit overwrites the earlier score).
    """
    # FAISS searches float32 matrices of shape (n_queries, dim); plain Python
    # floats would otherwise become a float64 array.
    query_vector = np.asarray(ada_embedding, dtype=np.float32).reshape(1, -1)

    scores, ids = index.search(query_vector, top_k)
    sim_words_dict = {}
    for row_pos, score in zip(ids[0], scores[0]):
        # FAISS pads the result with ID -1 when it finds fewer than top_k hits.
        if row_pos < 0:
            continue

        # IDs were assigned as row positions when the index was built, so look
        # the word up by position rather than by index label.
        word = template['template_words'].iloc[row_pos]
        sim_words_dict[word] = score

    return sim_words_dict

In [12]:
# For every category embedding, look up its 40 most similar template words
# (with their scores) and store the resulting dict in a new column.
def _top_40_similar(embedding):
    return get_similar_words(embedding, top_k=40)

data['sim_words'] = data['ada_embedding'].apply(_top_40_similar)


In [14]:
# Each of the 53 top-level categories now has its own set of similar words.

data

Unnamed: 0,template_words,ada_embedding,sim_words
0,Adventure,"[0.013743538409471512, -0.013791930861771107, ...","{'adventure': 0.9180647, 'adventures': 0.89708..."
1,Disaster,"[-0.018168097361922264, -0.019965510815382004,...","{'disaster': 0.97755146, 'worstdisaster': 0.91..."
2,Martial Arts,"[-0.012883997522294521, 0.0027785184793174267,...","{'martialarts': 0.9715826, 'martialartsmasters..."
3,Military Action,"[-0.03524612635374069, -0.01798408292233944, -...","{'militaryoperation': 0.9026204, 'militaryserv..."
4,Spy and Espionage,"[-0.02443336509168148, -0.0044959532096982, 0....","{'espionage': 0.93842745, 'espionagewar': 0.90..."
5,Superhero,"[-0.013685225509107113, -0.019091086462140083,...","{'superhero': 0.975294, 'superheroes': 0.94837..."
6,Video game movies,"[-0.003580923890694976, -0.03175680711865425, ...","{'movies': 0.87806183, 'movie': 0.8734518, 'fi..."
7,Action comedy,"[-0.0029455029871314764, -0.030893906950950623...","{'actioncomedy': 0.9370334, 'comedy': 0.918591..."
8,Action crime,"[-0.011898484081029892, -0.022693533450365067,...","{'action-packed': 0.8804488, 'actioncomedy': 0..."
9,Action drama,"[-0.009050272405147552, -0.0315282866358757, -...","{'dramaadventure': 0.9091109, 'drama': 0.89400..."


In [30]:
# Show the first category together with the GPT-generated words found similar to it.
first_row = data.loc[0]

print('53개의 기준 상위 카테고리 중 한개인 :', first_row['template_words'])

print('GPT가 생성한 단어들 중 Adventure와 비슷한 단어 : ', first_row['sim_words'])

53개의 기준 상위 카테고리 중 한개인 : Adventure
GPT가 생성한 단어들 중 Adventure와 비슷한 단어 :  {'adventure': 0.9180647, 'adventures': 0.8970845, 'summaryadventure': 0.89348567, 'travel': 0.88680845, 'fiction': 0.88300073, 'adventurer': 0.88083833, 'challenge': 0.88070995, 'adventurous': 0.88049877, 'adventurechildren': 0.87938356, 'dramaadventure': 0.8769151, 'maze': 0.8698836, 'adventurers': 0.8691629, 'mysteryadventure': 0.8671793, 'stories': 0.8657759, 'trip': 0.86537373, 'achievement': 0.8645917, 'earthrealm': 0.8635608, 'games': 0.8634163, 'mission': 0.8625673, 'revolution': 0.8625293, 'activity': 0.8614142, 'wizard': 0.86091614, 'story': 0.8605453, 'creation': 0.8604027, 'globaladventure': 0.86038303, 'movie': 0.85988396, 'peace': 0.85947704, 'legendaryadventure': 0.8594271, 'battle': 0.8594074, 'illusion': 0.85894614, 'missions': 0.85894054, 'movies': 0.8579598, 'experiment': 0.8576087, 'action': 0.85754776, 'sciencefiction': 0.8572767, 'comic': 0.85676706, 'career': 0.85675764, 'tour': 0.8566253, 'hone

In [15]:
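# NOTE: The code that scans the template_A/B/C columns and, wherever a value appears in
# data['sim_words'], replaces it with the matching data['template_words'] value could not
# be found, so that step is described here instead.

# Next, each value (word) in the template_A, template_B and template_C columns that appears
# in the sim_words column of the `data` DataFrame is replaced with the corresponding one of the 53 category words.
# The replaced values are stored in the TopGroup column of the `vod` DataFrame (shown below).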

In [27]:
# Load the VOD table from CSV and preview its first rows.
vod_csv_path = '../resource/Data_dir/VODs_1223_0003.csv'
vod = pd.read_csv(vod_csv_path)
vod.head()

Unnamed: 0,title,unique_id,ImgUrl,ct_cl,genre_of_ct_cl,SMRY,ACTR_DISP,disp_rtm,grade,country,release_year,template_A,template_B,template_C,templates,TopGroup,translated_front_view_template,director,prcs_templates
0,빅매치(9월 이벤트),uq_995,https://search.pstatic.net/common?type=o&size=...,영화,액션/어드벤쳐,"흥행 끝판왕 이정재의 2014년 선택. 액션은 특급, 질주는 본능, 유머는 옵션. ...","이정재, 신하균, 이성민, 보아, 김의성, 손호준",1:52,15세 관람가,대한민국,2014.0,"action, intense, thrilling, adrenaline-fueled,...","comedy, action, thriller, kidnapping, survival","action, race, match, kidnapping, fighter","action, intense, thrilling, adrenaline-fueled,...","Adventure, Martial Arts, Military Action, Acti...","액션, 강렬한, 스릴 넘치는",최호,"['action', 'intense', 'thrilling', 'adrenaline..."
1,비정규직 특수요원,uq_994,https://search.pstatic.net/common?type=o&size=...,영화,코미디,대한민국 최고기관들이 보이스피싱에 탈탈 털렸다. 만년 알바 인생 장영실은 35살의 ...,"강예원, 한채아, 남궁민, 조재윤",1:57,15세 관람가,대한민국,2017.0,"suspenseful, humorous, thought-provoking, intr...","voicephishing, budgettheft, temporaryemploymen...","budget, voicephishing, scam, temporaryemployment","suspenseful, humorous, thought-provoking, intr...","Action thriller, Psychological thriller, Actio...","긴장감 넘치는, 유머러스한, 생각을 자극하는",김덕수,"['suspenseful', 'humorous', 'thought-provoking..."
2,비밀의 집,uq_991,https://search.pstatic.net/common?type=f&size=...,TV드라마,기타,본 회차는 방송사의 사정으로 줄거리를 제공하지 않습니다.,"서하준, 이영은, 정헌, 강별, 이승연, 장항선, 윤복인, 박충선, 김난희, 박예린...",0:29,15세이상,,,"mysterious, suspenseful, intense, gripping, ve...","mystery, secrets, fight, lawyer, revenge","secrets, lawyer, missingmother, revenge, silve...","mysterious, suspenseful, intense, gripping, ve...","Legal drama, Occult, Psychological thriller, M...","신비한, 긴장감 넘치는, 강렬한",,"['mysterious', 'suspenseful', 'intense', 'grip..."
3,비밀의 여자,uq_990,https://search.pstatic.net/common?type=f&size=...,TV드라마,기타,"재벌가 유진의 집으로 시집을 온 겨울은 생일날에 미역국 하나 얻어먹지 못하고, 임신...","최윤영, 이채영, 이선호, 한기웅, 신고은, 이은형, 최재성, 방은희, 윤지숙, 임...",0:33,15세이상,,,"tragic, suspenseful, intense, vindictive, emot...","blindness, love, revenge, inheritance, justice","visionloss, husband, revenge, fiance, inheritance","tragic, suspenseful, intense, vindictive, emot...","Psychological thriller, Psychological thriller...","비극적, 긴장감 넘치는, 강렬한",,"['tragic', 'suspenseful', 'intense', 'vindicti..."
4,블리딩 스틸,uq_986,https://search.pstatic.net/common?type=o&size=...,영화,액션/어드벤쳐,"초대형 범죄 사건의 배후에 있던 누군가가 생체병기 최고 권위자의 목숨을 노리고, 그...","성룡, 칼란 멀베이, 테스 호브리치, 나지상, 오우양나나, 다미앙 가베이",1:41,15세 관람가,중국,2018.0,"mysterious, suspenseful, intense, action-packe...","author, bio-weapon, swat, crime, organization","author, bio-weapon, swat, crime, organization","mysterious, suspenseful, intense, action-packe...","Action comedy, Action crime, Action drama, Act...","신비한, 긴장감 넘치는, 강렬한",,"['mysterious', 'suspenseful', 'intense', 'acti..."


# DeepFM 데이터셋 만들기

(전처리 코드가 어디 있는지 찾지 못했습니다.)

vod 데이터프레임에서 TopGroup 칼럼의 값들을 원-핫 인코딩하여
아래의 DeepFM_Dataset을 만든다.

In [16]:
# Load the prepared DeepFM dataset from CSV and display it.
deepfm_csv_path = r'E:\VOD추천 프로젝트 데이터\DATA\DeepFM_DataSet\MakeModelDataSet_user_08.csv'
DeepFM_Dataset = pd.read_csv(deepfm_csv_path)
DeepFM_Dataset

Unnamed: 0,subsr,content_id,liked,ct_cl,genre_of_ct_cl,Adventure,Disaster,Martial Arts,Military Action,Spy and Espionage,...,Fantasy romance,Space Opera or epic sci-fi,Utopia,Dystopia,Contemporary Sci-Fi,Cyberpunk,Steampunk,Psychological thriller,Mystery,Film noir
0,65941000,11,0,TV 시사/교양,기타,0,0,0,0,0,...,0,1,0,1,1,1,1,0,0,0
1,65941000,15,0,TV 연예/오락,기타,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,65941000,17,0,TV 연예/오락,기타,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,65941000,19,0,TV 연예/오락,기타,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,65941000,21,0,TV 연예/오락,기타,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1402870,65830000,11192,0,영화,액션/어드벤쳐,1,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
1402871,65830000,11226,0,영화,드라마,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
1402872,65830000,11231,0,영화,공포/스릴러,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,0
1402873,65830000,11232,0,영화,공포/스릴러,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,0
