In [1]:
import pandas, nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import RegexpTokenizer

In [2]:
filename = 'C:/Users/pilot/바탕 화면/2022 재 휘/2022 소모임 LEAP/Doc2Vec/hotel-reviews.csv'
df = pandas.read_csv(filename, sep=',')

In [3]:
df.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [5]:
# RegexpTokenizer를 이용해 호텔 리뷰 문장을 토큰화

def nltk_tokenizer(_wd):
    return RegexpTokenizer(r'\w+').tokenize(_wd.lower())

df['Token_Description'] = df['Description'].apply(nltk_tokenizer)

In [6]:
df.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,Token_Description
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy,"[the, room, was, kind, of, clean, but, had, a,..."
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy,"[i, stayed, at, the, crown, plaza, april, apri..."
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy,"[i, booked, this, hotel, through, hotwire, at,..."
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy,"[stayed, here, with, husband, and, sons, on, t..."
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy,"[my, girlfriends, and, i, stayed, here, to, ce..."


In [7]:
df[['User_ID', 'Token_Description']].head()

Unnamed: 0,User_ID,Token_Description
0,id10326,"[the, room, was, kind, of, clean, but, had, a,..."
1,id10327,"[i, stayed, at, the, crown, plaza, april, apri..."
2,id10328,"[i, booked, this, hotel, through, hotwire, at,..."
3,id10329,"[stayed, here, with, husband, and, sons, on, t..."
4,id10330,"[my, girlfriends, and, i, stayed, here, to, ce..."


In [8]:
doc_df = df[['User_ID', 'Token_Description']].values.tolist()

In [9]:
print(doc_df[0])

['id10326', ['the', 'room', 'was', 'kind', 'of', 'clean', 'but', 'had', 'a', 'very', 'strong', 'smell', 'of', 'dogs', 'generally', 'below', 'average', 'but', 'ok', 'for', 'a', 'overnight', 'stay', 'if', 'you', 're', 'not', 'too', 'fussy', 'would', 'consider', 'staying', 'again', 'if', 'the', 'price', 'was', 'right', 'breakfast', 'was', 'free', 'and', 'just', 'about', 'better', 'than', 'nothing']]


In [10]:
# words = 토큰 list
# tags = 문서 ID
tagged_data = [TaggedDocument(words=_d, tags=[uid]) for uid, _d in doc_df]

In [11]:
# Training

max_epochs = 10
model = Doc2Vec(window=10, vector_size=150, alpha=0.025, min_alpha=0.025, min_count=2, dm=1, negative=5, seed=9999)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print(f"iterartion {epoch}")
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    
    model.alpha -= 0.002
    
    model.min_alpha = model.alpha
"""
window: 모델 학습할 때 앞뒤로 보는 단어의 수
size: 벡터 차원의 크기
alpha: learning rate
min_count: 학습에 사용할 최소 단어 빈도 수
dm: 학습방법 1 = PV-DM, 0 = PV-DBOW
negative: Complexity Reduction 방법, negative sampling
max_epochs: 최대 학습 횟수
"""

iterartion 0
iterartion 1
iterartion 2
iterartion 3
iterartion 4
iterartion 5
iterartion 6
iterartion 7
iterartion 8
iterartion 9


'\nwindow: 모델 학습할 때 앞뒤로 보는 단어의 수\nsize: 벡터 차원의 크기\nalpha: learning rate\nmin_count: 학습에 사용할 최소 단어 빈도 수\ndm: 학습방법 1 = PV-DM, 0 = PV-DBOW\nnegative: Complexity Reduction 방법, negative sampling\nmax_epochs: 최대 학습 횟수\n'

In [12]:
model.random.seed(9999)
"""
특정 문서와 유사한 문서를 찾기 위해서는 2단계를 거친다.
1. 문서의 vector화
2. 변환된 vector와 가장 가까운 vector 추출
* infer_vector 사용시 seed값을 주지 않으면 random한값이 seed로 사용되어 값이 계속 변경된다.
* 학습되지 않은 단어를 사용한 문서도 결과가 나온다.
"""

'\n특정 문서와 유사한 문서를 찾기 위해서는 2단계를 거친다.\n1. 문서의 vector화\n2. 변환된 vector와 가장 가까운 vector 추출\n* infer_vector 사용시 seed값을 주지 않으면 random한값이 seed로 사용되어 값이 계속 변경된다.\n* 학습되지 않은 단어를 사용한 문서도 결과가 나온다.\n'

In [13]:
doc_list = 'Rooms were clean'.split(' ')

inferred_vector = model.infer_vector(doc_list)
return_docs = model.docvecs.most_similar(positive=[inferred_vector],topn=5)
for rd in return_docs:
    for des in df[df['User_ID'] == rd[0]]['Description']:
        print(rd[0], rd[1], des)

id32805 0.8009138107299805 Staff was extremely polite and helpful. Rooms were clean.
id33500 0.8007739186286926 Great clean rooms very spacious, clean friendly staff!
id39837 0.7866826057434082 Great place to stay!
id31567 0.7840836048126221 Perfect location and great value.
id22388 0.7829453349113464 The employees were very accommodating and friendly


  return_docs = model.docvecs.most_similar(positive=[inferred_vector],topn=5)
