In [1]:
import csv
import os
import urllib.request

import pandas as pd
import sentencepiece as spm

In [2]:
# Download the IMDb review CSV into the local data/ folder.
# urlretrieve does not create missing parent directories, so make sure data/
# exists first; without this a fresh checkout fails with FileNotFoundError.
os.makedirs("data", exist_ok=True)
urllib.request.urlretrieve("https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv", filename="data/IMDb_Reviews.csv")

('data/IMDb_Reviews.csv', <http.client.HTTPMessage at 0x7fea8857a190>)

In [4]:
# Load the downloaded IMDb reviews into a DataFrame and preview the first rows.
train_df = pd.read_csv("data/IMDb_Reviews.csv")
train_df.head()

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [6]:
# Show the (rows, columns) dimensions of the IMDb review table.
train_df.shape

(50000, 2)

In [7]:
# Dump every review into a single UTF-8 text file, with the reviews separated
# by newline characters; this file is the training input for SentencePiece.
with open('data/imdb_review.txt', mode='w', encoding='utf-8') as corpus_file:
    corpus_file.write('\n'.join(train_df['review'].tolist()))

In [9]:
# SentencePieceTrainer options (reference notes).
# Written as comments rather than a bare string literal so that the cell does
# not echo the string's repr as its output.
#
#   input                : file to train on
#   model_prefix         : name of the model to be created
#   vocab_size           : size of the vocabulary
#   model_type           : model to use (unigram (default), bpe, char, word)
#   max_sentence_length  : maximum length of a sentence
#   pad_id, pad_piece    : pad token id and value
#   unk_id, unk_piece    : unknown token id and value
#   bos_id, bos_piece    : begin-of-sentence token id and value
#   eos_id, eos_piece    : end-of-sequence token id and value
#   user_defined_symbols : user-defined tokens

In [8]:
# Train a BPE SentencePiece model (vocabulary of 5000 pieces) on the IMDb
# corpus file. The trainer writes model/imdb.model and model/imdb.vocab but
# does not create the model/ directory itself, so create it up front;
# otherwise a clean run cannot save its output files.
os.makedirs('model', exist_ok=True)
spm.SentencePieceTrainer.Train('--input=data/imdb_review.txt --model_prefix=model/imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')

In [13]:
# Read the generated vocabulary file as a headerless, tab-separated table.
# csv.QUOTE_NONE disables quote handling, so quote characters in the file are
# read literally (presumably because some pieces are themselves quote marks).
vocab_list = pd.read_csv('model/imdb.vocab', sep='\t', header=None, quoting=csv.QUOTE_NONE)

In [14]:
# Preview the first 10 vocabulary rows: column 0 holds the piece and column 1 a
# numeric value (NOTE(review): looks like the piece's score/rank — confirm).
vocab_list.head(10)

Unnamed: 0,0,1
0,<unk>,0
1,<s>,0
2,</s>,0
3,▁t,0
4,▁a,-1
5,he,-2
6,in,-3
7,▁the,-4
8,▁s,-5
9,re,-6


In [15]:
# Check the vocabulary table's dimensions; the row count should match vocab_size.
vocab_list.shape

(5000, 2)

In [16]:
# Create a processor and load the trained IMDb model into it.
# NOTE: despite its name, vocab_file points at the .model file, not the .vocab listing.
sp = spm.SentencePieceProcessor()
vocab_file = 'model/imdb.model'
sp.load(vocab_file)

True

In [17]:
# Sample English sentences used to demonstrate the trained tokenizer.
lines = [
  "I didn't at all think of it this way.",
  "I have waited a long time for someone to film"
]

# For each sentence show the raw text, its subword pieces, and the matching
# piece ids, followed by a blank separator line.
for sentence in lines:
    pieces = sp.encode_as_pieces(sentence)
    ids = sp.encode_as_ids(sentence)
    print(sentence, pieces, ids, sep='\n', end='\n\n')

I didn't at all think of it this way.
['▁I', '▁didn', "'", 't', '▁at', '▁all', '▁think', '▁of', '▁it', '▁this', '▁way', '.']
[41, 623, 4950, 4926, 138, 169, 378, 30, 58, 73, 413, 4945]

I have waited a long time for someone to film
['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91]



In [18]:
# Number of pieces in the loaded model's vocabulary.
sp.GetPieceSize()

5000

In [19]:
# Look up the subword piece stored at vocabulary id 430.
sp.IdToPiece(430)

'▁character'

In [20]:
# Reverse lookup: map a subword piece back to its vocabulary id.
sp.PieceToId('▁character')

430

In [21]:
# Decode a list of piece ids back into the original sentence text.
sp.DecodeIds([41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91])

'I have waited a long time for someone to film'

In [22]:
# sp.encode with out_type=str yields the subword pieces, and with out_type=int
# the piece ids; print both forms for the same sentence.
sample_sentence = 'I have waited a long time for someone to film'
for result_type in (str, int):
    print(sp.encode(sample_sentence, out_type=result_type))

['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91]


In [23]:
# Tokenizing Naver movie reviews
# These modules are already imported in the notebook's first cell; repeating
# them here is redundant but harmless.
import pandas as pd
import sentencepiece as spm
import urllib.request
import csv

In [25]:
# Load the Naver movie-review ratings table and preview the first rows.
# Nothing else in the notebook fetches data/ratings.txt, so download it when it
# is missing; an existing local copy is left untouched.
# NOTE(review): the URL is the public NSMC repository (e9t/nsmc) — confirm it is
# the intended source for this file.
if not os.path.exists('data/ratings.txt'):
    os.makedirs('data', exist_ok=True)
    urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="data/ratings.txt")
naver_df = pd.read_table('data/ratings.txt')
naver_df.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [26]:
# Show the (rows, columns) dimensions of the Naver review table.
naver_df.shape

(200000, 3)

In [27]:
# Count the missing values in each column.
naver_df.isnull().sum()

id          0
document    8
label       0
dtype: int64

In [28]:
# Drop every row that has a missing value in any column, then report the
# remaining per-column null counts and the new table dimensions.
naver_df = naver_df.dropna(how='any')
remaining_nulls = naver_df.isnull().sum()
print(remaining_nulls, naver_df.shape, sep='\n')

id          0
document    0
label       0
dtype: int64
(199992, 3)


In [29]:
# Dump every review text into a single UTF-8 file, with the reviews separated
# by newline characters; this file is the training input for SentencePiece.
with open('data/naver_review.txt', mode='w', encoding='utf8') as corpus_file:
    corpus_file.write('\n'.join(naver_df['document'].tolist()))

In [30]:
# Train a BPE SentencePiece model (vocabulary of 5000 pieces) on the Naver
# review corpus file. The trainer writes model/naver.model and
# model/naver.vocab but does not create the model/ directory itself, so create
# it up front; otherwise a clean run cannot save its output files.
os.makedirs('model', exist_ok=True)
spm.SentencePieceTrainer.Train('--input=data/naver_review.txt --model_prefix=model/naver --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')

In [33]:
# Read the Naver vocabulary file as a headerless, tab-separated table with quote
# handling disabled (csv.QUOTE_NONE), then preview the first rows.
# Note: this rebinds vocab_list, replacing the IMDb vocabulary loaded earlier.
vocab_list = pd.read_csv('model/naver.vocab', sep='\t', header=None, quoting=csv.QUOTE_NONE)
vocab_list.head()

Unnamed: 0,0,1
0,<unk>,0
1,<s>,0
2,</s>,0
3,..,0
4,영화,-1


In [34]:
# Check the vocabulary table's dimensions; the row count should match vocab_size.
vocab_list.shape

(5000, 2)

In [35]:
# Create a fresh processor and load the trained Naver model into it.
# Note: this rebinds sp, so the IMDb tokenizer is no longer reachable by that name.
# NOTE: despite its name, vocab_file points at the .model file, not the .vocab listing.
sp = spm.SentencePieceProcessor()
vocab_file = 'model/naver.model'
sp.load(vocab_file)

True

In [36]:
# Sample Korean sentences used to demonstrate the trained tokenizer.
lines = ["뭐 이딴 것도 영화냐", "진짜 최고의 영화입니다."]

# For each sentence show the raw text, its subword pieces, and the matching
# piece ids, followed by a blank separator line.
for sentence in lines:
    pieces = sp.encode_as_pieces(sentence)
    ids = sp.encode_as_ids(sentence)
    print(sentence, pieces, ids, sep='\n', end='\n\n')

뭐 이딴 것도 영화냐
['▁뭐', '▁이딴', '▁것도', '▁영화냐']
[132, 966, 1296, 2590]

진짜 최고의 영화입니다.
['▁진짜', '▁최고의', '▁영화입니다', '.']
[54, 200, 821, 3276]



In [37]:
# Number of pieces in the loaded Naver model's vocabulary.
sp.GetPieceSize()

5000

In [38]:
# Look up the subword piece stored at vocabulary id 4.
sp.IdToPiece(4)

'영화'

In [39]:
# Reverse lookup: map a subword piece back to its vocabulary id.
sp.PieceToId('영화')

4

In [40]:
# Decode a list of piece ids back into text.
sp.DecodeIds([54, 200, 821, 85])

'진짜 최고의 영화입니다 ᄏᄏ'

In [41]:
# Decode a list of subword pieces back into text; each leading '▁' shows up as
# a space between words in the decoded string.
sp.DecodePieces(['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ'])

'진짜 최고의 영화입니다 ᄏᄏ'

In [42]:
# sp.encode with out_type=str yields the subword pieces, and with out_type=int
# the piece ids; print both forms for the same sentence.
# NOTE(review): the trailing characters come back as a different jamo code
# point in the pieces, presumably from SentencePiece's default normalization —
# confirm.
sample_sentence = '진짜 최고의 영화입니다 ㅋㅋ'
for result_type in (str, int):
    print(sp.encode(sample_sentence, out_type=result_type))

['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']
[54, 200, 821, 85]
