In [None]:
"""
BERT
이미 학습되어져 있는 모델을 다운로드 해서 사용 가능 -> 이후 파인튜닝(추가학습)후 사용가능

파인튜닝 : 기존의 버트 모델의 변화를 최소화 하면서 분류기로 학습하는것
별도의 레이블이 없는 버트를 레이블이 있는 모델을 이용해 학습하는것

-BERT는 별도의 레이블이 필요없다 -> 다음단어가 정답이므로 텍스트 레이블로 충분히 학습된다

마스크드 언어 모델 : 입력 텍스트의 단어 집합의 15%의 단어를 랜덤으로 마스킹 -> 이후 버트가 마스킹된 단어 예측하도록 함

-BERT는 단어보다 더 작은 단위로 쪼개는 서브워드 토크나이저 사용
자주 등장하는 단어는 단어 집합에 추가, 자주 등장 안하는 단어는 더 작은 단위인 서브워드에 추가

BERT는 3개의 임베딩 층 사용
-WordPiece Embedding: 실질적인 입력이 되는 워드 임베딩 ,단어 집합 크기로 30,522개
-Position Embedding: 위치정보 학습하기 위한 임베딩, 문장 최대 길이인 512개
-Segment Embedding: 두개의 문장을 구분하기 위한 임베딩, 문장 최대 개수인 2개
"""

In [None]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp311-cp311-win_amd64.whl (977 kB)
     ---------------------------------------- 0.0/977.5 kB ? eta -:--:--
     ------------------ ------------------ 481.3/977.5 kB 14.7 MB/s eta 0:00:01
     ------------------------------------  972.8/977.5 kB 20.5 MB/s eta 0:00:01
     ------------------------------------- 977.5/977.5 kB 10.3 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Note: you may need to restart the kernel to use updated packages.


In [None]:
import sentencepiece as spm
import pandas as pd
import urllib.request
import csv

In [None]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv", filename="IMDb_Reviews.csv")

('IMDb_Reviews.csv', <http.client.HTTPMessage at 0x2511296ead0>)

In [None]:
train_df = pd.read_csv('IMDb_Reviews.csv')
train_df['review']


0        My family and I normally do not watch local mo...
1        Believe it or not, this was at one time the wo...
2        After some internet surfing, I found the "Home...
3        One of the most unheralded great works of anim...
4        It was the Sixties, and anyone with long hair ...
                               ...                        
49995    the people who came up with this are SICK AND ...
49996    The script is so so laughable... this in turn,...
49997    "So there's this bride, you see, and she gets ...
49998    Your mind will not be satisfied by this nobud...
49999    The chaser's war on everything is a weekly sho...
Name: review, Length: 50000, dtype: object

In [None]:
with open('imdb_review.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(train_df['review']))

In [None]:
spm.SentencePieceTrainer.Train('--input=imdb_review.txt --model_prefix=imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')

In [None]:
pd.read_csv('imdb.vocab',sep='\t', header = None, quoting=csv.QUOTE_NONE)

Unnamed: 0,0,1
0,<unk>,0
1,<s>,0
2,</s>,0
3,▁t,0
4,▁a,-1
...,...,...
4995,8,-4992
4996,4,-4993
4997,7,-4994
4998,&,-4995


In [None]:
vocab_list = pd.read_csv('imdb.vocab', sep='\t', header=None, quoting=csv.QUOTE_NONE)
vocab_list.sample(10)

Unnamed: 0,0,1
1805,▁looked,-1802
733,▁app,-730
3188,▁bought,-3185
4964,D,-4961
4473,▁twenty,-4470
3638,▁dad,-3635
2341,▁buy,-2338
3620,budget,-3617
1664,▁foot,-1661
3781,▁****,-3778


In [None]:
sp = spm.SentencePieceProcessor()
vocab_file = "imdb.model"
sp.load(vocab_file)

True

In [None]:
lines = [
  "I didn't at all think of it this way.",
  "I have waited a long time for someone to film"
]

In [None]:
sp.encode_as_pieces(lines[0]) #문장 ->서브 워드로 변환
#sp.encode_as_ids(lines[0]) #문장 -> 정수 코드로 변환

['▁I',
 '▁didn',
 "'",
 't',
 '▁at',
 '▁all',
 '▁think',
 '▁of',
 '▁it',
 '▁this',
 '▁way',
 '.']

In [None]:
sp.PieceToId('▁I')

41

In [None]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

('ratings.txt', <http.client.HTTPMessage at 0x2511c6fb8d0>)

In [None]:
naver_df = pd.read_table('ratings.txt')
naver_df[:5]

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [None]:
naver_df = naver_df.dropna(how = 'any') # Null 값이 존재하는 행 제거
print(naver_df.isnull().values.any()) # Null 값이 존재하는지 확인

False


In [None]:
print('리뷰 개수 :',len(naver_df)) # 리뷰 개수 출력

리뷰 개수 : 199992


In [None]:
with open('naver_review.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(naver_df['document']))

In [None]:
spm.SentencePieceTrainer.Train('--input=naver_review.txt --model_prefix=naver --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')

In [None]:
vocab_list = pd.read_csv('naver.vocab', sep='\t', header=None, quoting=csv.QUOTE_NONE)
vocab_list[:10]

Unnamed: 0,0,1
0,<unk>,0
1,<s>,0
2,</s>,0
3,..,0
4,영화,-1
5,▁영화,-2
6,▁이,-3
7,▁아,-4
8,...,-5
9,ᄏᄏ,-6


In [None]:
sp = spm.SentencePieceProcessor()
vocab_file = "naver.model"
sp.load(vocab_file)

True

In [None]:
lines = [
  "뭐 이딴 것도 영화냐.",
  "진짜 최고의 영화입니다 ㅋㅋ",
]

In [None]:
print(sp.encode_as_pieces(lines[1]))

['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']


In [None]:
sp.DecodePieces(['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ'])

'진짜 최고의 영화입니다 ᄏᄏ'

In [None]:
sp.DecodeIds([54, 200, 821, 85])


'진짜 원 산~~'

In [None]:
print(sp.encode('진짜 최고의 영화입니다 ㅋㅋ', out_type=str))
print(sp.encode('진짜 최고의 영화입니다 ㅋㅋ', out_type=int))

['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']
[54, 204, 825, 121]


In [None]:
import pandas as pd
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # Bert-base의 토크나이저

100%|███████████████████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 429901.81B/s]


In [None]:
tokenizer.vocab_size

30522

In [None]:
tokenizer.vocab
tokenizer.vocab['do']

2079

In [None]:
tokenizer.vocab['love']

2293

In [None]:
tokenizer.vocab['loves']

7459

In [None]:
tokenizer.vocab['embeddings']

KeyError: 'embeddings'

In [None]:
#embeddings 단어가 OOV에 해당하는 단어지만 단어를 여러개의 서브워드로 나눠서 처리해 OOV문제가 생기지 않음
tokenizer.vocab['em']
tokenizer.vocab['##bed']
tokenizer.vocab['##ding']
tokenizer.vocab['##s']

2015

In [None]:
tokenizer.tokenize('Here is the sentence I want embeddings for.')

['here',
 'is',
 'the',
 'sentence',
 'i',
 'want',
 'em',
 '##bed',
 '##ding',
 '##s',
 'for',
 '.']

In [None]:
with open('vocabulary.txt', 'w', encoding='utf8') as f:
    for token in tokenizer.vocab.keys():
        f.write(token + '\n')

In [None]:
df = pd.read_fwf('vocabulary.txt', header=None)
df

Unnamed: 0,0
0,[PAD]
1,[unused0]
2,[unused1]
3,[unused2]
4,[unused3]
...,...
30517,##．
30518,##／
30519,##：
30520,##？


In [None]:

"""
[PAD] - 0
[UNK] - 100
[CLS] - 101
[SEP] - 102
[MASK] - 103
"""

In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.1-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
import pandas as pd
from transformers import BertTokenizer

In [3]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
#embeddings는 OOV가 아닌 작은 단어 단위로 나옴
result = tokenizer.tokenize('Here is the sentence I want embeddings for.')
print(result)

['here', 'is', 'the', 'sentence', 'i', 'want', 'em', '##bed', '##ding', '##s', 'for', '.']


In [5]:
print(tokenizer.vocab['here'])

2182


In [6]:
from transformers import TFBertForMaskedLM

In [7]:
model = TFBertForMaskedLM.from_pretrained('bert-large-uncased')
#BERT를 마스크드 언어 모델 구조로 읽어들인다

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [8]:
from transformers import AutoTokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
#bert-large-uncased 모델이 학습될 당시에 사용된 토크나이저가 읽어진다.

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
inputs = tokenizer.encode_plus('Soccer is a really fun [MASK].', add_special_tokens=True, return_tensors='tf')

In [11]:
inputs

{'input_ids': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=
array([[ 101, 4715, 2003, 1037, 2428, 4569,  103, 1012,  102]],
      dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [12]:
inputs['input_ids']

<tf.Tensor: shape=(1, 9), dtype=int32, numpy=
array([[ 101, 4715, 2003, 1037, 2428, 4569,  103, 1012,  102]],
      dtype=int32)>

In [13]:
inputs['token_type_ids']

<tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>

In [14]:
inputs['attention_mask']

<tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>

In [15]:
from transformers import FillMaskPipeline

In [16]:
pip = FillMaskPipeline(model=model, tokenizer=tokenizer)

In [17]:
pip('Soccer is a really fun [MASK].')

[{'score': 0.7621113061904907,
  'token': 4368,
  'token_str': 'sport',
  'sequence': 'soccer is a really fun sport.'},
 {'score': 0.20342056453227997,
  'token': 2208,
  'token_str': 'game',
  'sequence': 'soccer is a really fun game.'},
 {'score': 0.012208598665893078,
  'token': 2518,
  'token_str': 'thing',
  'sequence': 'soccer is a really fun thing.'},
 {'score': 0.001863026642240584,
  'token': 4023,
  'token_str': 'activity',
  'sequence': 'soccer is a really fun activity.'},
 {'score': 0.001335486420430243,
  'token': 2492,
  'token_str': 'field',
  'sequence': 'soccer is a really fun field.'}]

In [18]:
pip('I went to [MASK] this morning.')

[{'score': 0.3573073446750641,
  'token': 2147,
  'token_str': 'work',
  'sequence': 'i went to work this morning.'},
 {'score': 0.23304398357868195,
  'token': 2793,
  'token_str': 'bed',
  'sequence': 'i went to bed this morning.'},
 {'score': 0.1284506916999817,
  'token': 2082,
  'token_str': 'school',
  'sequence': 'i went to school this morning.'},
 {'score': 0.06230578571557999,
  'token': 3637,
  'token_str': 'sleep',
  'sequence': 'i went to sleep this morning.'},
 {'score': 0.046952586621046066,
  'token': 2465,
  'token_str': 'class',
  'sequence': 'i went to class this morning.'}]

In [19]:
# 한국어 bert

In [20]:
model = TFBertForMaskedLM.from_pretrained('klue/bert-base', from_pt=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForMaskedLM: ['bert.embeddings.position_ids', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing TFBertForMaskedLM from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForMaskedLM from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [21]:
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

Downloading (…)okenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [24]:
inputs = tokenizer('축구는 정말 재미있는 [MASK]다.', return_tensors='tf')

In [25]:
inputs

{'input_ids': <tf.Tensor: shape=(1, 10), dtype=int32, numpy=
array([[   2, 4713, 2259, 3944, 6001, 2259,    4,  809,   18,    3]],
      dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 10), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 10), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [26]:
pip = FillMaskPipeline(model=model, tokenizer=tokenizer)

In [27]:
pip('축구는 정말 재미있는 [MASK]다.')

[{'score': 0.8963507413864136,
  'token': 4559,
  'token_str': '스포츠',
  'sequence': '축구는 정말 재미있는 스포츠 다.'},
 {'score': 0.025957651436328888,
  'token': 568,
  'token_str': '거',
  'sequence': '축구는 정말 재미있는 거 다.'},
 {'score': 0.01003396138548851,
  'token': 3682,
  'token_str': '경기',
  'sequence': '축구는 정말 재미있는 경기 다.'},
 {'score': 0.007924409583210945,
  'token': 4713,
  'token_str': '축구',
  'sequence': '축구는 정말 재미있는 축구 다.'},
 {'score': 0.007844234816730022,
  'token': 5845,
  'token_str': '놀이',
  'sequence': '축구는 정말 재미있는 놀이 다.'}]

In [29]:
pip('나는 방금 [MASK]를 먹었다.')

[{'score': 0.05272819101810455,
  'token': 10058,
  'token_str': '디저트',
  'sequence': '나는 방금 디저트 를 먹었다.'},
 {'score': 0.05258025601506233,
  'token': 14995,
  'token_str': '햄버거',
  'sequence': '나는 방금 햄버거 를 먹었다.'},
 {'score': 0.05081625655293465,
  'token': 13309,
  'token_str': '샌드위치',
  'sequence': '나는 방금 샌드위치 를 먹었다.'},
 {'score': 0.03278220072388649,
  'token': 11130,
  'token_str': '스테이크',
  'sequence': '나는 방금 스테이크 를 먹었다.'},
 {'score': 0.02911238744854927,
  'token': 8395,
  'token_str': '피자',
  'sequence': '나는 방금 피자 를 먹었다.'}]

In [30]:
from transformers import TFBertForNextSentencePrediction

In [31]:
from transformers import AutoTokenizer

In [32]:
model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForNextSentencePrediction.

All the weights of TFBertForNextSentencePrediction were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForNextSentencePrediction for predictions without further training.


In [33]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [34]:
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "pizza is eaten with the use of a knife and fork. In casual settings, however, it is cut into wedges to be eaten while held in the hand."

In [35]:
encoding = tokenizer(prompt, next_sentence, return_tensors='tf')

In [36]:
encoding

{'input_ids': <tf.Tensor: shape=(1, 58), dtype=int32, numpy=
array([[  101,  1999,  3304,  1010, 10733,  2366,  1999,  5337, 10906,
         1010,  2107,  2004,  2012,  1037,  4825,  1010,  2003,  3591,
         4895, 14540,  6610,  2094,  1012,   102, 10733,  2003,  8828,
         2007,  1996,  2224,  1997,  1037,  5442,  1998,  9292,  1012,
         1999, 10017, 10906,  1010,  2174,  1010,  2009,  2003,  3013,
         2046, 17632,  2015,  2000,  2022,  8828,  2096,  2218,  1999,
         1996,  2192,  1012,   102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 58), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 58), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [62]:
tokenizer.decode(encoding['input_ids'][0])

'[CLS] in italy, pizza served in formal settings, such as at a restaurant, is presented unsliced. [SEP] pizza is eaten with the use of a knife and fork. in casual settings, however, it is cut into wedges to be eaten while held in the hand. [SEP]'

In [40]:
encoding['input_ids']

<tf.Tensor: shape=(1, 58), dtype=int32, numpy=
array([[  101,  1999,  3304,  1010, 10733,  2366,  1999,  5337, 10906,
         1010,  2107,  2004,  2012,  1037,  4825,  1010,  2003,  3591,
         4895, 14540,  6610,  2094,  1012,   102, 10733,  2003,  8828,
         2007,  1996,  2224,  1997,  1037,  5442,  1998,  9292,  1012,
         1999, 10017, 10906,  1010,  2174,  1010,  2009,  2003,  3013,
         2046, 17632,  2015,  2000,  2022,  8828,  2096,  2218,  1999,
         1996,  2192,  1012,   102]], dtype=int32)>

In [41]:
encoding['token_type_ids']

<tf.Tensor: shape=(1, 58), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>

In [58]:
logits=model(encoding['input_ids'], token_type_ids = encoding['token_type_ids'])[0]

In [59]:
import tensorflow as tf

In [60]:
soft=tf.keras.layers.Softmax()
res=soft(logits)

In [61]:
res

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[9.9999714e-01, 2.8381855e-06]], dtype=float32)>

In [57]:
tf.math.argmax(res,axis=-1).numpy()[0]
# 0이면 이어지는 문장
# 1이면 이어지지 않는 문장

0