In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# SST-2 training split: tab-separated with no header row, so pandas labels
# the two columns 0 and 1.
df = pd.read_csv(
    "https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv",
    sep="\t",
    header=None,
)

In [3]:
# Preview the first five rows to check the layout of the loaded frame.
df.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [4]:
# Number of rows in the full training file.
df[0].shape # column 0 holds only the review texts

(6920,)

In [5]:
# Work on the first 2,000 rows only (positional slice of the full frame).
df1 = df.iloc[:2000]

In [6]:
# Class balance of the label column (column 1) within the 2,000-row subset.
df1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

In [7]:
# Load the pretrained tokenizer that belongs to the 'bert-base-uncased'
# checkpoint (the model itself is loaded in a later cell).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
def encode(sents, tokenizer, max_length=30):
    """Tokenize sentences into fixed-length BERT inputs.

    Args:
        sents: Iterable of raw text strings, one per document.
        tokenizer: Hugging Face tokenizer exposing ``encode_plus``.
        max_length: Length every sequence is brought to, special tokens
            included. Shorter inputs are padded, longer ones truncated.
            Defaults to 30, the value previously hard-coded here.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of ``tf.int32`` tensors with
        one row of ``max_length`` entries per input sentence.
    """
    input_ids = []  # token IDs of each document
    attention_masks = []  # attention mask of each document
    for text in sents:
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length,  # unify every document to the same length
            add_special_tokens=True,  # add the [CLS] and [SEP] tokens
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`.
            padding='max_length',
            # Request truncation explicitly instead of relying on the
            # tokenizer's warning-emitting fallback.
            truncation=True,
            return_attention_mask=True,
        )
        input_ids.append(tokenized_text['input_ids'])
        attention_masks.append(tokenized_text['attention_mask'])

    return tf.convert_to_tensor(input_ids, dtype=tf.int32), tf.convert_to_tensor(attention_masks, dtype=tf.int32)

In [9]:
# Tokenize the review texts (column 0); the result is an
# (input_ids, attention_masks) pair of tensors.
tokenized_sents = encode(df1[0], tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
# Raw text of the first review (row label 0, column 0).
df1.loc[0, 0]

'a stirring , funny and finally transporting re imagining of beauty and the beast and 1930s horror films'

In [10]:
# Inspect the token IDs of the first movie review
tokenized_sents[0][0]

<tf.Tensor: shape=(30,), dtype=int32, numpy=
array([  101,  1037, 18385,  1010,  6057,  1998,  2633, 18276,  2128,
       16603,  1997,  5053,  1998,  1996,  6841,  1998,  5687,  5469,
        3152,   102,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0])>

In [11]:
# Inspect the attention mask of the first movie review
tokenized_sents[1][0]

<tf.Tensor: shape=(30,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])>

In [12]:
# Load the pretrained BERT encoder. output_hidden_states=True is requested
# because the per-layer hidden states are read from the model output below.
model = TFBertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [13]:
# Mark every layer as non-trainable: BERT is used below purely as a fixed
# feature extractor whose outputs feed a separate classifier.
for layer in model.layers:
    layer.trainable=False

In [14]:
# Run all tokenized reviews through BERT in one forward pass. The attention
# mask tells the model which positions are real tokens (1) and which are
# padding (0).
# NOTE(review): the whole subset is processed at once; consider batching if
# memory becomes a problem.
outputs = model(tokenized_sents[0], attention_mask = tokenized_sents[1])

In [15]:
# Number of items in the model output; the item at index 2 is read as the
# hidden states in the next step.
len(outputs)

3

In [16]:
# Per-layer hidden states, returned because the model was loaded with
# output_hidden_states=True.
hidden_states = outputs[2]
# One entry per layer output (presumably the embedding output plus each
# encoder layer — confirm against the Hugging Face model documentation).
len(hidden_states)

13

In [17]:
# Shape of the final entry of the hidden states.
hidden_states[-1].shape

TensorShape([2000, 30, 768])

In [18]:
# Keep only the vector at sequence position 0 of the last hidden state (the
# slot of the [CLS] token that encode() adds first) as a NumPy array.
features = hidden_states[-1][:, 0].numpy()

In [19]:
# Sanity check: one row per review, one column per hidden dimension.
features.shape

(2000, 768)

In [20]:
# Display the extracted feature matrix (NumPy abbreviates large arrays).
features

array([[-0.55664796, -0.33129135, -0.22280511, ..., -0.22786085,
         0.6319185 ,  0.24306686],
       [-0.2878921 , -0.14285454, -0.06857879, ..., -0.31690514,
         0.18455297,  0.31989914],
       [-0.16481759,  0.4045446 , -0.28772342, ..., -0.08056694,
         0.69961447,  0.6175583 ],
       ...,
       [-0.7284259 , -0.09083428, -0.12269002, ...,  0.11295944,
         0.38278916,  0.7714774 ],
       [-0.03087676,  0.2099947 , -0.13776463, ..., -0.30588874,
         0.37153795,  0.54379797],
       [ 0.15942754, -0.05995868, -0.02413659, ..., -0.2717196 ,
         0.40312117,  0.32209644]], dtype=float32)

In [21]:
# Labels (column 1) for the same rows of df1 that the features were built from.
labels = df1[1]

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# repeatable across runs.
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
)

In [23]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression trained on the extracted BERT features.
# 'saga' is a stochastic solver, so random_state is pinned to make the fit
# (and therefore the reported accuracy) reproducible across runs.
lr2 = LogisticRegression(C=1, penalty='l2', solver='saga', max_iter=1000, random_state=0)
lr2.fit(train_features, train_labels)
# Predicted labels for the held-out split.
pred_labels = lr2.predict(test_features)

In [24]:
from sklearn.metrics import accuracy_score
# Fraction of held-out reviews whose predicted label matches the true label.
accuracy_score(test_labels, pred_labels)

0.825