# Sentiment Classification Using BERT 

情緒分類使用BERT神經網路

    負面:0 正面:1 

    負面:0 正面:1 中立:2

    負面:0 正面:1 中立:2 無情緒:3



# Load model and tokenizer

In [1]:
from transformers import AutoTokenizer, pipeline,BertForSequenceClassification
import torch

In [2]:
# Setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [3]:
#!pip install transformers

In [4]:
# You can download the best trained model from huggingface
# https://huggingface.co/clhuang
 
# (1) Load model from huggingface
#model = AutoModelForSequenceClassification.from_pretrained("clhuang/albert-sentiment")
# model = BertForSequenceClassification.from_pretrained("clhuang/albert-sentiment", num_labels=2) # specify number of labels

# (2) or Load model from local
best_model = "best-model-v1"  #
# model = AutoModelForSequenceClassification.from_pretrained("./my-best-model").to(device)
model = BertForSequenceClassification.from_pretrained(best_model, num_labels=2).to(device)  # specify number of labels


In [5]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-1

In [6]:
# tokenizer
tokenizer = AutoTokenizer.from_pretrained(best_model) # or from local
# tokenizer = AutoTokenizer.from_pretrained("clhuang/albert-sentiment") #from huggingface
# tokenizer = BertTokenizer.from_pretrained("bert-base-chinese") # or from hugginfacd bert-base-chinese

In [7]:
len(tokenizer)

21128

In [8]:
# tokenize can encode text to input_ids and decode input_ids to text
tokenizer.get_vocab()

{'萁': 5841,
 '##廊': 15500,
 '##峭': 15347,
 '槛': 3546,
 'want': 12733,
 '##昴': 16280,
 '##新': 16230,
 '幟': 2392,
 '##趣': 19694,
 '##諒': 19372,
 '鵑': 7864,
 '##呻': 14517,
 'hit': 10295,
 'と': 556,
 'dit': 13231,
 '##苓': 18782,
 '峻': 2295,
 '##哉': 14564,
 '##49': 9500,
 '##妒': 15028,
 '##润': 16940,
 '観': 6219,
 '##thing': 12012,
 '##女': 15014,
 '燁': 4233,
 '##启': 14480,
 '刨': 1163,
 '雙': 7427,
 '苯': 5738,
 '輾': 6747,
 '##躯': 19775,
 '##um': 8545,
 '乒': 728,
 '詩': 6276,
 '##菇': 18880,
 '鎬': 7119,
 '##gt': 12429,
 '##蟒': 19154,
 '##橇': 16634,
 '孳': 2117,
 '##竹': 18058,
 '煜': 4207,
 '##驶': 20781,
 '捎': 2933,
 '拭': 2887,
 '廷': 2455,
 '770': 13215,
 '杨': 3342,
 '串': 706,
 '繋': 5250,
 '説': 6304,
 '##ima': 13028,
 '蝈': 6067,
 '量': 7030,
 '钒': 7156,
 '##铝': 20256,
 '婷': 2051,
 '##巽': 15409,
 '##貅': 19561,
 '因': 1728,
 'foundation': 12099,
 '486': 12459,
 '飯': 7613,
 '##js': 11124,
 '##倘': 14008,
 'andy': 11036,
 'eur': 11991,
 'lt': 10413,
 '嬉': 2080,
 'amana': 12406,
 '##cts': 12175,
 '##ny': 86

In [9]:
text="我喜歡"
# prepare our text into tokenized sequence
inputs = tokenizer(text)
inputs

{'input_ids': [101, 2769, 1599, 3631, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [10]:
tokenizer.decode(inputs['input_ids'])

'[CLS] 我 喜 歡 [SEP]'

# Predict or generate result using pipeline

    可能的輸出結果如下:

    [{'label': 'LABEL_1', 'score': 0.9885562062263489}]
    [{'label': 'LABEL_0', 'score': 0.9052111506462097}]

    因此需要用到if去判端label的值，才能決定score是正面還是負面。
    若為:LABEL_1 就是正面的score
    若為:LABEL_0 就是負面的score

In [11]:
sentiment_classify = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [12]:
new_text = '速度很快，昨天下單，今天上午就到啦，看著挺不錯。'
sentiment_classify(new_text)

[{'label': 'LABEL_1', 'score': 0.9873760938644409}]

In [13]:
outputs = sentiment_classify(new_text)
outputs[0]['score']

0.9873760938644409

In [14]:
type(outputs[0]['score'])

float

In [15]:
# Positive probability
round(outputs[0]['score'],2)

0.99

In [16]:
# Negative probability
round(1 - round(outputs[0]['score'],2),2)

0.01

In [17]:
new_text = '不喜歡這款產品'
sentiment_classify(new_text)

[{'label': 'LABEL_0', 'score': 0.8333981037139893}]

# Define prediction function using pipeline

In [18]:
def get_sentiment_proba(text):
    max_length = 300 # 最多字數 若超出模型訓練時的字數，以模型最大字數為依據 
    #max_length = 512 # 最多字數 若超出模型訓練時的字數，以模型最大字數為依據 
    outputs = sentiment_classify(text, padding=True, max_length=max_length, truncation=True)
    if outputs[0]['label']=='LABEL_1':
        # Get the positive score
        prob_positive = round(outputs[0]['score'],2)
        prob_negatitive = round(1 - prob_positive, 2)
    else:    
        # Calculate the negative score
        prob_negatitive = round(outputs[0]['score'],2)
        prob_positive = round(1 - prob_negatitive, 2)

    response = {'Negative':prob_negatitive, 'Positive': prob_positive}
    return response

In [19]:
new_text = '速度很快，昨天下單，今天上午就到啦，看著挺不錯。'
get_sentiment_proba( new_text )

{'Negative': 0.01, 'Positive': 0.99}

In [20]:
new_text = '已經買了這種蘋果好多次了，寶寶喜歡上了這款蘋果，一直選擇這款'
get_sentiment_proba( new_text )

{'Negative': 0.02, 'Positive': 0.98}

In [21]:
new_text = '不喜歡這款產品'

get_sentiment_proba( new_text )

{'Negative': 0.83, 'Positive': 0.17}

In [22]:
new_text = '非常不喜歡這款產品'

get_sentiment_proba( new_text )

{'Negative': 0.18, 'Positive': 0.82}

# Define prediction function using model or model.generate()

In [23]:
## Pediction
target_names=['Negative','Positive']
max_length = 200 # 最多字數 若超出模型訓練時的字數，以模型最大字數為依據 
def get_sentiment_proba_from_model(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)

    response = {'Negative': round(float(probs[0, 0]), 2), 'Positive': round(float(probs[0, 1]), 2)}
    # executing argmax function to get the candidate label
    #return probs.argmax()
    return response

In [24]:
new_text = '不喜歡這款產品'

get_sentiment_proba_from_model( new_text )

{'Negative': 0.83, 'Positive': 0.17}

# Pediction模型使用

## label <--> id

In [25]:
# Map labels to integers
categories=['負面','正面']

In [26]:

label_to_id = { cate : i for i, cate in enumerate(categories)}

In [27]:
label_to_id

{'負面': 0, '正面': 1}

In [28]:
id_to_label = { i : cate for i, cate in enumerate(categories)}

In [29]:
id_to_label

{0: '負面', 1: '正面'}

In [30]:

# Function to make predictions
def predict_sentiment(text, model, tokenizer, device):
    max_length = 512 # 最多字數 若超出模型訓練時的字數，以模型最大字數為依據 
    # Tokenize the input text
    inputs = tokenizer(
        text,
        max_length=max_length,
        truncation=True,
        return_tensors="pt"
    ).to(device)
    
    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Extract logits and apply softmax to get probabilities
    # logits = outputs.logits
    logits = outputs["logits"]  # 取出 logits
    
    
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    
    # Get the predicted class (0: negative, 1: positive)
    predicted_class = torch.argmax(probabilities, dim=-1).item()
    
    # Get the class name using id_to_label
    predicted_label = id_to_label[predicted_class]
    
    # Get the confidence score
    confidence = probabilities[0][predicted_class].item()
    
    return {
        "text": text,
        "sentiment": predicted_label,
        "confidence": round(confidence,2),
        "probabilities": {
            id_to_label[i]: round(prob.item(),2) for i, prob in enumerate(probabilities[0])
        }
    }


In [31]:
text = "今天天氣真好，我很開心"
predict_sentiment(text, model, tokenizer, device)

{'text': '今天天氣真好，我很開心',
 'sentiment': '正面',
 'confidence': 0.97,
 'probabilities': {'負面': 0.03, '正面': 0.97}}

In [32]:
text = "這個產品品質差，服務更糟糕"
predict_sentiment(text, model, tokenizer, device)

{'text': '這個產品品質差，服務更糟糕',
 'sentiment': '負面',
 'confidence': 0.98,
 'probabilities': {'負面': 0.98, '正面': 0.02}}

In [33]:
text = "這家餐廳的食物美味，環境也很舒適"
predict_sentiment(text, model, tokenizer, device)

{'text': '這家餐廳的食物美味，環境也很舒適',
 'sentiment': '正面',
 'confidence': 0.99,
 'probabilities': {'負面': 0.01, '正面': 0.99}}