# 영어 ELECTRA를_이용한_감정분석기_학습 (Pytorch + HuggingFace)
# Colab 에서 개발 및 실행
# Github : https://github.com/MunJinSeo/MyProject/
<br>

## References 1
- 김희규님의 "HuggingFace KoElectra로 NSMC 감성분석 Fine-tuning해보기"<br>
https://heegyukim.medium.com/huggingface-koelectra%EB%A1%9C-nsmc-%EA%B0%90%EC%84%B1%EB%B6%84%EB%A5%98%EB%AA%A8%EB%8D%B8%ED%95%99%EC%8A%B5%ED%95%98%EA%B8%B0-1a23a0c704af

- 이지원님의 Github : nlp_emotion_classification <br>
https://github.com/jiwonny/nlp_emotion_classification

## 사용모델 ELECTRA
- 한국어 : 박장원님의 KoElectra-small 사용<br>
https://monologg.kr/2020/05/02/koelectra-part1/<br>
https://github.com/monologg/KoELECTRA
- 영어 : 구글 ELECTRA - small 사용<br>
https://huggingface.co/google/electra-small-discriminator<br>
https://github.com/google-research/electra

## Dataset
- 한국어 : 네이버 영화 리뷰 데이터셋<br>
https://github.com/e9t/nsmc
- 영어 : Freinds <br>
http://doraemon.iis.sinica.edu.tw/emotionlines/

## References 2
- https://colab.research.google.com/drive/1tIf0Ugdqg4qT7gcxia3tL7und64Rv1dP
- https://blog.naver.com/horajjan/221739630055
<br>@@<br>
- https://github.com/YongWookHa/kor-text-preprocess
- https://github.com/likejazz/korean-sentence-splitter
- https://github.com/lovit/soynlp
<br>@@<br>
- https://huggingface.co/transformers/training.html
- https://tutorials.pytorch.kr/beginner/data_loading_tutorial.html
- https://tutorials.pytorch.kr/beginner/blitz/cifar10_tutorial.html
- https://wikidocs.net/44249


## 기타
반드시 GPU로 실행 - Colab무료환경에서 1epoch 당 약 15~20분 소요

# 필요 lib 설치

In [87]:
# lib 설치
!pip install transformers
!pip install torch



In [88]:
!pip install kss
!pip install konlpy
!pip install sentencepiece
!pip install soynlp



In [89]:
# (미사용) Colab TPU 사용을 위해 설치
#--!pip install torch_xla
#--#@param ["1.5" , "20200325", "nightly"]
#VERSION = "1.7"
#!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
#!python pytorch-xla-env-setup.py --version $VERSION

# Friends 데이터셋 처리

In [90]:
#Friends 데이터셋 처리
#zip파일로 Colab에 직접 올린후 압축해제
!unzip friends_json.zip


Archive:  friends_json.zip
replace friends_dev.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [91]:
!head friends_train.json
!head friends_test.json
!head friends_dev.json
# !head en_data.csv #별도 kaggle에서 받아서 사용 https://www.kaggle.com/c/english-sa-competition-bdc101

[
    [
        {
            "speaker": "Chandler",
            "utterance": "also I was the point person on my company\u0092s transition from the KL-5 to GR-6 system.",
            "emotion": "neutral",
            "annotation": "4100000"
        },
        {
            "speaker": "The Interviewer",
[
    [
        {
            "speaker": "Mark",
            "utterance": "Why do all you\u0092re coffee mugs have numbers on the bottom?",
            "emotion": "surprise",
            "annotation": "2000030"
        },
        {
            "speaker": "Rachel",
[
    [
        {
            "speaker": "Phoebe",
            "utterance": "Oh my God, he\u0092s lost it. He\u0092s totally lost it.",
            "emotion": "non-neutral",
            "annotation": "0002120"
        },
        {
            "speaker": "Monica",


# 필요 모듈 import

In [92]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm

In [93]:
# (미사용) TPU 사용을 위해 필요
#import torch_xla
#import torch_xla.core.xla_model as xm

In [202]:
import json
import numpy as np

In [95]:
#import kss
import re
#from soynlp.normalizer import *

In [96]:
# GPU or CPU
if torch.cuda.is_available():
  device = torch.device("cuda")
  print('There are %d GPU(s) available.' % torch.cuda.device_count())
  print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
  device = torch.device("cpu")
  print('No GPU available, using the CPU instead.')

# (미사용) TPU
#device = xm.xla_device()

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


# 데이터셋 처리 (Dataset Calss / 전처리)
(train data와 과제 sample data형식이 다르고, encoding이 다르기 때문에 분리 처리)

In [287]:
class Friends_Dataset(Dataset):
  # Json파일을 DataFrame으로 변환 함수
  def jsonToDataFrame(self, file_name):
    with open(file_name, encoding = 'utf-8', mode = 'r') as file:
      json_array = json.load(file)

    result = pd.DataFrame.from_dict(json_array[0])
   
    is_first = True
    for array in json_array:
      if is_first:
        is_first = False
        continue
      
      temp_df = pd.DataFrame.from_dict(array)
      result = result.append(temp_df, ignore_index = True)

    return result

  def __init__(self, csv_file, ftype):
    # train data와 sample data 각각 처리

    # 초기 전처리 1, 아래쪽 전처리 2 로 clean_text() 펑션 분리함 (학습 할때 line별 전처리 후 사용됨)
    # --- 전처리 1 start ------------------------
    self.emotion_dic = {'neutral':0,'surprise':1,'fear':2,'non-neutral':3,'joy':4,'sadness':5,'anger':6,'disgust':7}

    if ftype == 'train':
      #1 speaker, 2 utterance(표현문장), 3 emotion(감정분류값), 4 annotation
      self.dataset = self.jsonToDataFrame(file_name=csv_file) #.dropna(axis=0)

    elif ftype == 'sample':
      #1 id, 2 i_dialog, 3 i_utterance, 4 speaker, 5 utterance
      #self.dataset = pd.read_csv(csv_file, sep=',', encoding = 'unicode_escape')
      self.dataset = pd.read_csv(csv_file, sep=',', encoding = 'utf-8')
      # 뒤쪽 컬럼 label 값이 없으므로 기본값으로 추가함
      #0 idx, 1 id, 2 i_dialog, 3 i_utterance, 4 speaker, 5 utterance(표현문장),6 emotion, 7 Predicted 
      self.dataset.insert(5,'emotion',0) # 기본값 셋팅
      self.dataset.insert(6,'Predicted',0) # 기본값 셋팅
      #print(self.dataset)
    else:
      self.dataset = pd.read_csv(csv_file, sep=',')


    # for idx, document in self.dataset.iterrows():
    #   print(idx, document)
    self.sub1 = re.compile('[^ .?!/@$%~|0-9|a-zㅣA-Z]+') # 영어와 띄어쓰기, 특수기호 일부를 제외한 모든 글자제거
    self.sub2 = re.compile('[\s]+')  # white space duplicate
    self.sub3 = re.compile('[\.]+')  # full stop duplicate

    #중복되는 문장 제거
    if ftype == 'sample':
      print('과제 제출용은 중복 문장 제거하면 안됨')
    else:
      # 중복제거: utterance가 텍스트 내용의 title명이다
      self.dataset.drop_duplicates(subset=['utterance'], inplace=True)

    # dataset 확인
    print(self.dataset.describe())
    print(self.dataset)

    # tokenizer
    self.tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
    
    # --- 전처리 1 end ------------------------
  
  def __len__(self):
    return len(self.dataset)
  
  def clean_text(self, txt):
    # --- 전처리 2 start ----------------------
    cleaned = self.sub1.sub('', txt.strip())  # .strip()은 문장의 앞뒤 공백제거함
    cleaned = self.sub2.sub(' ', cleaned)
    cleaned = self.sub3.sub('.', cleaned)
    #cleaned = emoticon_normalize(cleaned, num_repeats=3) # 감정 반복 단순화
    #cleaned = repeat_normalize(cleaned, num_repeats=2) # 중복 글자 단순화
    #cleaned = only_text(cleaned) # text만 추출
    #cleaned = only_hangle(cleaned) # 한글만 추출
    #cleaned = only_hangle_number(cleaned) # 한글/숫자만 추출

    #if len(ssStr) > 1:
    cleaned = "[CLS] " + cleaned + " [SEP]"
    # --- 전처리 2 end ------------------------
    return cleaned

  
  def __getitem__(self, idx):
    # 행번호별 컬럼 지정하여 할당: 0 idx, 1 speaker, 2 utterance, 3 emotion, 4 annotation
    #row = self.dataset.iloc[idx, 1:4].values  # idx번째 행의 첫번째 컬럼 0을 제외하고 1~3컬럼
    #print(row[0], row[1])
    #text = self.clean_text( txt= str(row[0]) ) # 전처리 2 : clean_text()
    #y = row[1] #숫자만 가능함 

    text = self.dataset['utterance'][idx]
    tmp_y = self.dataset['emotion'][idx]
    # emotion 값을 숫자로 치환
    y = 0
    if tmp_y in self.emotion_dic.keys() :
      y = self.emotion_dic[tmp_y]
    else :
      y = 0
    #print("[%s][%d]" % (tmp_y,y) )

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=256,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [288]:
train_dataset = Friends_Dataset("friends_train.json","train")
train2_dataset = Friends_Dataset("friends_test.json","train")

#많은 데이터를 학습하기 위해 합쳐서 진행
train_dataset.dataset = train_dataset.dataset.append(train2_dataset.dataset, ignore_index = True)
#print("train_dataset.dataset==ALL=\n", train_dataset.dataset)
#print("train_dataset.dataset.head()==\n", train_dataset.dataset.head())

test_dataset = Friends_Dataset("friends_dev.json","train")

sample_dataset = Friends_Dataset("en_data.csv","sample")
#print("sample_dataset.dataset.head()==\n", sample_dataset.dataset.head())

       speaker utterance  emotion annotation
count     9291      9291     9291       9291
unique     259      9291        8        412
top       Joey       No.  neutral    5000000
freq      1395         1     4116       1173
               speaker  ... annotation
0             Chandler  ...    4100000
1      The Interviewer  ...    5000000
2             Chandler  ...    5000000
3      The Interviewer  ...    5000000
4             Chandler  ...    2000030
...                ...  ...        ...
10555             Joey  ...    2000003
10556         Chandler  ...    3000011
10557             Ross  ...    2100011
10558             Joey  ...    0000050
10560             Joey  ...    1200020

[9291 rows x 4 columns]
       speaker  ... annotation
count     2505  ...       2505
unique      98  ...        311
top       Joey  ...    5000000
freq       388  ...        328

[4 rows x 4 columns]
       speaker  ... annotation
0         Mark  ...    2000030
1       Rachel  ...    2100011
2       Rach

In [None]:
# tmpstr = 'Come on.  Hello?  I?m sorry you have the wrong number.   Okay, I?ll call you later dad. I love you.'
# print( train_dataset.clean_text( txt = tmpstr) )

In [189]:
# test_emotion = 'joy'
# if test_emotion in train_dataset.emotion_dic.keys() :
#   print( train_dataset.emotion_dic[test_emotion] )
# else :
#   print(0)

4


In [None]:
# print( train_dataset.__getitem__(11790) )

In [None]:
# print( sample_dataset.__getitem__(10) )

# 모델 생성 (Create Model)

In [192]:
model = ElectraForSequenceClassification.from_pretrained("google/electra-small-discriminator", num_labels=8).to(device)
#model = ElectraForSequenceClassification.from_pretrained('google/electra-small-generator', num_labels=8)
#model.cuda()

# 한번 실행해보기
#text, attention_mask, y = train_dataset[0]
#model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

In [103]:
try:
  model.load_state_dict(torch.load("model.pt"))
except:
  print("error - model.load_state_dict(torch.load('model.pt'))")
else:
  print("success - model.load_state_dict(torch.load('model.pt'))")

error - model.load_state_dict(torch.load('model.pt'))


In [None]:
# 모델 레이어 보기
model

# 학습(Learn) 하기

In [291]:
# google/electra-small-discriminator
epochs = 4
batch_size = 64

# google/electra-base-discriminator
#epochs = 4
#batch_size = 32

In [292]:
optimizer = AdamW(model.parameters(), 
                  lr=1e-5, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                  )
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [293]:
losses = []
accuracies = []

# 그래디언트 초기화
# model.zero_grad()

for i in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  # 훈련모드로 변경
  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward() # Backward 수행으로 그래디언트 계산
    #xm.optimizer_step(optimizer, barrier=True)  # TPU 사용시 코드
    optimizer.step() # 그래디언트를 통해 가중치 파라미터 업데이트
    #model.zero_grad() # 그래디언트 초기화

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    #print("y_batch====\n",y_batch)
    #print("predicted====\n",predicted)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

HBox(children=(FloatProgress(value=0.0, max=185.0), HTML(value='')))



Batch Loss: 113.64571034908295 Accuracy: tensor(0.6167, device='cuda:0')

Train Loss: 209.18016183376312 Accuracy: tensor(0.6183, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=185.0), HTML(value='')))

Batch Loss: 110.89646399021149 Accuracy: tensor(0.6248, device='cuda:0')

Train Loss: 204.48053300380707 Accuracy: tensor(0.6256, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=185.0), HTML(value='')))

Batch Loss: 107.55588895082474 Accuracy: tensor(0.6366, device='cuda:0')

Train Loss: 199.7199546098709 Accuracy: tensor(0.6367, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=185.0), HTML(value='')))

Batch Loss: 105.25061351060867 Accuracy: tensor(0.6391, device='cuda:0')

Train Loss: 195.58061522245407 Accuracy: tensor(0.6398, device='cuda:0')


In [246]:
losses, accuracies

([232.46743470430374, 225.74113112688065, 220.520762860775, 214.1363224387169],
 [tensor(0.5878, device='cuda:0'),
  tensor(0.5933, device='cuda:0'),
  tensor(0.5985, device='cuda:0'),
  tensor(0.6098, device='cuda:0')])

# 테스트 데이터셋 정확도 확인하기

In [294]:
# 평가모드로 변경
model.eval()

#test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

HBox(children=(FloatProgress(value=0.0, max=369.0), HTML(value='')))




Accuracy: tensor(0.6896, device='cuda:0')


# 모델 저장하기

In [295]:
# 모델 저장하기
torch.save(model.state_dict(), "model.pt")

# 과제용 데이터 예측 및 맵핑

In [296]:
#과제용 데이터 예측
# 데이터 로딩
batchSize = 16
sample_loader = DataLoader(sample_dataset, batch_size=batchSize, shuffle=False)

sample_result = sample_dataset.dataset.copy(deep=True)
print(sample_result)

#평가모드로 변경
model.eval()

idx_s = 0
idx_e = 0

emotion_strDic = {0:'neutral',1:'surprise',2:'fear',3:'non-neutral',4:'joy',5:'sadness',6:'anger',7:'disgust'}

for input_ids_batch, attention_masks_batch, y_batch in tqdm(sample_loader):
  #y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)

  rsList = list(map(int, predicted)) # 결과를 한번에 저장하기 위해 LIST로 변환 처리
  global idx_s, idx_e
  idx_e += len(rsList) #해당 배치구간내에 index 끝값 계산
  #print("index==", idx_s, idx_e)
  
  sample_result['emotion'][idx_s : idx_e] = rsList
  rsPred = [emotion_strDic[xx] for xx in rsList] # 숫자를 감정문자로 치환
  sample_result['Predicted'][idx_s : idx_e] = rsPred  #배치구간을 한번에 업데이트
  idx_s += len(rsList) #해당 배치구간내에 index 시작값은 윗줄 처리 후 증가

print(sample_result)


        id  i_dialog  ...  emotion Predicted
0        0         0  ...        0         0
1        1         0  ...        0         0
2        2         0  ...        0         0
3        3         0  ...        0         0
4        4         0  ...        0         0
...    ...       ...  ...      ...       ...
1618  1618       150  ...        0         0
1619  1619       150  ...        0         0
1620  1620       150  ...        0         0
1621  1621       150  ...        0         0
1622  1622       150  ...        0         0

[1623 rows x 7 columns]


HBox(children=(FloatProgress(value=0.0, max=102.0), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



        id  i_dialog  ...  emotion    Predicted
0        0         0  ...        0      neutral
1        1         0  ...        4          joy
2        2         0  ...        3  non-neutral
3        3         0  ...        0      neutral
4        4         0  ...        3  non-neutral
...    ...       ...  ...      ...          ...
1618  1618       150  ...        0      neutral
1619  1619       150  ...        4          joy
1620  1620       150  ...        0      neutral
1621  1621       150  ...        0      neutral
1622  1622       150  ...        3  non-neutral

[1623 rows x 7 columns]


In [None]:
#torch.cuda.empty_cache() #GPU 캐쉬 데이터 삭제

# 결과 파일 저장

In [297]:
# 주어진 데이터의 결과를 파일로 저장
#sample_csv = sample_result.to_csv('sample.csv')
sample_csv = sample_result.to_csv('sample.csv',sep=',',na_rep='NaN', columns=['id','Predicted'],index=False)

In [298]:
# 파일을 PC로 다운로드 하기
from google.colab import files
files.download('sample.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>