# Predict google map review dataset 
## model
- kcbert
- fine-tuned with naver shopping review dataset (200,000개)
- train 5 epochs
- 0.97 accuracy

## dataset
- google map review of tourist places in Daejeon, Korea 

In [15]:
import torch
from torch import nn, Tensor
from torch.optim import Optimizer
from torch.utils.data import DataLoader, RandomSampler, DistributedSampler, random_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn import CrossEntropyLoss
from pytorch_lightning.core.lightning import LightningModule 
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.metrics.functional import accuracy, precision, recall
from transformers import AdamW, BertForSequenceClassification, AdamW, BertConfig, AutoTokenizer, BertTokenizer, TrainingArguments
from keras.preprocessing.sequence import pad_sequences


import random
import numpy as np 

import time
import datetime
import pandas as pd
import os
from tqdm import tqdm


import pandas as pd
from transformers import AutoTokenizer, AutoModelWithLMHead
from keras.preprocessing.sequence import pad_sequences

In [16]:
if torch.cuda.is_available():    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: GeForce RTX 2070


In [17]:
pj_path = os.getenv('HOME') + '/Projects/JeongCheck'
data_path = pj_path + '/compare'

In [18]:
data_list = os.listdir(data_path)
print(len(data_list))
data_list

2


['google_reviews_labeled_spacing_8421.csv',
 'google_reviews_labeled_spellcheck_8420.csv']

In [19]:
file_list = os.listdir(data_path)
file_list

['google_reviews_labeled_spacing_8421.csv',
 'google_reviews_labeled_spellcheck_8420.csv']

In [20]:
spacing = pd.read_csv(data_path + f'/{file_list[0]}')
spell = pd.read_csv(data_path + f'/{file_list[1]}')

In [21]:
spacing.head()

Unnamed: 0,name,ratings,date,comment,search,keyword,label
0,CoffeeSquare커피광장,4,2일 전,대전시민을 위한 최대 규모의 공연 시설물,대전예술의전당,art_culture_complex,1
1,So Young Lee,5,4일 전,코로나 때문에 너무 오랜만에 가본 예술의 전당이였고 공연도 너무 좋았습니다 직원 분...,대전예술의전당,art_culture_complex,1
2,김성수,4,1주 전,시설이 깨끗하고 음향이 좋은 곳입니다,대전예술의전당,art_culture_complex,1
3,S.K LEE,5,2주 전,굿 클래식 코로나인데도 연주자들 안전하게 무대에 올려 주셔서 기획자님께 고맙다고 얘...,대전예술의전당,art_culture_complex,1
4,지성구,4,2주 전,굿,대전예술의전당,art_culture_complex,1


In [22]:
spell.head()

Unnamed: 0,name,ratings,date,comment,search,keyword,label
0,박성균,3,1년 전,맥주축제,세계엑스포기념품박물관,expo_science_park,0
1,Алексе́й,5,2년 전,역대 엑스포 개최 도시의 기념품과 희귀한 아이템을 한곳에서 볼 수 있습니다,세계엑스포기념품박물관,expo_science_park,1
2,Robert Helvie,3,4년 전,남북한에 관한 흥미로운 것들 거기에서 많은 시간을 보내지 않아도 됩니다 옆집 과학 ...,세계엑스포기념품박물관,expo_science_park,1
3,hyung-jun kjun,1,5년 전,엑스포는 없고 너무 지루했어,세계엑스포기념품박물관,expo_science_park,0
4,Onur Ozsoy,5,1년 전,주말에는 대개 혼잡합니다 야외 콘서트 및 엔터테인먼트 길거리 음식 작은 기념품은 지...,엑스포음악분수,expo_science_park,1


In [23]:
len(spacing), len(spell)

(8421, 8420)

In [24]:
print(spacing.isna().sum())
print('\n')
print(spell.isna().sum())

name       0
ratings    0
date       0
comment    0
search     0
keyword    0
label      0
dtype: int64


name       0
ratings    0
date       0
comment    0
search     0
keyword    0
label      0
dtype: int64


In [25]:
print(set(spacing.label))
print(set(spell.label))

{0, 1, 2}
{0, 1, 2}


In [26]:
print(len(spacing[spacing.label==2]))
print(len(spell[spell.label==2]))

32
34


In [27]:
test_spac = spacing.copy()
test_spel = spell.copy()

print(len(test_spac), len(test_spel))

8421 8420


중립 데이터 제외

In [28]:
test_spac = test_spac[test_spac.label != 2]
print(len(test_spac))

8389


In [29]:
test_spel = test_spel[test_spel.label != 2]
print(len(test_spel))

8386


In [30]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")

In [31]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    pj_path + "/bert_model/checkpoint-2000",
    num_labels = 2, 
                    
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

In [32]:
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30000, 768)
bert.embeddings.position_embeddings.weight                (300, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [33]:
def convert_input_data(sentences):

    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    MAX_LEN = 64

    # 토큰을 숫자 인덱스로 변환
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    
    # 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    # 어텐션 마스크 초기화
    attention_masks = []

    # 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
        
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [34]:
def test_sentences(sentences):
 
    # 평가모드로 변경!!!!!
    model.eval()

    inputs, masks = convert_input_data(sentences)

    # 데이터를 GPU에 넣음
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()

    return logits

In [35]:
device = "cuda:0"
model = model.to(device)

## 데이터 변환

In [36]:
def preprocessing(df):

    df.document=df.comment.replace('[^A-Za-zㄱ-ㅎㅏ-ㅣ가-힣]+','')

    return df

# result = preprocessing(gr_data)
# result = result.dropna()
# print(result)

In [37]:
# 감성분석할 comment 추출 
def export_com(preprocessed_df):
    sens =[]
    for sen in preprocessed_df.comment:
        sens.append(sen)
    print('check lenght :', len(sens), len(preprocessed_df)) # 개수 확인 
    print('sample sentence :', sens[1])
    return sens

In [38]:
def make_predicted_label(sen):
    sen = [sen]
    score = test_sentences(sen)
    result = np.argmax(score)

    if result == 0:   # negative 
        return 0
    elif result == 1: # positive
        return 1

In [39]:
def predict_label(model, df, place_name):
    result = preprocessing(df)
    result = result.dropna()
    
    sens = export_com(result)
    
    scores_data=[]
    for sen in sens:
        scores_data.append(make_predicted_label(sen))
        
    df['pred'] = scores_data    
    
    cor = df[df.label == df.pred]
    uncor = df[df.label != df.pred]
    
    print('correct prediction num :', len(cor))
    print('uncorrect prediction num :', len(uncor))
    print('correct label check :' ,set(cor.label))
    
#     df.to_csv(pj_path + f'/sentiment_data/{place_name}_pred_kcbert.csv')
    return df

In [40]:
print('### spacing ###')
predict_spac = predict_label(model, test_spac, 'total')
print('### spell ###')
predict_spel = predict_label(model, test_spel, 'total')

### spacing ###
check lenght : 8389 8389
sample sentence : 코로나 때문에 너무 오랜만에 가본 예술의 전당이였고 공연도 너무 좋았습니다 직원 분들도 너무 친절했구요 있는 공연들 최대한 아이들과 많이 가보려구요 가시는 분들 모두 좋은 시간 보내세요


  This is separate from the ipykernel package so we can avoid doing imports until


correct prediction num : 6708
uncorrect prediction num : 1681
correct label check : {0, 1}
### spell ###
check lenght : 8386 8386
sample sentence : 역대 엑스포 개최 도시의 기념품과 희귀한 아이템을 한곳에서 볼 수 있습니다


  This is separate from the ipykernel package so we can avoid doing imports until


correct prediction num : 6715
uncorrect prediction num : 1671
correct label check : {0, 1}


## Loss (RMSE)

In [41]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import math

In [42]:
def rmse(y, y_pred):
    from sklearn.metrics import mean_squared_error
    import math
    print('lenght check (origin, prediction):', len(y), len(y_pred))

    rmse_label = math.sqrt(mean_squared_error(y, y_pred))
    print('rmse of label :', rmse_label)

## Accuracy

In [43]:
def acc(y, y_pred, total):
    correct = (y_pred == y).sum().item()

    print(f'Accuracy of the network on the {total} test text: %d %%' % (
        100 * correct / total))

## f1-score

In [44]:
from sklearn.metrics import f1_score, classification_report

In [45]:
def f1(y, y_pred):
    score = f1_score(y, y_pred)
    report = classification_report(y, y_pred)
    
    print('f1 score :', score)
    print('===== classification report =====')
    print(report)

## calculate performance
- RMSE
- Accuracy
- f1-score

In [49]:
def cal_perform(df):
    y = df.label
    y_pred = df.pred
    if len(y) == len(y_pred):
        total = len(y)
        print('label length :', total)
    else:
        print('It has different length !')
    
    rmse(y, y_pred)
    acc(y, y_pred, total)
    f1(y, y_pred)

In [50]:
print('===== spacing =====')
cal_perform(predict_spac)
print('===== spell =====')
cal_perform(predict_spel)

===== spacing =====
label length : 8389
lenght check (origin, prediction): 8389 8389
rmse of label : 0.44763986853418153
Accuracy of the network on the 8389 test text: 79 %
f1 score : 0.872699734948883
===== classification report =====
              precision    recall  f1-score   support

           0       0.41      0.74      0.53      1274
           1       0.95      0.81      0.87      7115

    accuracy                           0.80      8389
   macro avg       0.68      0.78      0.70      8389
weighted avg       0.86      0.80      0.82      8389

===== spell =====
label length : 8386
lenght check (origin, prediction): 8386 8386
rmse of label : 0.44638623696243956
Accuracy of the network on the 8386 test text: 80 %
f1 score : 0.8733035105011752
===== classification report =====
              precision    recall  f1-score   support

           0       0.41      0.75      0.53      1276
           1       0.95      0.81      0.87      7110

    accuracy                          