# Predict google map review dataset 
## model
- kcbert
- fine-tuned with naver shopping review dataset (200,000개)
- train 5 epochs
- 0.97 accuracy

## dataset
- google map review of tourist places in Daejeon, Korea 

In [48]:
import torch
from torch import nn, Tensor
from torch.optim import Optimizer
from torch.utils.data import DataLoader, RandomSampler, DistributedSampler, random_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn import CrossEntropyLoss
from pytorch_lightning.core.lightning import LightningModule 
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.metrics.functional import accuracy, precision, recall
from transformers import AdamW, BertForSequenceClassification, AdamW, BertConfig, AutoTokenizer, BertTokenizer, TrainingArguments
from keras.preprocessing.sequence import pad_sequences


import random
import numpy as np 

import time
import datetime
import pandas as pd
import os
from tqdm import tqdm


import pandas as pd
from transformers import AutoTokenizer, AutoModelWithLMHead
from keras.preprocessing.sequence import pad_sequences

In [49]:
if torch.cuda.is_available():    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: GeForce RTX 2070


In [50]:
pj_path = os.getenv('HOME') + '/Projects/JeongCheck'

In [51]:
test_set = pd.read_csv(pj_path + '/crawling_data/google_reviews.csv', index_col=0)
print(len(test_set))
test_set.head()

8408


Unnamed: 0_level_0,ratings,date,label,comment,search,keyword
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
미샤,4.0,2주 전,1.0,여기는 갈 때마다 새로운 느낌이 야 즐거운 시간이 되었습니다,장태산자연휴양림메타세콰이어산림욕장,jangtaesan
Gyang gree young,5.0,3주 전,1.0,하늘 보고 누워 힐링하기 너무 좋은 곳이에요,장태산자연휴양림메타세콰이어산림욕장,jangtaesan
박진수,5.0,3달 전,1.0,잘 정리된 메타세쿼이아 숲에서 산림욕하기 좋은 곳으로 둘레길 걷기와 전망대까지 가벼...,장태산자연휴양림메타세콰이어산림욕장,jangtaesan
Sh Choi,5.0,7달 전,1.0,산책하기 너무 좋은 명소,장태산자연휴양림메타세콰이어산림욕장,jangtaesan
박은선,5.0,1년 전,1.0,주말 마자 애정 하는 장태산 나들이 단풍도 예뻐요,장태산자연휴양림메타세콰이어산림욕장,jangtaesan


In [52]:
def check_make_df(place, data):
    print(f'##### {place} #####')
    print(data.head)
    print(f'{place} len :', len(data))
    
    if any(data.isna().sum()) == True:
        test_set.dropna(inplace=True)
    
    print('nan values check :', any(data.isna().sum()))
    
    neutral_portion = len(data[data.label==2]) / len(data)
    print('neutral label portion :', neutral_portion)
    
    new_data = data[data.label != 2]
    data = new_data
    print('final length of data :', len(new_data))
    return data

In [53]:
test_set = check_make_df('total', test_set)

##### total #####
<bound method NDFrame.head of                   ratings  date  label  \
name                                     
미샤                    4.0  2주 전    1.0   
Gyang gree young      5.0  3주 전    1.0   
박진수                   5.0  3달 전    1.0   
Sh Choi               5.0  7달 전    1.0   
박은선                   5.0  1년 전    1.0   
...                   ...   ...    ...   
NaN                   NaN   NaN    NaN   
NaN                   NaN   NaN    NaN   
NaN                   NaN   NaN    NaN   
NaN                   NaN   NaN    NaN   
NaN                   NaN   NaN    NaN   

                                                            comment  \
name                                                                  
미샤                                여기는 갈 때마다 새로운 느낌이 야 즐거운 시간이 되었습니다   
Gyang gree young                           하늘 보고 누워 힐링하기 너무 좋은 곳이에요   
박진수               잘 정리된 메타세쿼이아 숲에서 산림욕하기 좋은 곳으로 둘레길 걷기와 전망대까지 가벼...   
Sh Choi                                           

In [54]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")

In [55]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model_kc = BertForSequenceClassification.from_pretrained(
    "beomi/kcbert-base",
    num_labels = 2, 
                    
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

model_nsmc = BertForSequenceClassification.from_pretrained(
    pj_path + "/checkpoint-1500",
    num_labels = 2, 
                    
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

model_sent = BertForSequenceClassification.from_pretrained(
    pj_path + "/bert_model/checkpoint-2000",
    num_labels = 2, 
                    
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

In [56]:
def param_check(model):
    params = list(model.named_parameters())

    print('The BERT model has {:} different named parameters.\n'.format(len(params)))

    print('==== Embedding Layer ====\n')

    for p in params[0:5]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== First Transformer ====\n')

    for p in params[5:21]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== Output Layer ====\n')

    for p in params[-4:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

In [57]:
param_check(model_kc)
param_check(model_nsmc)
param_check(model_sent)

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30000, 768)
bert.embeddings.position_embeddings.weight                (300, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [58]:
device = "cuda:0"
model_kc = model_kc.to(device)
model_nsmc = model_nsmc.to(device)
model_sent = model_sent.to(device)

In [59]:
def convert_input_data(sentences):

    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    MAX_LEN = 64

    # 토큰을 숫자 인덱스로 변환
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    
    # 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    # 어텐션 마스크 초기화
    attention_masks = []

    # 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
        
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [60]:
def test_sentences(sentences, model):
 
    # 평가모드로 변경!!!!!
    model.eval()

    inputs, masks = convert_input_data(sentences)

    # 데이터를 GPU에 넣음
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()

    return logits

## 데이터 변환

In [61]:
def preprocessing(df):

    df.document=df.comment.replace('[^A-Za-zㄱ-ㅎㅏ-ㅣ가-힣]+','')

    return df

# result = preprocessing(gr_data)
# result = result.dropna()
# print(result)

In [90]:
# 감성분석할 comment 추출 
def export_com(preprocessed_df):
    sens =[]
    for sen in preprocessed_df.comment:
        sens.append(sen)
    print('check length :', len(sens), len(preprocessed_df)) # 개수 확인 
    print('sample sentence :', sens[1])
    return sens

label : neg(0) p(1) neut(2)
pred : idx 0, 1, 2 -> 

In [92]:
def make_predicted_label(sen, model):
    sen = [sen]
    score = test_sentences(sen, model)
    result = np.argmax(score)

    if result == 0:   # negative 
        return 0
    elif result == 1: # positive
        return 1

In [93]:
def predict_label(model, df, place_name):
    result = preprocessing(df)
    result = result.dropna()
    
    sens = export_com(result)
    
    scores_data=[]
    for sen in sens:
        scores_data.append(make_predicted_label(sen, model))
        
    df['pred'] = scores_data    
    
    cor = df[df.label == df.pred]
    uncor = df[df.label != df.pred]
    
    print('correct prediction num :', len(cor))
    print('uncorrect prediction num :', len(uncor))
    print('correct label check :' ,set(cor.label))
    
    df.to_csv(pj_path + f'/prediction_data/{place_name}_pred.csv')
    return df

## Loss (RMSE)

In [94]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import math

In [95]:
def rmse(y, y_pred):
    from sklearn.metrics import mean_squared_error
    import math
    
    rmse_label = math.sqrt(mean_squared_error(y, y_pred))
    print('rmse of label :', rmse_label)

## Accuracy

In [96]:
def acc(y, y_pred, total):
    correct = (y_pred == y).sum().item()

    print(f'Accuracy of the network on the {total} test text: %d %%' % (
        100 * correct / total))

## F-1 score

In [97]:
from sklearn.metrics import f1_score, classification_report

In [98]:
def f1(y, y_pred):
    score = f1_score(y, y_pred)
    report = classification_report(y, y_pred)
    
    print('## f1-score ##')
    print('f1 score :', score)
    print('## classification report ##')
    print(report)

## calculate performance
- RMSE
- Accuracy
- f1-score

In [99]:
def cal_perform(df):
    y = df.label
    y_pred = df.pred
    if len(y) == len(y_pred):
        total = len(y)
        print('same length')
        print('label length :', total)
    else:
        print('different length')
    
    rmse(y, y_pred)
    acc(y, y_pred, total)
    f1(y, y_pred)

## 모델 별 성능 지표 계산

In [100]:
model_list = [model_kc, model_nsmc, model_sent]
model_list_idx = [[idx, model] for idx, model in enumerate(model_list)]
model_name = ['model_kc', 'model_nsmc', 'model_sent']

In [101]:
for model, name in zip(model_list_idx, model_name):
    print(f'===== {name} predict .... =====')
    data = predict_label(model[-1], test_set, model[0])
    cal_perform(data)

===== model_kc predict .... =====
check length : 6188 6188
sample sentence : 하늘 보고 누워 힐링하기 너무 좋은 곳이에요
correct prediction num : 805
uncorrect prediction num : 5383
correct label check : {0.0, 1.0}
same length
label length : 6188
rmse of label : 0.9326893921678553
Accuracy of the network on the 6188 test text: 13 %
## f1-score ##
f1 score : 0.009567617295308188
## classification report ##
              precision    recall  f1-score   support

         0.0       0.13      0.99      0.22       785
         1.0       0.81      0.00      0.01      5403

    accuracy                           0.13      6188
   macro avg       0.47      0.50      0.12      6188
weighted avg       0.73      0.13      0.04      6188

===== model_nsmc predict .... =====
check length : 6188 6188
sample sentence : 하늘 보고 누워 힐링하기 너무 좋은 곳이에요
correct prediction num : 5001
uncorrect prediction num : 1187
correct label check : {0.0, 1.0}
same length
label length : 6188
rmse of label : 0.43797589317147767
Accuracy of the 