# Predict google map review dataset 
## model
- kcbert
- fine-tuned with Naver Sentiment Movie Corpus (NSMC)
- train 4 epochs
- 0.87 accuracy

## dataset
- google map review of tourist places in Daejeon, Korea 

In [1]:
import torch
from torch import nn, Tensor
from torch.optim import Optimizer
from torch.utils.data import DataLoader, RandomSampler, DistributedSampler, random_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn import CrossEntropyLoss
from pytorch_lightning.core.lightning import LightningModule 
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.metrics.functional import accuracy, precision, recall
from transformers import AdamW, BertForSequenceClassification, AdamW, BertConfig, AutoTokenizer, BertTokenizer, TrainingArguments
from keras.preprocessing.sequence import pad_sequences


import random
import numpy as np 

import time
import datetime
import pandas as pd
import os
from tqdm import tqdm


import pandas as pd
from transformers import AutoTokenizer, AutoModelWithLMHead
from keras.preprocessing.sequence import pad_sequences

In [2]:
if torch.cuda.is_available():    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: GeForce RTX 2070


In [3]:
pj_path = os.getenv('HOME') + '/Projects/JeongCheck'
data_path = pj_path + '/final_dataset_8420'

In [4]:
file_list = os.listdir(data_path)
print(len(file_list))
file_list

15


['google_reviews_daecheong_lake.csv',
 'google_reviews_gyejok_mountain.csv',
 'google_reviews_observatory.csv',
 'google_reviews_art_culture_complex.csv',
 'google_reviews_sungsimdang_bakery.csv',
 'google_reviews_water_barrel.csv',
 'google_reviews_yuseong_hotspring.csv',
 'google_reviews_ppuri_park.csv',
 'google_reviews_jangtae_mountain.csv',
 'google_reviews_dongchundang.csv',
 'google_reviews_hanbat_arboretum.csv',
 'google_reviews_oworld_zoo.csv',
 'google_reviews_science_museum.csv',
 'google_reviews_expo_science_park.csv',
 'google_reviews_uineungjeongi_street.csv']

## make tour places list 

In [5]:
places = []
for file in file_list:
    tmp = file.split('_')[2]
    if '.csv' in tmp:
        tmp = tmp.split('.')[0]
    places.append(tmp)

In [6]:
places

['daecheong',
 'gyejok',
 'observatory',
 'art',
 'sungsimdang',
 'water',
 'yuseong',
 'ppuri',
 'jangtae',
 'dongchundang',
 'hanbat',
 'oworld',
 'science',
 'expo',
 'uineungjeongi']

In [7]:
data_path

'/home/aiffel-dj26/Projects/JeongCheck/final_dataset_8420'

In [8]:
for place, file in zip(places, file_list):
    globals()['{}_data'.format(place)] = pd.read_csv(data_path + f'/{file}')
#     data_list.append(globals()['{}_data'.format(place)] = pd.read_csv(data_path + f'/{file}'))

In [9]:
# data_list = []
# for place in places:
#     data_list.append(f'{place}_data')

In [10]:
data_list = [daecheong_data,
  gyejok_data,
  observatory_data,
  art_data,
  sungsimdang_data,
  water_data,
  yuseong_data,
  ppuri_data,
  jangtae_data,
  dongchundang_data,
  hanbat_data,
  oworld_data,
  science_data,
  expo_data,
  uineungjeongi_data]

In [11]:
print(data_list)

[               name  ratings   date  \
0               유레카        5   5달 전   
1               이정주        5   6달 전   
2               송병석        4  10달 전   
3               배진환        5   1년 전   
4               오재열        5   1년 전   
..              ...      ...    ...   
486        Hs jeong        5   2년 전   
487             강성희        5   3년 전   
488             김용훈        5   3년 전   
489  Hyeon-Gyu Choe        4   3년 전   
490             김유리        4   3년 전   

                                               comment      search  \
0    참으로 아름다운 호수다 왜 이제서야 와 봤을까 하는 생각이 든다 개발되면 교통이 불...         대청호   
1    청남대를 품고 있는 매우 아름다운 호수 대통령 별장이 자리 잡을 정도로 경치와 장소...         대청호   
2                                             사진 찍기 명소         대청호   
3                                        날씨가 너무 좋고 예뻤다         대청호   
4                     사계절 항상 아름답고 접근성도 좋아 방문자가 많은 곳입니다         대청호   
..                                                 ...         ...   
486                             경치도 좋고 바

In [12]:
data_list[0]

Unnamed: 0,name,ratings,date,comment,search,keyword,label
0,유레카,5,5달 전,참으로 아름다운 호수다 왜 이제서야 와 봤을까 하는 생각이 든다 개발되면 교통이 불...,대청호,daecheong_lake,1
1,이정주,5,6달 전,청남대를 품고 있는 매우 아름다운 호수 대통령 별장이 자리 잡을 정도로 경치와 장소...,대청호,daecheong_lake,1
2,송병석,4,10달 전,사진 찍기 명소,대청호,daecheong_lake,1
3,배진환,5,1년 전,날씨가 너무 좋고 예뻤다,대청호,daecheong_lake,1
4,오재열,5,1년 전,사계절 항상 아름답고 접근성도 좋아 방문자가 많은 곳입니다,대청호,daecheong_lake,1
...,...,...,...,...,...,...,...
486,Hs jeong,5,2년 전,경치도 좋고 바람도 시원하고 아주 좋아요,대청호반자연생태공원,daecheong_lake,1
487,강성희,5,3년 전,데이트 장소로 딱 좋아요 밤엔 이쁜 조명까지 가족들과 연인들과 나들이 가기 좋은 장소,대청호반자연생태공원,daecheong_lake,1
488,김용훈,5,3년 전,조망권 굿,대청호반자연생태공원,daecheong_lake,1
489,Hyeon-Gyu Choe,4,3년 전,가볍게 산책하기 아주 좋아요,대청호반자연생태공원,daecheong_lake,1


In [13]:
def check_make_df(place, data):
    print(f'##### {place} #####')
    print(data.head)
    print(f'{place} len :', len(data))
    
    if any(data.isna().sum()) == True:
        test_set.dropna(inplace=True)
    
    print('nan values check :', any(data.isna().sum()))
    
    neutral_portion = len(data[data.label==2]) / len(data)
    print('neutral label portion :', neutral_portion)
    
    new_data = data[data.label != 2]
    data = new_data
    print('final length of data :', len(new_data))
    return data

In [14]:
new_data_list = []
for place, data in zip(places, data_list):
    data = check_make_df(place, data)
    new_data_list.append(data)

##### daecheong #####
<bound method NDFrame.head of                name  ratings   date  \
0               유레카        5   5달 전   
1               이정주        5   6달 전   
2               송병석        4  10달 전   
3               배진환        5   1년 전   
4               오재열        5   1년 전   
..              ...      ...    ...   
486        Hs jeong        5   2년 전   
487             강성희        5   3년 전   
488             김용훈        5   3년 전   
489  Hyeon-Gyu Choe        4   3년 전   
490             김유리        4   3년 전   

                                               comment      search  \
0    참으로 아름다운 호수다 왜 이제서야 와 봤을까 하는 생각이 든다 개발되면 교통이 불...         대청호   
1    청남대를 품고 있는 매우 아름다운 호수 대통령 별장이 자리 잡을 정도로 경치와 장소...         대청호   
2                                             사진 찍기 명소         대청호   
3                                        날씨가 너무 좋고 예뻤다         대청호   
4                     사계절 항상 아름답고 접근성도 좋아 방문자가 많은 곳입니다         대청호   
..                                                 ...     

[1863 rows x 7 columns]>
sungsimdang len : 1863
nan values check : False
neutral label portion : 0.0021470746108427268
final length of data : 1859
##### water #####
<bound method NDFrame.head of                 name  ratings   date  \
0                김성주        5  9시간 전   
1                서알콩        5   1일 전   
2                윤설희        5   2일 전   
3                이상훈        4   2일 전   
4              송순이공주        5   2일 전   
..               ...      ...    ...   
680              김태규        4   4년 전   
681              김뚱이        5   4년 전   
682              이시형        5   4년 전   
683     Park Sangbae        4   4년 전   
684  hyun seung shin        3   4년 전   

                                               comment        search  \
0                                가편하게 산책과 등산을 할 수 있는 곳  계룡산국립공원수통골지구   
1                                    커피 한잔하기 딱 좋은 거리에요  계룡산국립공원수통골지구   
2                                           비 오고 난 수통골  계룡산국립공원수통골지구   
3                   가족 나들이하기 정말 좋은 곳

In [15]:
for place, origin, new in zip(places, data_list, new_data_list):
    print(f'##### {place} #####')
    print(f'{place} before :', len(origin))    
    print(f'{place} after :', len(new))

##### daecheong #####
daecheong before : 491
daecheong after : 489
##### gyejok #####
gyejok before : 483
gyejok after : 471
##### observatory #####
observatory before : 26
observatory after : 26
##### art #####
art before : 823
art after : 823
##### sungsimdang #####
sungsimdang before : 1863
sungsimdang after : 1859
##### water #####
water before : 685
water after : 685
##### yuseong #####
yuseong before : 431
yuseong after : 429
##### ppuri #####
ppuri before : 426
ppuri after : 426
##### jangtae #####
jangtae before : 518
jangtae after : 510
##### dongchundang #####
dongchundang before : 329
dongchundang after : 327
##### hanbat #####
hanbat before : 402
hanbat after : 402
##### oworld #####
oworld before : 446
oworld after : 444
##### science #####
science before : 381
science after : 381
##### expo #####
expo before : 783
expo after : 780
##### uineungjeongi #####
uineungjeongi before : 333
uineungjeongi after : 333


In [16]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")

In [17]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    pj_path + "/checkpoint-1500",
    num_labels = 2, 
                    
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

In [18]:
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30000, 768)
bert.embeddings.position_embeddings.weight                (300, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [19]:
device = "cuda:0"
model = model.to(device)

In [20]:
def convert_input_data(sentences):

    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    MAX_LEN = 64

    # 토큰을 숫자 인덱스로 변환
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    
    # 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    # 어텐션 마스크 초기화
    attention_masks = []

    # 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
        
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [21]:
def test_sentences(sentences):
 
    # 평가모드로 변경!!!!!
    model.eval()

    inputs, masks = convert_input_data(sentences)

    # 데이터를 GPU에 넣음
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()

    return logits

## 데이터 변환

In [22]:
def preprocessing(df):

    df.document=df.comment.replace('[^A-Za-zㄱ-ㅎㅏ-ㅣ가-힣]+','')

    return df

# result = preprocessing(gr_data)
# result = result.dropna()
# print(result)

In [23]:
# 감성분석할 comment 추출 
def export_com(preprocessed_df):
    sens =[]
    for sen in preprocessed_df.comment:
        sens.append(sen)
    print('check lenght :', len(sens), len(preprocessed_df)) # 개수 확인 
    print('sample sentence :', sens[1])
    return sens

label : neg(0) p(1) neut(2)
pred : idx 0, 1, 2 -> 

In [24]:
# sens = export_com(result)

In [25]:
def make_predicted_label(sen):
    sen = [sen]
    score = test_sentences(sen)
    result = np.argmax(score)

    if result == 0:   # negative 
        return 0
    elif result == 1: # positive
        return 1

# scores_data=[]
# for sen in sens:
#     scores_data.append(make_predicted_label(sen))

In [26]:
# gr_data['pred'] = scores_data

In [27]:
# gr_data.head()

In [28]:
# gr_data[gr_data.label != gr_data.pred]

In [29]:
# set(gr_data[gr_data.label != gr_data.pred].pred)

In [30]:
# set(gr_data[gr_data.label != gr_data.pred].label)

In [31]:
# gr_data[gr_data.label == gr_data.pred]

In [32]:
# cor = gr_data[gr_data.label == gr_data.pred]
# cor[cor.label==1]

In [33]:
# set(cor.label)

In [34]:
# gr_data.to_csv(pj_path + '/sentiment_data/google_pred_fine.csv')

In [35]:
pj_path

'/home/aiffel-dj26/Projects/JeongCheck'

In [36]:
def predict_label(model, df, place_name):
    result = preprocessing(df)
    result = result.dropna()
    
    sens = export_com(result)
    
    scores_data=[]
    for sen in sens:
        scores_data.append(make_predicted_label(sen))
        
    df['pred'] = scores_data    
    
    cor = df[df.label == df.pred]
    uncor = df[df.label != df.pred]
    
    print('correct prediction num :', len(cor))
    print('uncorrect prediction num :', len(uncor))
    print('correct label check :' ,set(cor.label))
    
    df.to_csv(pj_path + f'/prediction_data_nsmc/{place_name}_pred.csv')
    return df

In [37]:
def rmse(y, y_pred):
    from sklearn.metrics import mean_squared_error
    import math
    
    rmse_label = math.sqrt(mean_squared_error(y, y_pred))
    print('rmse of label :', rmse_label)

In [38]:
def acc(y, y_pred, total):
    correct = (y_pred == y).sum().item()

    print(f'Accuracy of the network on the {total} test text: %d %%' % (
        100 * correct / total))

In [39]:
def cal_perform(df):
    y = df.label
    y_pred = df.pred
    print('##### label lenght check #####')
    if len(y) == len(y_pred):
        total = len(y)
        print('same length')
        print(total)
    else:
        print('different length')
    
    rmse(y, y_pred)
    acc(y, y_pred, total)

In [40]:
for place, data in zip(places, new_data_list):
    print(f'##### {place} predict .... #####')
    data = predict_label(model, data, place)
    cal_perform(data)

##### daecheong predict .... #####
check lenght : 489 489
sample sentence : 청남대를 품고 있는 매우 아름다운 호수 대통령 별장이 자리 잡을 정도로 경치와 장소 등 모든 면에서 최상이라고 할 수 있는 곳이다 특히 다목적댐의 특성상 지역의 농 공업 등 각 산업분야의 용수 공급에 있어서도 중요한 역할을 담당하고 있는 호수다


  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
  This is separate from the ipykernel package so we can avoid doing imports until


correct prediction num : 375
uncorrect prediction num : 114
correct label check : {0, 1}
##### label lenght check #####
same length
489
rmse of label : 0.4828341685877546
Accuracy of the network on the 489 test text: 76 %
##### gyejok predict .... #####
check lenght : 471 471
sample sentence : 황톳길 맨발로 걷는 산 정말 좋은 곳입니다


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
  This is separate from the ipykernel package so we can avoid doing imports until


correct prediction num : 393
uncorrect prediction num : 78
correct label check : {0, 1}
##### label lenght check #####
same length
471
rmse of label : 0.40694605974428755
Accuracy of the network on the 471 test text: 83 %
##### observatory predict .... #####
check lenght : 26 26
sample sentence : 달 표면이 끝내줘요
correct prediction num : 19
uncorrect prediction num : 7
correct label check : {1}
##### label lenght check #####
same length
26
rmse of label : 0.5188745216627708
Accuracy of the network on the 26 test text: 73 %
##### art predict .... #####
check lenght : 823 823
sample sentence : 코로나 때문에 너무 오랜만에 가본 예술의 전당이었고 공연도 너무 좋았습니다 직원분들도 너무 친절했고요 있는 공연들 최대한 아이들과 많이 가보려고요 가시는 분들 모두 좋은 시간 보내세요


  This is separate from the ipykernel package so we can avoid doing imports until


correct prediction num : 654
uncorrect prediction num : 169
correct label check : {0, 1}
##### label lenght check #####
same length
823
rmse of label : 0.45315151334423737
Accuracy of the network on the 823 test text: 79 %
##### sungsimdang predict .... #####
check lenght : 1859 1859
sample sentence : 자개 테이블에게 먹는 팥빙수의 맛


  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
  This is separate from the ipykernel package so we can avoid doing imports until


correct prediction num : 1439
uncorrect prediction num : 420
correct label check : {0, 1}
##### label lenght check #####
same length
1859
rmse of label : 0.4753187543487113
Accuracy of the network on the 1859 test text: 77 %
##### water predict .... #####
check lenght : 685 685
sample sentence : 커피 한잔하기 딱 좋은 거리에요
correct prediction num : 549
uncorrect prediction num : 136
correct label check : {0, 1}
##### label lenght check #####
same length
685
rmse of label : 0.4455784397672327
Accuracy of the network on the 685 test text: 80 %
##### yuseong predict .... #####
check lenght : 429 429
sample sentence : 꽃이 약간 시든 게 아쉬워서요


  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
  This is separate from the ipykernel package so we can avoid doing imports until


correct prediction num : 346
uncorrect prediction num : 83
correct label check : {0, 1}
##### label lenght check #####
same length
429
rmse of label : 0.43985587807052606
Accuracy of the network on the 429 test text: 80 %
##### ppuri predict .... #####
check lenght : 426 426
sample sentence : 공기도 좋고 우리 뿌리를 보고 좋은 곳입니다
correct prediction num : 365
uncorrect prediction num : 61
correct label check : {0, 1}
##### label lenght check #####
same length
426
rmse of label : 0.3784078332472926
Accuracy of the network on the 426 test text: 85 %
##### jangtae predict .... #####
check lenght : 510 510
sample sentence : 하늘보고 누워 힐링하기 너무 좋은 곳이에요


  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
  This is separate from the ipykernel package so we can avoid doing imports until


correct prediction num : 445
uncorrect prediction num : 65
correct label check : {0, 1}
##### label lenght check #####
same length
510
rmse of label : 0.3570027736477083
Accuracy of the network on the 510 test text: 87 %
##### dongchundang predict .... #####
check lenght : 327 327
sample sentence : 깨끗하고 산책하기 참 좋네요 날 좋은 오늘 같은 날은 더더욱


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
  This is separate from the ipykernel package so we can avoid doing imports until


correct prediction num : 274
uncorrect prediction num : 53
correct label check : {0, 1}
##### label lenght check #####
same length
327
rmse of label : 0.40259099679869137
Accuracy of the network on the 327 test text: 83 %
##### hanbat predict .... #####
check lenght : 402 402
sample sentence : 아이랑 자전거 타고 놀기도 참 좋은 거 같아요
correct prediction num : 329
uncorrect prediction num : 73
correct label check : {0, 1}
##### label lenght check #####
same length
402
rmse of label : 0.426136175184641
Accuracy of the network on the 402 test text: 81 %
##### oworld predict .... #####
check lenght : 444 444
sample sentence : 기대를 하지 않고 가면 충분히 즐길만합니다


  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
  This is separate from the ipykernel package so we can avoid doing imports until


correct prediction num : 359
uncorrect prediction num : 85
correct label check : {0, 1}
##### label lenght check #####
same length
444
rmse of label : 0.4375402169417589
Accuracy of the network on the 444 test text: 80 %
##### science predict .... #####
check lenght : 381 381
sample sentence : 정부에서 만들었는데 세금 낭비인 거 같아요
correct prediction num : 304
uncorrect prediction num : 77
correct label check : {0, 1}
##### label lenght check #####
same length
381
rmse of label : 0.44955504394101553
Accuracy of the network on the 381 test text: 79 %
##### expo predict .... #####
check lenght : 780 780
sample sentence : 역대 엑스포 개최 도시의 기념품과 희귀한 아이템을 한곳에서 볼 수 있습니다


  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
  This is separate from the ipykernel package so we can avoid doing imports until


correct prediction num : 614
uncorrect prediction num : 166
correct label check : {0, 1}
##### label lenght check #####
same length
780
rmse of label : 0.461324736840019
Accuracy of the network on the 780 test text: 78 %
##### uineungjeongi predict .... #####
check lenght : 333 333
sample sentence : 대전 관광코스 스카이 로드가 유명하다 길이 214 높이 20 인 긴 천장이다 단순한 천장이 아니라 스크린 시설이라 대형 화면이라 생각하면 된다 으능정이 문화거리가 있는 중앙로는 오랫동안 대전 핵심 상권이었다 하지만 1990년대 공공기관과 금융기관이 둔산 신도시로 대거 이동하면서 활기를 많이 잃는다 이후 분위기를 바꾸기 위해 만든 게 바로 스카이로 드 2013년에 설치했다 밤에 보면 훨씬 화려한 풍경을 볼 수
correct prediction num : 252
uncorrect prediction num : 81
correct label check : {0, 1}
##### label lenght check #####
same length
333
rmse of label : 0.4931969619160719
Accuracy of the network on the 333 test text: 75 %


## Loss (RMSE)

In [93]:
y = gr_data.label
y_pred = gr_data.pred
print(len(y), len(y_pred))

8389 8389


In [94]:
type(y), type(y_pred)

(pandas.core.series.Series, pandas.core.series.Series)

In [95]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import math

scaler = MinMaxScaler()

In [96]:
rmse_label = math.sqrt(mean_squared_error(y, y_pred))

In [97]:
print('rmse of label :', rmse_label)

rmse of label : 0.44763986853418153


In [98]:
def rmse(df):
    y = test_set.label
    y_pred = test_set.pred
    print(len(y), len(y_pred))

    rmse_label = math.sqrt(mean_squared_error(y, y_pred))
    print('rmse of label :', rmse_label)

## Accuracy

In [99]:
if len(y) == len(y_pred):
    total = len(y)
    print(total)
else:
    print('different length')

8389


In [100]:
correct = (y_pred == y).sum().item()

print(f'Accuracy of the network on the {total} test text: %d %%' % (
    100 * correct / total))

Accuracy of the network on the 8389 test text: 79 %


In [58]:
def acc(df):
    correct = (y_pred == y).sum().item()

    print(f'Accuracy of the network on the {total} test text: %d %%' % (
        100 * correct / total))

# 여행지별 함수

In [None]:
df_list = # 여행지별 파일 리스트

In [88]:
def predict_perform(df):
    new_df = predict_label(model, df, place_name)
    cal_perform(df)

In [94]:
predict_perform(gr_data, model, )

NameError: name 'place_name' is not defined

In [None]:
for df in df_list:
    new_df = predict_label(model, df, place_name)
    cal_perform(df)

## F-1 score

In [63]:
from sklearn.metrics import f1_score

f1_score(y, y_pred)

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [68]:
def f1_loss(y_true:torch.Tensor, y_pred:torch.Tensor, is_training=False) -> torch.Tensor:
    '''Calculate F1 score. Can work with gpu tensors
    
    The original implmentation is written by Michal Haltuf on Kaggle.
    
    Returns
    -------
    torch.Tensor
        `ndim` == 1. 0 <= val <= 1
    
    Reference
    ---------
    - https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
    - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
    - https://discuss.pytorch.org/t/calculating-precision-recall-and-f1-score-in-case-of-multi-label-classification/28265/6
    
    '''
    assert y_true.ndim == 1
    assert y_pred.ndim == 1 or y_pred.ndim == 2
    
    if y_pred.ndim == 2:
        y_pred = y_pred.argmax(dim=1)
        
    
    tp = (y_true * y_pred).sum().to(torch.float32)
    tn = ((1 - y_true) * (1 - y_pred)).sum().to(torch.float32)
    fp = ((1 - y_true) * y_pred).sum().to(torch.float32)
    fn = (y_true * (1 - y_pred)).sum().to(torch.float32)
    
    epsilon = 1e-7
    
    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    
    f1 = 2* (precision*recall) / (precision + recall + epsilon)
    f1.requires_grad = is_training
    return f1

In [69]:
f1_score = f1_loss(y.torch, y_pred)

AttributeError: 'numpy.int64' object has no attribute 'to'