# **네이버 쇼핑 리뷰 감성 분류하기(ratings_total.txt 데이터 이용)**
### 단, 모델은 GRU를 사용, 파이토치를 이용

In [1]:
!pip install torch torchvision torchaudio
!pip install transformers

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

# **아래 코드는 GRU(Gated Recurrent Unit)를 사용하여 데이터 분석**

In [2]:
import urllib.request
import pandas as pd
import numpy as np

url = "https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt"
filename = "ratings_total.txt"
urllib.request.urlretrieve(url, filename)
df = pd.read_csv(filename, sep='\t', names=['rating', 'review'])

print(df.head())


   rating                                             review
0       5                                            배공빠르고 굿
1       2                      택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고
2       5  아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다. 바느질이 조금 ...
3       2  선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다. 전...
4       5                  민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ


In [3]:
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 리뷰에서 불필요한 문자 제거
def clean_text(text):
    text = re.sub(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", text)
    return text

df['review'] = df['review'].apply(clean_text)

# 레이블 인코딩
df['rating'] = df['rating'].apply(lambda x: 1 if x > 3 else 0)

# 데이터 분할
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print(train_df.head())
print(test_df.head())


        rating                                             review
153248       1  이틀정도 설겆이하고 씽크대 청소하고 버립니다 수세미 세균걱정도 덜고 주방도 깔끔해지...
67802        1                             여러모로 사용하기 편하고 튼튼하고 좋아요
148889       0                                 얇디얇아요 꼭 두장 겹쳐야함 ㅋㅋ
103093       1  액정필름 두껍고 튼튼해요 풀커버는 터치가 잘 안돼서 불편할까봐 걱정했는데 터치도 잘...
104681       0                             넘 기대했나봐요 첨 써서 그런지 어색해요
        rating                                             review
119737       0  마감이 안좋아요실밥도 많고 바느질도 부족한 부분이 몇군데 있네요교환받기 귀찮아서 그...
72272        1                              깨끗하게 잘 다듬어져 있어요 맛도좋고요
158154       1                     재구매 배송빨라요 길냥이들이 잘먹어요 대용량이라 좋네요
65426        1                         제품도 빨리 배송해주시고 꼼꼼하게 잘챙겨주셨어요
30074        1                           기타 남 멋지고 예뻐요 여러 사은품도 좋아요


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

class NaverShoppingDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        review = self.df.iloc[index, 1]
        label = self.df.iloc[index, 0]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

max_length = 128
train_dataset = NaverShoppingDataset(train_df, tokenizer, max_length)
test_dataset = NaverShoppingDataset(test_df, tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [5]:
import torch.nn as nn

class GRUClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(GRUClassifier, self).__init__()

        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.gru = nn.GRU(hidden_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)
        packed_output, hidden = self.gru(embedded)

        if self.gru.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])

        output = self.fc(hidden)
        return output

input_dim = len(tokenizer.vocab)
hidden_dim = 128
output_dim = 2
n_layers = 2
bidirectional = True
dropout = 0.3

model = GRUClassifier(input_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout)


In [6]:
from transformers import AdamW
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = CrossEntropyLoss().to(device)

def train_epoch(model, data_loader, criterion, optimizer, device):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = criterion(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

EPOCHS = 3

for epoch in range(EPOCHS):
    train_acc, train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train loss {train_loss} accuracy {train_acc}')




Epoch 1/3
Train loss 0.5308127262726426 accuracy 0.7243
Epoch 2/3
Train loss 0.40157033941820264 accuracy 0.82845
Epoch 3/3
Train loss 0.36354908915385603 accuracy 0.8490562500000001


In [7]:
def eval_model(model, data_loader, criterion, device):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            labels = d['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = criterion(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

test_acc, test_loss = eval_model(model, test_loader, criterion, device)
print(f'Test loss {test_loss} accuracy {test_acc}')


Test loss 0.34521858283281326 accuracy 0.8598250000000001
