In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
pip install konlpy

Note: you may need to restart the kernel to use updated packages.


In [3]:
from konlpy.tag import Okt

In [4]:
okt = Okt()

In [5]:
okt.morphs("아버지가방에들어가신다")

['아버지', '가방', '에', '들어가신다']

In [6]:
import urllib.request

# Download the train and test rating files from the e9t/nsmc GitHub repository
# and store them under the current working directory.
# NOTE(review): later cells read these files via /kaggle/working/..., so this
# assumes the working directory is /kaggle/working — confirm when running elsewhere.
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x78f1d00f5220>)

In [7]:
import pandas as pd
df = pd.read_csv("/kaggle/working/ratings_train.txt", encoding='utf-8', sep='\t')

In [8]:
df

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
149995,6222902,인간이 문제지.. 소는 뭔죄인가..,0
149996,8549745,평점이 너무 낮아서...,1
149997,9311800,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0
149998,2376369,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1


In [9]:
df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [10]:
df.label.value_counts()

label
0    75173
1    74827
Name: count, dtype: int64

In [11]:
df.drop_duplicates(subset=['document']).shape

(146183, 3)

In [12]:
df.drop_duplicates(subset=['document'], inplace = True)

In [13]:
# Tokens that are discarded from every review after tokenization.
STOPWORDS = {
    '의', '가', '이', '은', '들', '는', '좀', '잘', '걍',
    '과', '도', '를', '으로', '자', '에', '와', '한', '하다',
}

In [14]:
import re

In [15]:
re.findall("[^가-힣\\s]","가나다 123 ㅋㅋㅋ")

['1', '2', '3', 'ㅋ', 'ㅋ', 'ㅋ']

In [16]:
re.sub("[^가-힣\\s]",'',"가나다 123 ㅋㅋㅋ")

'가나다  '

In [17]:
def preprocessing_text(text):
    """Convert one raw review into a list of stemmed tokens.

    Every character that is neither a Hangul syllable (가-힣) nor whitespace is
    removed, the remaining text is tokenized with ``okt.morphs(..., stem=True)``,
    and tokens contained in ``STOPWORDS`` are dropped.

    Args:
        text: The raw review. Anything that is not a ``str`` is treated as
            having no content.

    Returns:
        list: The remaining tokens; an empty list for non-string input.
    """
    if isinstance(text, str):
        hangul_only = re.sub("[^가-힣\\s]", '', text)
        # stem=True makes Okt return the stemmed form of each token.
        return [tok for tok in okt.morphs(hangul_only, stem=True) if tok not in STOPWORDS]
    return []


In [18]:
df['preprocessing'] = df.document.apply(preprocessing_text )

In [19]:
df.head(3)

Unnamed: 0,id,document,label,preprocessing
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0,"[아, 더빙, 진짜, 짜증나다, 목소리]"
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1,"[흠, 포스터, 보고, 초딩, 영화, 줄, 오버, 연기, 조차, 가볍다, 않다]"
2,10265843,너무재밓었다그래서보는것을추천한다,0,"[너, 무재, 밓었, 다그, 래서, 보다, 추천, 다]"


In [22]:
from collections import Counter
counter = Counter()
for token in df.preprocessing:
    counter.update(token)

In [23]:
max_vocab_size = len(counter)

In [24]:
# Build the token -> integer id mapping. Ids 0 and 1 are reserved for the
# padding token and the unknown-token placeholder.
vocab = {'<PAD>': 0, '<UNK>': 1}
# The two reserved entries count toward max_vocab_size, so only the
# (max_vocab_size - 2) most frequent tokens are added; each new token receives
# the next free id (the current size of the mapping).
# NOTE(review): when max_vocab_size equals len(counter), this leaves out the two
# least frequent tokens — confirm that is intended.
for word, _ in counter.most_common(max_vocab_size - 2):
        vocab[word] = len(vocab)


In [27]:
max_vocab_size

43070

In [26]:
len(vocab)

43070

In [28]:
def build_vocab(tokenized_data, max_vocab_size):
    """Build a token -> integer id vocabulary from tokenized documents.

    Id 0 is reserved for ``<PAD>`` (sequence-length padding) and id 1 for
    ``<UNK>`` (tokens missing from the vocabulary). The remaining ids are
    assigned to the most frequent tokens in descending frequency order.

    Args:
        tokenized_data: Iterable of token lists, one per document.
        max_vocab_size: Upper bound on the vocabulary size, including the two
            reserved entries. Values below 2 yield only the reserved entries.

    Returns:
        dict: Mapping from token to id, with at most ``max_vocab_size`` entries
        (never fewer than the two reserved ones).
    """
    counter = Counter()
    for tokens in tokenized_data:
        counter.update(tokens)

    vocab = {'<PAD>': 0, '<UNK>': 1}

    # A corpus token that is literally '<PAD>' or '<UNK>' must not overwrite a
    # reserved id; that would also hand the same id to the following token.
    for reserved in vocab:
        counter.pop(reserved, None)

    # Add the most frequent tokens, each taking the next free id.
    for word, _ in counter.most_common(max(max_vocab_size - len(vocab), 0)):
        vocab[word] = len(vocab)

    return vocab

In [29]:
def encode_text(tokens, vocab, max_seq_len):
    """Map tokens to vocabulary ids and force the result to a fixed length.

    Tokens missing from ``vocab`` are replaced by the ``<UNK>`` id. Sequences
    shorter than ``max_seq_len`` are right-padded with the ``<PAD>`` id; longer
    ones are cut off after ``max_seq_len`` ids.

    Args:
        tokens: List of string tokens.
        vocab: Mapping from token to id containing ``<PAD>`` and ``<UNK>``.
        max_seq_len: Length of the returned id list.

    Returns:
        list: The id sequence after padding or truncation.
    """
    ids = [vocab.get(tok, vocab['<UNK>']) for tok in tokens]
    shortfall = max_seq_len - len(ids)
    if shortfall > 0:
        # Too short: fill the tail with padding ids.
        return ids + [vocab['<PAD>']] * shortfall
    # Long enough: keep only the leading max_seq_len ids.
    return ids[:max_seq_len]


In [49]:
import torch  # __getitem__ builds tensors, so this cell must bind the name itself
from torch.utils.data import Dataset, DataLoader

# Every encoded review is padded or truncated to this many token ids.
MAX_SEQ_LEN = 30
# Upper bound handed to build_vocab when the training vocabulary is built.
VOCAB_SIZE = 20000


class SentimentDataset(Dataset):
    """Tab-separated review file exposed as (token id tensor, label tensor) pairs.

    The file must provide a ``document`` column (review text) and a ``label``
    column. Rows with a missing or duplicated ``document`` are dropped, the text
    is tokenized with ``preprocessing_text``, and rows that end up with no
    tokens are discarded.

    Attributes set by ``__init__``:
        vocab: Token -> id mapping used for encoding.
        inputs: One list of ``MAX_SEQ_LEN`` ids per kept row.
        labels: The ``label`` value of each kept row, in the same order.
    """

    def __init__(self, filepath, vocab=None, is_train=True):
        """Load, clean, tokenize and encode the file at ``filepath``.

        Args:
            filepath: Path of the tab-separated input file.
            vocab: Existing token -> id mapping. Required when ``is_train`` is
                False; ignored when ``is_train`` is True.
            is_train: When True, a new vocabulary is built from this file.

        Raises:
            ValueError: If ``is_train`` is False and no ``vocab`` is supplied.
        """
        # Fail before any file I/O or tokenization instead of crashing later
        # inside encode_text with an opaque error on a None vocabulary.
        if not is_train and vocab is None:
            raise ValueError("vocab must be provided when is_train=False")

        reviews = (
            pd.read_csv(filepath, sep='\t')
            .dropna(subset=['document'])
            .drop_duplicates(subset=['document'])
        )

        tokens = reviews['document'].apply(preprocessing_text)
        # Reviews that lose every token during preprocessing carry no signal.
        keep = tokens.map(len) > 0
        tokens = tokens[keep]

        self.vocab = build_vocab(tokens, VOCAB_SIZE) if is_train else vocab
        self.inputs = [encode_text(doc_tokens, self.vocab, MAX_SEQ_LEN) for doc_tokens in tokens]
        self.labels = reviews.loc[keep, 'label'].tolist()

    def __len__(self):
        """Return the number of kept reviews."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(ids, label)`` for row ``idx`` as long and float tensors."""
        x = torch.tensor(self.inputs[idx], dtype=torch.long)
        # Float labels match what a binary cross-entropy loss expects as target.
        y = torch.tensor(self.labels[idx], dtype=torch.float)
        return x, y


In [50]:
import torch.nn as nn
class SentimentLSTM(nn.Module):
    """Embedding -> LSTM -> linear -> sigmoid binary classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        # padding_idx=0 keeps the embedding of token id 0 fixed at zeros.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one probability per input sequence.

        Args:
            x: Long tensor of token ids, shape ``(batch_size, seq_len)``.

        Returns:
            Tensor of shape ``(batch_size,)`` with values in ``[0, 1]``.
        """
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        # Final hidden state of the last layer; for this single-layer,
        # unidirectional LSTM it equals lstm_out[:, -1, :].
        last_hidden = hidden[-1]
        output = self.fc(last_hidden)  # (batch_size, 1)
        # Squeeze only the trailing feature axis. A bare squeeze() would also
        # drop the batch axis when batch_size == 1, giving a 0-d tensor whose
        # shape no longer matches the (1,) label tensor passed to the loss.
        return self.sigmoid(output).squeeze(-1)

In [51]:
train_dataset = SentimentDataset("/kaggle/working/ratings_train.txt")
vocab = train_dataset.vocab

In [52]:
test_dataset = SentimentDataset("/kaggle/working/ratings_test.txt", vocab = vocab, is_train=False)

In [53]:
BATCH_SIZE = 64
LR = 0.001
EPOCHS = 5
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [54]:
import torch
import torch.optim as optim
# Size of each token embedding vector.
EMBED_DIM = 128
# Size of the LSTM hidden state.
HIDDEN_DIM = 256
# Run on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# len(vocab) is used as the embedding table size, giving one row per vocabulary entry.
model = SentimentLSTM(len(vocab), EMBED_DIM, HIDDEN_DIM).to(DEVICE)
criterion = nn.BCELoss() # binary cross-entropy loss
# Adam over all model parameters, with the learning rate LR set in an earlier cell.
optimizer = optim.Adam(model.parameters(), lr=LR)

In [55]:
# Train for EPOCHS passes over train_loader, reporting the mean batch loss and
# the training accuracy after every epoch.
for epoch in range(EPOCHS):
    model.train()
    # Per-epoch accumulators: summed batch losses, correct predictions, samples seen.
    total_loss = 0
    correct = 0
    total = 0
    
    for inputs, labels in train_loader:
        # Move the batch to the same device as the model.
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        
        # Standard optimisation step: clear old gradients, forward pass,
        # compute the loss, back-propagate, update the parameters.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        # Predict positive (1) when the output is at least 0.5, otherwise negative (0)
        predicted = (outputs >= 0.5).float()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
        
    train_acc = correct / total
    # Mean of the per-batch losses (divided by the number of batches, not samples).
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {avg_loss:.4f}, Accuracy: {train_acc:.4f}")

Epoch 1/5 - Loss: 0.4596, Accuracy: 0.7753
Epoch 2/5 - Loss: 0.3358, Accuracy: 0.8551
Epoch 3/5 - Loss: 0.2802, Accuracy: 0.8830
Epoch 4/5 - Loss: 0.2305, Accuracy: 0.9063
Epoch 5/5 - Loss: 0.1846, Accuracy: 0.9266


In [56]:
def predict_sentiment(model, vocab, sentence):
    """Classify one sentence and print the predicted sentiment.

    The sentence is tokenized with ``preprocessing_text``, encoded to
    ``MAX_SEQ_LEN`` ids with ``encode_text``, and passed through ``model`` in
    evaluation mode as a batch of one. An output of at least 0.5 is reported
    as "긍정" (positive), anything lower as "부정" (negative).

    Args:
        model: Trained classifier that returns a single probability.
        vocab: Token -> id mapping the model was trained with.
        sentence: Raw input text.

    Returns:
        None. The sentence, the label and the probability are printed.
    """
    model.eval()
    token_ids = encode_text(preprocessing_text(sentence), vocab, MAX_SEQ_LEN)
    # Wrap the id list in an outer list to add the batch dimension.
    batch = torch.tensor([token_ids], dtype=torch.long).to(DEVICE)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        probability = model(batch).item()
    if probability >= 0.5:
        label = "긍정"
    else:
        label = "부정"
    print(f"입력 문장: \"{sentence}\"")
    print(f"예측 결과: {label} (확률: {probability*100:.2f}%)\n")

In [57]:
predict_sentiment(model, vocab, "사랑해")

입력 문장: "사랑해"
예측 결과: 긍정 (확률: 89.86%)

