In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd
import re


In [2]:
df = pd.read_csv('IMDB Dataset.csv')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
print(df[['review', 'sentiment']].head())

                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1


Text → tokenizer → vocab lookup → embedding → LSTM → hidden layer → linear → softmax → Classify

### Text Processing

In [3]:
# Tokenizer
def simple_tokenizer(text):
    text = re.sub(r'<.*?>', '', text) # remove html mark
    text = text.lower()
    tokens = re.findall(r'\b\w+\b', text)

    return tokens

In [4]:

# Vocab : Dict 
texts = df['review'].tolist()
from collections import Counter

def build_vocab(texts):
    counter = Counter()
    for text in texts:
        tokens = simple_tokenizer(text)
        counter.update(tokens)

    max_vocab_size = 10000
    most_common = counter.most_common(max_vocab_size - 2)  # minus PAD、UNK

    # 建立 vocab dict
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for idx, (word, freq) in enumerate(most_common, start=2):
        vocab[word] = idx

    return vocab

def tokens_to_ids(tokens, vocab):
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]


# [('<PAD>', 0), ('<UNK>', 1), ('the', 2), ('and', 3), ('a', 4), ('of', 5), ('to', 6), ('is', 7), ('it', 8), ('in', 9)]

In [5]:
from torch.utils.data import Dataset


# Text to Tokenizer & Vocab
class IMDBDataset(Dataset):
    def __init__(self, data, vocab, tokenizer, max_len=100):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['review']
        label = self.data.iloc[idx]['sentiment']

        tokens = self.tokenizer(text)  
        token_ids = tokens_to_ids(tokens, self.vocab)

        # 補齊 padding
        if len(token_ids) < self.max_len:
            token_ids += [self.vocab['<PAD>']] * (self.max_len - len(token_ids))
        else:
            token_ids = token_ids[:self.max_len]

        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long)
        }

    def __len__(self):
        return len(self.data)



In [6]:
# build vocab
vocab = build_vocab(df['review'].tolist())

# build dataset
dataset = IMDBDataset(df, vocab, simple_tokenizer, max_len=100)


In [7]:
from torch.utils.data import DataLoader

dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

 
for batch in dataloader:
    print(batch['input_ids'].shape)  # torch.Size([32, 100])
    print(batch['label'].shape)      # torch.Size([32])
    break

torch.Size([32, 100])
torch.Size([32])


### Model

In [8]:
class IMDBClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128, num_classes=2):
        super(IMDBClassifier, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids):
        # input_ids: [batch_size, seq_len]
        # embedding
        embedded = self.embedding(input_ids)  # [batch_size, seq_len, embedding_dim]
        # LSTM
        output, (hidden, cell) = self.lstm(embedded)  # hidden: [1, batch_size, hidden_dim]
        final_hidden = hidden[-1]  # [batch_size, hidden_dim]
        logits = self.fc(final_hidden)  # [batch_size, num_classes]
        return logits


In [9]:
vocab_size = len(vocab)
model = IMDBClassifier(vocab_size=vocab_size, embedding_dim=128, hidden_dim=128, num_classes=2)

In [10]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

Dataset: 文字 → token → id <br>
↓<br>
Model:<br>
Embedding → LSTM → Linear<br>
↓<br>
CrossEntropyLoss + Optimizer<br>


In [11]:
for epoch in range(5):
    model.train()

    total_loss = 0
    total_correct = 0
    total_samples = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        # Forward
        outputs = model(input_ids)   
        # Loss
        loss = criterion(outputs, labels)
        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

 
        total_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)  
        total_correct += (preds == labels).sum().item()
        total_samples += labels.size(0)

    avg_loss = total_loss / len(dataloader)
    acc = total_correct / total_samples

    print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}, Accuracy = {acc:.4f}")



Epoch 1: Loss = 0.6578, Accuracy = 0.5998
Epoch 2: Loss = 0.4832, Accuracy = 0.7797
Epoch 3: Loss = 0.3637, Accuracy = 0.8420
Epoch 4: Loss = 0.3001, Accuracy = 0.8752
Epoch 5: Loss = 0.2397, Accuracy = 0.9051


In [12]:
def predict_sentiment(model, vocab, tokenizer, text, max_len=100, device='cpu'):
    model.eval()  
    text = re.sub(r'<.*?>', '', text)
    tokens = tokenizer(text)
    token_ids = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    if len(token_ids) < max_len:
        token_ids += [vocab['<PAD>']] * (max_len - len(token_ids))
    else:
        token_ids = token_ids[:max_len]
    input_ids = torch.tensor([token_ids], dtype=torch.long).to(device)  # shape: [1, max_len]



    #  forward
    with torch.no_grad():
        outputs = model(input_ids)  # shape: [1, 2]

        # softmax  
        probs = F.softmax(outputs, dim=1)
        pred_class = torch.argmax(probs, dim=1).item()
        confidence = probs[0, pred_class].item()

    #  output ...
    label_map = {0: "negative", 1: "positive"}
    pred_label = label_map[pred_class]

    return pred_label, confidence



"This movie is fantastic! I really loved it."

In [14]:
text = input()
label, prob = predict_sentiment(model, vocab, simple_tokenizer, text, max_len=100, device=device)
print(f"Preduct Result: {label}, Prob: {prob:.4f}")


Preduct Result: positive, Prob: 0.9706
