In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader , Dataset

import numpy as np
import pandas as pd
import re


In [2]:
df = pd.read_csv('IMDB Dataset.csv')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
print(df[['review', 'sentiment']].head())

                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1


Text → tokenizer → vocab lookup → embedding → LSTM → hidden layer → linear → softmax → Classify

### Text Processing

In [3]:
from tokenizer import simple_tokenizer
from vocab import build_vocab , tokens_to_ids
from IMDBdataset import IMDBDataset
texts = df['review'].tolist()
# build vocab
vocab = build_vocab(df['review'].tolist())
# build dataset
dataset = IMDBDataset(df, vocab, simple_tokenizer, max_len=100)

In [4]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
for batch in dataloader:
    print(batch['input_ids'].shape)  # torch.Size([32, 100])
    print(batch['label'].shape)      # torch.Size([32])
    break

torch.Size([32, 100])
torch.Size([32])


### Model

In [5]:
from model import IMDBClassifier
vocab_size = len(vocab)
model = IMDBClassifier(vocab_size=vocab_size, embedding_dim=128, hidden_dim=128, num_classes=2)

### Train

In [6]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

Dataset: 文字 → token → id <br>
↓<br>
Model:<br>
Embedding → LSTM → Linear<br>
↓<br>
CrossEntropyLoss + Optimizer<br>


In [7]:
for epoch in range(20):
    model.train()

    total_loss = 0
    total_correct = 0
    total_samples = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        # Forward
        outputs = model(input_ids)   
        # Loss
        loss = criterion(outputs, labels)
        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

 
        total_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)  
        total_correct += (preds == labels).sum().item()
        total_samples += labels.size(0)

    avg_loss = total_loss / len(dataloader)
    acc = total_correct / total_samples

    print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}, Accuracy = {acc:.4f}")



Epoch 1: Loss = 0.6767, Accuracy = 0.5651
Epoch 2: Loss = 0.5344, Accuracy = 0.7467
Epoch 3: Loss = 0.3956, Accuracy = 0.8287
Epoch 4: Loss = 0.3263, Accuracy = 0.8636
Epoch 5: Loss = 0.2697, Accuracy = 0.8905
Epoch 6: Loss = 0.2172, Accuracy = 0.9154
Epoch 7: Loss = 0.1575, Accuracy = 0.9403
Epoch 8: Loss = 0.1074, Accuracy = 0.9623
Epoch 9: Loss = 0.0746, Accuracy = 0.9757
Epoch 10: Loss = 0.0509, Accuracy = 0.9838
Epoch 11: Loss = 0.0387, Accuracy = 0.9884
Epoch 12: Loss = 0.0315, Accuracy = 0.9910
Epoch 13: Loss = 0.0277, Accuracy = 0.9916
Epoch 14: Loss = 0.0224, Accuracy = 0.9929
Epoch 15: Loss = 0.0252, Accuracy = 0.9926
Epoch 16: Loss = 0.0201, Accuracy = 0.9947
Epoch 17: Loss = 0.0143, Accuracy = 0.9962
Epoch 18: Loss = 0.0204, Accuracy = 0.9939
Epoch 19: Loss = 0.0168, Accuracy = 0.9953
Epoch 20: Loss = 0.0208, Accuracy = 0.9938


In [15]:
torch.save(model.state_dict(), 'imdb_classifier.pth')
model = IMDBClassifier(vocab_size=vocab_size, embedding_dim=128, hidden_dim=128, num_classes=2)
model.load_state_dict(torch.load('imdb_classifier.pth'))
model.to(device)

  model.load_state_dict(torch.load('imdb_classifier.pth'))


IMDBClassifier(
  (embedding): Embedding(10000, 128, padding_idx=0)
  (lstm): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

In [16]:
def predict_sentiment(model, vocab, tokenizer, text, max_len=100, device='cpu'):
    model.eval()  
    text = re.sub(r'<.*?>', '', text)
    tokens = tokenizer(text)
    token_ids = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    if len(token_ids) < max_len:
        token_ids += [vocab['<PAD>']] * (max_len - len(token_ids))
    else:
        token_ids = token_ids[:max_len]
    input_ids = torch.tensor([token_ids], dtype=torch.long).to(device)  # shape: [1, max_len]



    #  forward
    with torch.no_grad():
        outputs = model(input_ids)  # shape: [1, 2]

        # softmax  
        probs = F.softmax(outputs, dim=1)
        pred_class = torch.argmax(probs, dim=1).item()
        confidence = probs[0, pred_class].item()

    #  output ...
    label_map = {0: "negative", 1: "positive"}
    pred_label = label_map[pred_class]

    return pred_label, confidence



"This movie is fantastic! I really loved it."

In [17]:
text = "This movie is fantastic! I really loved it." #input()
label, prob = predict_sentiment(model, vocab, simple_tokenizer, text, max_len=100, device=device)
print(f"Preduct Result: {label}, Prob: {prob:.4f}")


Preduct Result: positive, Prob: 0.9995
