In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# データのロード
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
print("Loading 20 newsgroups dataset for categories:")

twenty_train = fetch_20newsgroups(subset='train',
                                  categories=categories,
                                  shuffle=True,
                                  random_state=42)
twenty_test = fetch_20newsgroups(subset='test',
                                 categories=categories,
                                 shuffle=True,
                                 random_state=42)

# データの分割
X_train = twenty_train.data
y_train = twenty_train.target
X_test = twenty_test.data
y_test = twenty_test.target

Loading 20 newsgroups dataset for categories:


In [5]:
# トークン化と語彙の作成
def tokenize(text):
    return word_tokenize(text.lower())

all_tokens = [tokenize(text) for text in X_train]
vocab = Counter(token for tokens in all_tokens for token in tokens)
vocab = {word: idx + 1 for idx, (word, _) in enumerate(vocab.most_common())}
vocab_size = len(vocab) + 1

# インデックス化とパディング
def encode(text, vocab, max_len):
    tokens = tokenize(text)
    indices = [vocab.get(token, 0) for token in tokens]
    return indices[:max_len] + [0] * (max_len - len(indices))

max_len = 100 # 最大長を設定

# インデックス化とパディングの適用
X_train_indices = [encode(text, vocab, max_len) for text in X_train]
X_test_indices = [encode(text, vocab, max_len) for text in X_test]

# NumPy配列に変換
X_train_indices = np.array(X_train_indices)
X_test_indices = np.array(X_test_indices)

# Tensorに変換
X_train_tensor = torch.tensor(X_train_indices, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_indices, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [6]:
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SimpleRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        hidden = hidden.squeeze(0)
        return self.fc(hidden)

class SimpleLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SimpleLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        hidden = hidden.squeeze(0)
        return self.fc(hidden)

# ハイパーパラメータの設定
embedding_dim = 50
hidden_dim = 64
output_dim = len(categories)

# モデルの初期化
# model = SimpleRNN(vocab_size, embedding_dim, hidden_dim, output_dim)
model = SimpleLSTM(vocab_size, embedding_dim, hidden_dim, output_dim)

In [7]:
# 損失関数とオプティマイザの設定
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# トレーニング
num_epochs = 30
batch_size = 32
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for i in range(0, len(X_train_tensor), batch_size):
        batch_X = X_train_tensor[i:i + batch_size]
        batch_y = y_train_tensor[i:i + batch_size]
        optimizer.zero_grad()
        predictions = model(batch_X)
        loss = criterion(predictions, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    print(f'Epoch {epoch + 1}, Loss: {epoch_loss / len(X_train_tensor)}')

print("Training complete.")

Epoch 1, Loss: 0.04293836980041382
Epoch 2, Loss: 0.041756625190265
Epoch 3, Loss: 0.03905835566818582
Epoch 4, Loss: 0.03508927212598105
Epoch 5, Loss: 0.03242576962083538
Epoch 6, Loss: 0.030109584569191584
Epoch 7, Loss: 0.029146573001640162
Epoch 8, Loss: 0.02206704131417112
Epoch 9, Loss: 0.022354017013884456
Epoch 10, Loss: 0.019368481387804712
Epoch 11, Loss: 0.016074574417543983
Epoch 12, Loss: 0.016002351565861734
Epoch 13, Loss: 0.02140328346177652
Epoch 14, Loss: 0.012287540850673146
Epoch 15, Loss: 0.014321390933889068
Epoch 16, Loss: 0.009628863307850836
Epoch 17, Loss: 0.0073847896330748505
Epoch 18, Loss: 0.006122737980992804
Epoch 19, Loss: 0.004842637904816937
Epoch 20, Loss: 0.003995174526389215
Epoch 21, Loss: 0.003301525968805741
Epoch 22, Loss: 0.016234083547627023
Epoch 23, Loss: 0.011401455283270613
Epoch 24, Loss: 0.0076468823841572015
Epoch 25, Loss: 0.004345250137292235
Epoch 26, Loss: 0.0030486927750261687
Epoch 27, Loss: 0.0024218876880383565
Epoch 28, Loss:

In [9]:
y_pred = model(X_test_tensor)
print(y_pred)

tensor([[-2.8090, -1.2400,  4.7774, -0.9905],
        [-3.5529,  3.4132,  0.4723, -1.0061],
        [ 0.6659, -2.3519,  3.5514, -2.1321],
        ...,
        [-3.2773, -0.4995,  5.1732, -2.2450],
        [ 2.0873, -3.2320,  1.1048,  0.1990],
        [-3.8039,  2.0626,  1.8164, -0.4176]], grad_fn=<AddmmBackward0>)


In [27]:
!pip install torcheval

from torcheval.metrics.functional import multiclass_f1_score
# from torcheval.metrics import MulticlassF1Score


f1 = multiclass_f1_score(
    input=y_pred,
    target=y_test_tensor,
    num_classes=4,
    average="weighted"
).item()
# metric = MulticlassF1Score(num_classes=10, average="micro")
# metric.update(out, labels)
# f1 = metric.compute().item()
print("F1 score: {f1}")


0.659341037273407


'\nf1 = 0.4000000059604645\n'