In [3]:
!pip uninstall torch -y

Found existing installation: torch 2.4.0+cu121
Uninstalling torch-2.4.0+cu121:
  Successfully uninstalled torch-2.4.0+cu121


In [1]:
import torch
print(torch.__version__)

2.2.0+cu121


In [4]:
!pip install torchtext==0.17

Collecting torchtext==0.17
  Downloading torchtext-0.17.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.6 kB)
Collecting torch==2.2.0 (from torchtext==0.17)
  Using cached torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchdata==0.7.1 (from torchtext==0.17)
  Downloading torchdata-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.0->torchtext==0.17)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nccl-cu12==2.19.3 (from torch==2.2.0->torchtext==0.17)
  Using cached nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata (1.8 kB)
Collecting triton==2.2.0 (from torch==2.2.0->torchtext==0.17)
  Using cached triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading torchtext-0.17.0-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-an

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, GloVe
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import numpy as np

# 하이퍼파라미터 설정
BATCH_SIZE = 32
EMBEDDING_DIM = 100  # GloVe 임베딩 차원과 일치시킴
NUM_CLASSES = 3  # 클래스 수 (negative, neutral, positive)
NUM_EPOCHS = 20
LEARNING_RATE = 5e-4
MAX_VOCAB_SIZE = 20000
MAX_SEQ_LEN = 256

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 데이터셋 로드
dataset = load_dataset('financial_phrasebank', 'sentences_allagree')

# 데이터프레임으로 변환
data = dataset['train'].to_pandas()

# 클래스 이름 가져오기
label_names = dataset['train'].features['label'].names

# 데이터 분할
train_data, temp_data = train_test_split(
    data, test_size=0.2, stratify=data['label'], random_state=42)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

# 어휘 사전 구축
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for sentence in data_iter:
        yield tokenizer(sentence)

vocab = build_vocab_from_iterator(
    yield_tokens(train_data['sentence']),
    max_tokens=MAX_VOCAB_SIZE,
    specials=['<pad>', '<unk>']
)
vocab.set_default_index(vocab['<unk>'])

# 사전 학습된 GloVe 임베딩 로드
glove = GloVe(name='6B', dim=EMBEDDING_DIM)

# 임베딩 매트릭스 생성
embedding_matrix = torch.zeros(len(vocab), EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove.stoi:
        embedding_matrix[i] = glove[token]
    else:
        embedding_matrix[i] = torch.randn(EMBEDDING_DIM)

# 데이터셋 클래스 정의
class FinancialPhraseBankDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data.reset_index(drop=True)
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data.loc[idx, 'sentence']
        label = self.data.loc[idx, 'label']
        tokens = self.tokenizer(sentence)
        token_ids = [self.vocab[token] for token in tokens]
        # 시퀀스 길이 조정 및 패딩
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [self.vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return text_tensor, label_tensor

# 데이터셋 및 데이터 로더 생성
train_dataset = FinancialPhraseBankDataset(train_data, vocab, tokenizer)
val_dataset = FinancialPhraseBankDataset(val_data, vocab, tokenizer)
test_dataset = FinancialPhraseBankDataset(test_data, vocab, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 클래스 가중치 계산
class_weights = compute_class_weight(
    class_weight='balanced', classes=np.unique(train_data['label']), y=train_data['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# 모델 정의
class CNNTransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
        super(CNNTransformerModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
        )
        self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

        # CNN 인코더
        self.cnn_encoder = nn.Conv1d(
            embedding_dim, embedding_dim, kernel_size=3, padding=1, stride=2
        )
        self.cnn_encoder_residual = nn.Conv1d(
            embedding_dim, embedding_dim, kernel_size=1, stride=2
        )

        # 트랜스포머 인코더 레이어
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim, nhead=4, dropout=0.1, activation='relu', batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=1, norm=nn.LayerNorm(embedding_dim)
        )

        # CNN 디코더
        self.cnn_decoder = nn.ConvTranspose1d(
            embedding_dim, embedding_dim, kernel_size=3, padding=1, stride=2, output_padding=1
        )
        self.cnn_decoder_residual = nn.ConvTranspose1d(
            embedding_dim, embedding_dim, kernel_size=1, stride=2, output_padding=1
        )

        # 출력 레이어
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        batch_size, seq_len = x.size()
        x = self.embedding(x)
        positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)
        x = x + self.position_embedding(positions)
        x = x.permute(0, 2, 1)
        # CNN 인코더와 잔차 연결
        residual = self.cnn_encoder_residual(x)
        x = self.cnn_encoder(x)
        x = nn.ReLU()(x + residual)
        x = x.permute(0, 2, 1)
        # 패딩 마스크 생성
        src_key_padding_mask = (x.abs().sum(dim=2) == 0)
        # 트랜스포머 인코더
        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)
        x = x.permute(0, 2, 1)
        # CNN 디코더와 잔차 연결
        residual = self.cnn_decoder_residual(x)
        x = self.cnn_decoder(x)
        x = nn.ReLU()(x + residual)
        # 글로벌 평균 풀링
        x = x.mean(dim=2)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits

# 모델 초기화
model = CNNTransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

# 손실 함수와 옵티마이저 설정
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# 학습 루프
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for texts, labels in progress_bar:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.4f}")

    # 검증 데이터로 평가
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_accuracy = correct / total
    print(f"Validation Accuracy after Epoch {epoch+1}: {val_accuracy * 100:.2f}%\n")

# 테스트 데이터로 평가
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# 분류 보고서 출력
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=label_names))

torch.save(model.state_dict(), 'cnn_transformer_model-finance.pth')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


financial_phrasebank.py:   0%|          | 0.00/6.04k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

The repository for financial_phrasebank contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/financial_phrasebank.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


FinancialPhraseBank-v1.0.zip:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

.vector_cache/glove.6B.zip: 862MB [02:39, 5.39MB/s]                           
100%|█████████▉| 399999/400000 [00:20<00:00, 19515.26it/s]
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Epoch 1/20: 100%|██████████| 57/57 [00:01<00:00, 47.30it/s, loss=1.09]
  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


Epoch [1/20], Average Loss: 1.1096
Validation Accuracy after Epoch 1: 25.22%



Epoch 2/20: 100%|██████████| 57/57 [00:00<00:00, 81.35it/s, loss=1.04]


Epoch [2/20], Average Loss: 1.1118
Validation Accuracy after Epoch 2: 41.59%



Epoch 3/20: 100%|██████████| 57/57 [00:00<00:00, 83.97it/s, loss=1.05]


Epoch [3/20], Average Loss: 0.9779
Validation Accuracy after Epoch 3: 60.18%



Epoch 4/20: 100%|██████████| 57/57 [00:00<00:00, 82.72it/s, loss=0.613]


Epoch [4/20], Average Loss: 0.8668
Validation Accuracy after Epoch 4: 72.57%



Epoch 5/20: 100%|██████████| 57/57 [00:00<00:00, 82.23it/s, loss=0.687]


Epoch [5/20], Average Loss: 0.7452
Validation Accuracy after Epoch 5: 66.37%



Epoch 6/20: 100%|██████████| 57/57 [00:00<00:00, 83.38it/s, loss=0.55]


Epoch [6/20], Average Loss: 0.6787
Validation Accuracy after Epoch 6: 72.12%



Epoch 7/20: 100%|██████████| 57/57 [00:00<00:00, 81.72it/s, loss=0.81]


Epoch [7/20], Average Loss: 0.6293
Validation Accuracy after Epoch 7: 73.45%



Epoch 8/20: 100%|██████████| 57/57 [00:00<00:00, 83.04it/s, loss=0.405]


Epoch [8/20], Average Loss: 0.5889
Validation Accuracy after Epoch 8: 74.34%



Epoch 9/20: 100%|██████████| 57/57 [00:00<00:00, 81.29it/s, loss=0.614]


Epoch [9/20], Average Loss: 0.5510
Validation Accuracy after Epoch 9: 76.11%



Epoch 10/20: 100%|██████████| 57/57 [00:00<00:00, 82.08it/s, loss=0.423]


Epoch [10/20], Average Loss: 0.4763
Validation Accuracy after Epoch 10: 74.34%



Epoch 11/20: 100%|██████████| 57/57 [00:00<00:00, 80.71it/s, loss=0.167]


Epoch [11/20], Average Loss: 0.4260
Validation Accuracy after Epoch 11: 83.19%



Epoch 12/20: 100%|██████████| 57/57 [00:00<00:00, 83.90it/s, loss=0.682]


Epoch [12/20], Average Loss: 0.3038
Validation Accuracy after Epoch 12: 83.19%



Epoch 13/20: 100%|██████████| 57/57 [00:00<00:00, 82.11it/s, loss=0.183]


Epoch [13/20], Average Loss: 0.2566
Validation Accuracy after Epoch 13: 85.84%



Epoch 14/20: 100%|██████████| 57/57 [00:00<00:00, 83.65it/s, loss=0.247]


Epoch [14/20], Average Loss: 0.1972
Validation Accuracy after Epoch 14: 85.84%



Epoch 15/20: 100%|██████████| 57/57 [00:00<00:00, 81.61it/s, loss=0.235]


Epoch [15/20], Average Loss: 0.1757
Validation Accuracy after Epoch 15: 84.51%



Epoch 16/20: 100%|██████████| 57/57 [00:00<00:00, 77.92it/s, loss=0.0282]


Epoch [16/20], Average Loss: 0.1205
Validation Accuracy after Epoch 16: 83.63%



Epoch 17/20: 100%|██████████| 57/57 [00:00<00:00, 84.05it/s, loss=0.471]


Epoch [17/20], Average Loss: 0.1613
Validation Accuracy after Epoch 17: 87.17%



Epoch 18/20: 100%|██████████| 57/57 [00:00<00:00, 82.54it/s, loss=0.0642]


Epoch [18/20], Average Loss: 0.1404
Validation Accuracy after Epoch 18: 84.51%



Epoch 19/20: 100%|██████████| 57/57 [00:00<00:00, 82.95it/s, loss=0.0954]


Epoch [19/20], Average Loss: 0.1172
Validation Accuracy after Epoch 19: 79.20%



Epoch 20/20: 100%|██████████| 57/57 [00:00<00:00, 84.37it/s, loss=0.245]


Epoch [20/20], Average Loss: 0.0956
Validation Accuracy after Epoch 20: 81.86%

Test Accuracy: 79.74%

Classification Report:
              precision    recall  f1-score   support

    negative       0.73      0.77      0.75        31
     neutral       0.97      0.80      0.87       139
    positive       0.58      0.81      0.68        57

    accuracy                           0.80       227
   macro avg       0.76      0.79      0.77       227
weighted avg       0.84      0.80      0.81       227



In [3]:
# 예측 함수 정의
def predict(text):
    model.eval()
    with torch.no_grad():
        tokens = tokenizer(text)
        token_ids = [vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
        outputs = model(text_tensor)
        probabilities = nn.functional.softmax(outputs, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        class_names = label_names
        return class_names[predicted_class], probabilities.squeeze().cpu().numpy()

# 예시 문장 예측
sample_text = "The company's profits have increased significantly this quarter."
label, probs = predict(sample_text)
print(f"\nSentence: {sample_text}")
print(f"Predicted Sentiment: {label}")
print(f"Probabilities: Negative {probs[0]*100:.2f}%, Neutral {probs[1]*100:.2f}%, Positive {probs[2]*100:.2f}%")



Sentence: The company's profits have increased significantly this quarter.
Predicted Sentiment: positive
Probabilities: Negative 0.28%, Neutral 0.13%, Positive 99.59%
