In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, GloVe
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import numpy as np

# 하이퍼파라미터 설정
BATCH_SIZE = 32
EMBEDDING_DIM = 100  # GloVe 임베딩 차원과 일치시킴
NUM_CLASSES = 3  # 클래스 수 (negative, neutral, positive)
NUM_EPOCHS = 20
LEARNING_RATE = 5e-4
MAX_VOCAB_SIZE = 20000
MAX_SEQ_LEN = 256

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 데이터셋 로드
dataset = load_dataset('financial_phrasebank', 'sentences_allagree')

# 데이터프레임으로 변환
data = dataset['train'].to_pandas()

# 클래스 이름 가져오기
label_names = dataset['train'].features['label'].names

# 데이터 분할
train_data, temp_data = train_test_split(
    data, test_size=0.2, stratify=data['label'], random_state=42)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

# 어휘 사전 구축
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for sentence in data_iter:
        yield tokenizer(sentence)

vocab = build_vocab_from_iterator(
    yield_tokens(train_data['sentence']), 
    max_tokens=MAX_VOCAB_SIZE, 
    specials=['<pad>', '<unk>']
)
vocab.set_default_index(vocab['<unk>'])

# 사전 학습된 GloVe 임베딩 로드
glove = GloVe(name='6B', dim=EMBEDDING_DIM)

# 임베딩 매트릭스 생성
embedding_matrix = torch.zeros(len(vocab), EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove.stoi:
        embedding_matrix[i] = glove[token]
    else:
        embedding_matrix[i] = torch.randn(EMBEDDING_DIM)

# 데이터셋 클래스 정의
class FinancialPhraseBankDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data.reset_index(drop=True)
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data.loc[idx, 'sentence']
        label = self.data.loc[idx, 'label']
        tokens = self.tokenizer(sentence)
        token_ids = [self.vocab[token] for token in tokens]
        # 시퀀스 길이 조정 및 패딩
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [self.vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return text_tensor, label_tensor

# 데이터셋 및 데이터 로더 생성
train_dataset = FinancialPhraseBankDataset(train_data, vocab, tokenizer)
val_dataset = FinancialPhraseBankDataset(val_data, vocab, tokenizer)
test_dataset = FinancialPhraseBankDataset(test_data, vocab, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 클래스 가중치 계산
class_weights = compute_class_weight(
    class_weight='balanced', classes=np.unique(train_data['label']), y=train_data['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# 모델 정의
class CNNTransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
        super(CNNTransformerModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
        )
        self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

        # CNN 인코더
        self.cnn_encoder = nn.Conv1d(
            embedding_dim, embedding_dim, kernel_size=3, padding=1, stride=2
        )
        self.cnn_encoder_residual = nn.Conv1d(
            embedding_dim, embedding_dim, kernel_size=1, stride=2
        )

        # 트랜스포머 인코더 레이어
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim, nhead=4, dropout=0.1, activation='relu', batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=1, norm=nn.LayerNorm(embedding_dim)
        )

        # CNN 디코더
        self.cnn_decoder = nn.ConvTranspose1d(
            embedding_dim, embedding_dim, kernel_size=3, padding=1, stride=2, output_padding=1
        )
        self.cnn_decoder_residual = nn.ConvTranspose1d(
            embedding_dim, embedding_dim, kernel_size=1, stride=2, output_padding=1
        )

        # 출력 레이어
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        batch_size, seq_len = x.size()
        x = self.embedding(x)
        positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)
        x = x + self.position_embedding(positions)
        x = x.permute(0, 2, 1)
        # CNN 인코더와 잔차 연결
        residual = self.cnn_encoder_residual(x)
        x = self.cnn_encoder(x)
        x = nn.ReLU()(x + residual)
        x = x.permute(0, 2, 1)
        # 패딩 마스크 생성
        src_key_padding_mask = (x.abs().sum(dim=2) == 0)
        # 트랜스포머 인코더
        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)
        x = x.permute(0, 2, 1)
        # CNN 디코더와 잔차 연결
        residual = self.cnn_decoder_residual(x)
        x = self.cnn_decoder(x)
        x = nn.ReLU()(x + residual)
        # 글로벌 평균 풀링
        x = x.mean(dim=2)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits

# 모델 초기화
model = CNNTransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

# 손실 함수와 옵티마이저 설정
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# 학습 루프
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for texts, labels in progress_bar:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.4f}")

    # 검증 데이터로 평가
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_accuracy = correct / total
    print(f"Validation Accuracy after Epoch {epoch+1}: {val_accuracy * 100:.2f}%\n")

# 테스트 데이터로 평가
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# 분류 보고서 출력
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=label_names))

torch.save(model.state_dict(), 'cnn_transformer_model-finance.pth')


Epoch 1/20: 100%|██████████| 57/57 [00:08<00:00,  7.11it/s, loss=1.08]


Epoch [1/20], Average Loss: 1.1177
Validation Accuracy after Epoch 1: 60.18%



Epoch 2/20: 100%|██████████| 57/57 [00:08<00:00,  6.86it/s, loss=1.04]


Epoch [2/20], Average Loss: 1.0967
Validation Accuracy after Epoch 2: 55.75%



Epoch 3/20: 100%|██████████| 57/57 [00:07<00:00,  7.26it/s, loss=0.649]


Epoch [3/20], Average Loss: 0.9281
Validation Accuracy after Epoch 3: 66.37%



Epoch 4/20: 100%|██████████| 57/57 [00:07<00:00,  7.27it/s, loss=0.617]


Epoch [4/20], Average Loss: 0.8433
Validation Accuracy after Epoch 4: 64.60%



Epoch 5/20: 100%|██████████| 57/57 [00:08<00:00,  7.06it/s, loss=0.894]


Epoch [5/20], Average Loss: 0.7558
Validation Accuracy after Epoch 5: 53.10%



Epoch 6/20: 100%|██████████| 57/57 [00:08<00:00,  6.70it/s, loss=0.664]


Epoch [6/20], Average Loss: 0.6848
Validation Accuracy after Epoch 6: 54.42%



Epoch 7/20: 100%|██████████| 57/57 [00:07<00:00,  7.19it/s, loss=0.638]


Epoch [7/20], Average Loss: 0.6748
Validation Accuracy after Epoch 7: 63.27%



Epoch 8/20: 100%|██████████| 57/57 [00:07<00:00,  7.27it/s, loss=0.506]


Epoch [8/20], Average Loss: 0.6335
Validation Accuracy after Epoch 8: 60.18%



Epoch 9/20: 100%|██████████| 57/57 [00:08<00:00,  7.01it/s, loss=0.563]


Epoch [9/20], Average Loss: 0.5614
Validation Accuracy after Epoch 9: 60.62%



Epoch 10/20: 100%|██████████| 57/57 [00:07<00:00,  7.23it/s, loss=0.88] 


Epoch [10/20], Average Loss: 0.5389
Validation Accuracy after Epoch 10: 51.33%



Epoch 11/20: 100%|██████████| 57/57 [00:08<00:00,  7.07it/s, loss=0.348]


Epoch [11/20], Average Loss: 0.4642
Validation Accuracy after Epoch 11: 58.41%



Epoch 12/20: 100%|██████████| 57/57 [00:08<00:00,  7.09it/s, loss=0.19] 


Epoch [12/20], Average Loss: 0.4161
Validation Accuracy after Epoch 12: 68.14%



Epoch 13/20: 100%|██████████| 57/57 [00:07<00:00,  7.22it/s, loss=0.287] 


Epoch [13/20], Average Loss: 0.2953
Validation Accuracy after Epoch 13: 62.39%



Epoch 14/20: 100%|██████████| 57/57 [00:07<00:00,  7.16it/s, loss=0.613] 


Epoch [14/20], Average Loss: 0.2629
Validation Accuracy after Epoch 14: 70.80%



Epoch 15/20: 100%|██████████| 57/57 [00:07<00:00,  7.15it/s, loss=0.429] 


Epoch [15/20], Average Loss: 0.2374
Validation Accuracy after Epoch 15: 68.58%



Epoch 16/20: 100%|██████████| 57/57 [00:08<00:00,  6.98it/s, loss=0.617] 


Epoch [16/20], Average Loss: 0.2184
Validation Accuracy after Epoch 16: 63.27%



Epoch 17/20: 100%|██████████| 57/57 [00:08<00:00,  7.08it/s, loss=0.342] 


Epoch [17/20], Average Loss: 0.1649
Validation Accuracy after Epoch 17: 67.70%



Epoch 18/20: 100%|██████████| 57/57 [00:07<00:00,  7.20it/s, loss=0.0133]


Epoch [18/20], Average Loss: 0.1173
Validation Accuracy after Epoch 18: 61.50%



Epoch 19/20: 100%|██████████| 57/57 [00:07<00:00,  7.20it/s, loss=0.155]  


Epoch [19/20], Average Loss: 0.1047
Validation Accuracy after Epoch 19: 63.27%



Epoch 20/20: 100%|██████████| 57/57 [00:07<00:00,  7.19it/s, loss=0.0983] 


Epoch [20/20], Average Loss: 0.1030
Validation Accuracy after Epoch 20: 61.50%

Test Accuracy: 64.76%

Classification Report:
              precision    recall  f1-score   support

    negative       0.50      0.74      0.60        31
     neutral       1.00      0.56      0.72       139
    positive       0.45      0.81      0.57        57

    accuracy                           0.65       227
   macro avg       0.65      0.70      0.63       227
weighted avg       0.79      0.65      0.67       227



In [None]:
''''
결과 분석
1.1 정확도 및 손실 값
최종 테스트 정확도: 약 64.76%
에포크별 손실 값: 초기에는 높은 손실 값에서 시작하여 에포크가 진행됨에 따라 손실이 감소하는 추세를 보입니다.
검증 정확도: 에포크마다 변동이 심하며, 꾸준히 상승하지 않고 불안정한 모습을 보입니다.
1.2 분류 보고서
Negative 클래스
정밀도(Precision): 0.50
재현율(Recall): 0.74
F1-스코어: 0.60
Neutral 클래스
정밀도: 1.00
재현율: 0.56
F1-스코어: 0.72
Positive 클래스
정밀도: 0.45
재현율: 0.81
F1-스코어: 0.57
1.3 주요 문제점
클래스 불균형: Neutral 클래스의 지원(Support) 수가 139로 다른 클래스에 비해 월등히 많습니다.
Neutral 클래스의 정밀도는 높지만 재현율이 낮음: 이는 모델이 Neutral로 예측한 것 중 실제로 맞은 비율은 높지만, 실제 Neutral인 것들을 많이 놓치고 있다는 의미입니다.
Positive 클래스의 재현율이 높지만 정밀도가 낮음: 모델이 Positive로 예측한 것 중 실제로 맞은 비율은 낮지만, 실제 Positive인 것들을 많이 찾아낸다는 의미입니다.
'''

In [15]:
# 예측 함수 정의
def predict(text):
    model.eval()
    with torch.no_grad():
        tokens = tokenizer(text)
        token_ids = [vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
        outputs = model(text_tensor)
        probabilities = nn.functional.softmax(outputs, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        class_names = label_names
        return class_names[predicted_class], probabilities.squeeze().cpu().numpy()

# 예시 문장 예측
sample_text = "The company's profits have increased significantly this quarter."
label, probs = predict(sample_text)
print(f"\nSentence: {sample_text}")
print(f"Predicted Sentiment: {label}")
print(f"Probabilities: Negative {probs[0]*100:.2f}%, Neutral {probs[1]*100:.2f}%, Positive {probs[2]*100:.2f}%")




Sentence: The company's profits have increased significantly this quarter.
Predicted Sentiment: negative
Probabilities: Negative 93.08%, Neutral 0.00%, Positive 6.92%


In [8]:
data

Unnamed: 0,sentence,label
0,"According to Gran , the company has no plans t...",1
1,"For the last quarter of 2010 , Componenta 's n...",2
2,"In the third quarter of 2010 , net sales incre...",2
3,Operating profit rose to EUR 13.1 mn from EUR ...,2
4,"Operating profit totalled EUR 21.1 mn , up fro...",2
...,...,...
2259,Operating result for the 12-month period decre...,0
2260,HELSINKI Thomson Financial - Shares in Cargote...,0
2261,LONDON MarketWatch -- Share prices ended lower...,0
2262,Operating profit fell to EUR 35.4 mn from EUR ...,0
