In [2]:
# !pip uninstall torch -y

In [3]:
import torch
print(torch.__version__)

2.3.0


In [4]:
# !pip install torchtext==0.17

In [1]:
# !pip install datasets

In [None]:
#최조코드
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, build_vocab_from_iterator
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm
import re

# 하이퍼파라미터 설정
BATCH_SIZE = 64
EMBEDDING_DIM = 200  # GloVe-Twitter 임베딩 차원 (200차원 벡터 사용)
NUM_CLASSES = 3  # 클래스 수 (Negative, Neutral, Positive)
NUM_EPOCHS = 20  # 학습 에폭 수
LEARNING_RATE = 1e-4  # 학습률
MAX_VOCAB_SIZE = 20000  # 최대 어휘 사전 크기
MAX_SEQ_LEN = 256  # 최대 시퀀스 길이

# 디바이스 설정 (GPU 사용 가능 시 GPU 사용)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 모델 타입 선택: 'hybrid', 'cnn', 'transformer'
model_type = 'hybrid'  # 현재는 하이브리드 모델 사용

# 데이터셋 로드
dataset1 = load_dataset('financial_phrasebank', 'sentences_allagree')  # 금융 문장 데이터셋
dataset2 = load_dataset('TimKoornstra/financial-tweets-sentiment')  # 금융 관련 트윗 데이터셋

# 데이터프레임으로 변환
data1 = dataset1['train'].to_pandas()
data2 = dataset2['train'].to_pandas()

# dataset2의 레이블 매핑 (레이블 통일을 위해)
label_mapping2 = {
    0: 1,  # Neutral -> Neutral (1)
    1: 2,  # Positive -> Positive (2)
    2: 0   # Negative -> Negative (0)
}

data2['label'] = data2['sentiment'].map(label_mapping2)  # 레이블 매핑 적용
data2 = data2.rename(columns={'tweet': 'sentence'})  # 열 이름 변경 (tweet -> sentence)
data2 = data2[['sentence', 'label']]  # 필요한 열만 선택

# 데이터 결합 (두 데이터셋 합치기)
combined_data = pd.concat([data1[['sentence', 'label']], data2], ignore_index=True)

# 데이터 분할 (훈련, 검증, 테스트 세트로 나누기)
train_data, temp_data = train_test_split(
    combined_data, test_size=0.2, stratify=combined_data['label'], random_state=42)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

# 데이터 전처리 함수 정의
def preprocess_text(text):
    # URL 제거
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # @멘션 및 해시태그 제거
    text = re.sub(r'\@\w+|\#', '', text)
    # 특수 문자 및 숫자 제거 (알파벳과 공백만 남김)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # 소문자로 변환 및 양쪽 공백 제거
    text = text.lower().strip()
    return text

# 전처리 적용 (훈련, 검증, 테스트 데이터에 대해)
train_data['sentence'] = train_data['sentence'].apply(preprocess_text)
val_data['sentence'] = val_data['sentence'].apply(preprocess_text)
test_data['sentence'] = test_data['sentence'].apply(preprocess_text)

# 어휘 사전 구축
tokenizer = get_tokenizer('basic_english')  # 기본 영어 토크나이저 사용

def yield_tokens(data_iter):
    for sentence in data_iter:
        yield tokenizer(str(sentence))  # 각 문장을 토큰화하여 반환

vocab = build_vocab_from_iterator(
    yield_tokens(train_data['sentence']),
    max_tokens=MAX_VOCAB_SIZE,
    specials=['<pad>', '<unk>']  # 패딩 토큰과 미등록 토큰 추가
)
vocab.set_default_index(vocab['<unk>'])  # 미등록 토큰에 대한 기본 인덱스 설정

# 사전 학습된 GloVe-Twitter 임베딩 로드 (Twitter 데이터로 학습된 임베딩 사용)
glove = GloVe(name='twitter.27B', dim=EMBEDDING_DIM)

# 임베딩 매트릭스 생성 (각 단어에 대한 임베딩 벡터 매핑)
embedding_matrix = torch.zeros(len(vocab), EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove.stoi:
        embedding_matrix[i] = glove[token]  # 사전에 있는 단어는 해당 임베딩 사용
    else:
        embedding_matrix[i] = torch.randn(EMBEDDING_DIM)  # 사전에 없는 단어는 랜덤 초기화

# 데이터셋 클래스 정의 (PyTorch Dataset 상속)
class FinancialDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data.reset_index(drop=True)
        self.vocab = vocab  # 어휘 사전
        self.tokenizer = tokenizer  # 토크나이저

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = str(self.data.loc[idx, 'sentence'])  # 문장 추출
        label = self.data.loc[idx, 'label']  # 레이블 추출
        tokens = self.tokenizer(sentence)  # 문장 토큰화
        token_ids = [self.vocab[token] for token in tokens]  # 토큰을 인덱스로 변환
        # 시퀀스 길이 조정 (패딩 또는 잘라내기)
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [self.vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long)  # 텐서로 변환
        label_tensor = torch.tensor(label, dtype=torch.long)
        return text_tensor, label_tensor  # 텍스트와 레이블 반환

# 데이터셋 및 데이터 로더 생성
train_dataset = FinancialDataset(train_data, vocab, tokenizer)
val_dataset = FinancialDataset(val_data, vocab, tokenizer)
test_dataset = FinancialDataset(test_data, vocab, tokenizer)

# DataLoader를 사용하여 배치 처리
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 클래스 가중치 계산 (데이터 불균형 대응을 위해)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_data['label']),
    y=train_data['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)  # 텐서로 변환하여 디바이스에 로드

# 모델 정의
if model_type == 'hybrid':
    # CNN과 트랜스포머를 결합한 하이브리드 모델 정의
    class CNNTransformerModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
            super(CNNTransformerModel, self).__init__()
            # 임베딩 레이어 (사전 학습된 임베딩 사용)
            self.embedding = nn.Embedding.from_pretrained(
                embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
            )
            # 위치 임베딩 (포지셔널 인코딩)
            self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

            # CNN 인코더 (지역적 특징 추출)
            self.cnn = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=3, padding=1)
            self.layer_norm_cnn = nn.LayerNorm(embedding_dim)

            # 트랜스포머 인코더 레이어 정의
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=embedding_dim, nhead=10, dropout=0.1, activation='relu', batch_first=True
            )
            # 트랜스포머 인코더를 여러 층 쌓음 (깊은 모델)
            self.transformer_encoder = nn.TransformerEncoder(
                encoder_layer, num_layers=4, norm=nn.LayerNorm(embedding_dim)
            )

            # 출력 레이어 (분류를 위한)
            self.dropout = nn.Dropout(0.6)
            self.fc = nn.Linear(embedding_dim, num_classes)

        def forward(self, x):
            batch_size, seq_len = x.size()
            x = self.embedding(x)  # 단어 임베딩
            positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)  # 위치 인덱스 생성
            x = x + self.position_embedding(positions)  # 위치 임베딩 추가
            x = x.permute(0, 2, 1)  # [batch_size, embedding_dim, seq_len]으로 변환

            # CNN 인코더 적용 (지역적 특징 추출)
            x = self.cnn(x)
            x = x.permute(0, 2, 1)  # 다시 [batch_size, seq_len, embedding_dim]으로 변환
            x = self.layer_norm_cnn(x)  # 레이어 정규화
            x = nn.ReLU()(x)  # 활성화 함수 적용

            # 패딩 마스크 생성 (패딩된 부분을 마스킹)
            src_key_padding_mask = (x.abs().sum(dim=2) == 0)

            # 트랜스포머 인코더 적용 (전역적 문맥 학습)
            x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

            # 글로벌 평균 풀링 (시퀀스 차원 축소)
            x = x.mean(dim=1)
            x = self.dropout(x)
            logits = self.fc(x)  # 최종 출력
            return logits
    # 모델 인스턴스 생성 및 디바이스에 로드
    model = CNNTransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

elif model_type == 'cnn':
    # CNN 단독 모델 정의
    class CNNModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
            super(CNNModel, self).__init__()
            self.embedding = nn.Embedding.from_pretrained(
                embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
            )

            # 다양한 커널 크기를 가진 CNN 레이어 사용 (다중 채널)
            self.convs = nn.ModuleList([
                nn.Conv1d(embedding_dim, 128, kernel_size=3, padding=1),
                nn.Conv1d(embedding_dim, 128, kernel_size=4, padding=2),
                nn.Conv1d(embedding_dim, 128, kernel_size=5, padding=2)
            ])
            self.dropout = nn.Dropout(0.5)
            self.fc = nn.Linear(128 * len(self.convs), num_classes)

        def forward(self, x):
            x = self.embedding(x)  # 단어 임베딩
            x = x.permute(0, 2, 1)  # [batch_size, embedding_dim, seq_len]
            # 각 CNN 레이어를 통과하여 특징 맵 생성
            x = [nn.ReLU()(conv(x)) for conv in self.convs]
            # 맥스 풀링으로 시퀀스 차원 축소
            x = [nn.functional.max_pool1d(feature_map, kernel_size=feature_map.shape[2]).squeeze(2) for feature_map in x]
            x = torch.cat(x, dim=1)  # 특징 맵들을 연결
            x = self.dropout(x)
            logits = self.fc(x)  # 최종 출력
            return logits
    model = CNNModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

elif model_type == 'transformer':
    # 트랜스포머 단독 모델 정의
    class TransformerModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
            super(TransformerModel, self).__init__()
            self.embedding = nn.Embedding.from_pretrained(
                embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
            )
            self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

            # 트랜스포머 인코더 레이어 정의
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=embedding_dim, nhead=8, dropout=0.1, activation='relu', batch_first=True
            )
            # 트랜스포머 인코더를 여러 층 쌓음
            self.transformer_encoder = nn.TransformerEncoder(
                encoder_layer, num_layers=4, norm=nn.LayerNorm(embedding_dim)
            )

            # 출력 레이어
            self.dropout = nn.Dropout(0.5)
            self.fc = nn.Linear(embedding_dim, num_classes)

        def forward(self, x):
            batch_size, seq_len = x.size()
            x = self.embedding(x)  # 단어 임베딩
            positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)
            x = x + self.position_embedding(positions)  # 위치 임베딩 추가

            # 패딩 마스크 생성
            src_key_padding_mask = (x.abs().sum(dim=2) == 0)

            # 트랜스포머 인코더 적용
            x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

            # 글로벌 평균 풀링
            x = x.mean(dim=1)
            x = self.dropout(x)
            logits = self.fc(x)  # 최종 출력
            return logits
    model = TransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

# 손실 함수와 옵티마이저 설정
criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1)  # 레이블 스무딩 적용
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)  # AdamW 옵티마이저 사용

# 조기 종료 설정
early_stopping_patience = 3  # 검증 손실이 개선되지 않는 에폭 수
best_val_loss = float('inf')  # 최상의 검증 손실 초기값
patience_counter = 0  # 인내 횟수 초기화

# 학습 루프 시작
for epoch in range(NUM_EPOCHS):
    model.train()  # 모델을 훈련 모드로 설정
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for texts, labels in progress_bar:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()  # 그래디언트 초기화
        outputs = model(texts)  # 모델 예측값 계산
        loss = criterion(outputs, labels)  # 손실 계산
        loss.backward()  # 역전파 수행
        optimizer.step()  # 가중치 업데이트
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    avg_loss = total_loss / len(train_loader)  # 평균 손실 계산
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.4f}")

    # 검증 단계
    val_loss = 0
    correct = 0
    total = 0
    model.eval()  # 모델을 평가 모드로 설정
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)  # 예측된 클래스
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_loss /= len(val_loader)  # 평균 검증 손실 계산
    val_accuracy = correct / total  # 검증 정확도 계산
    print(f"Validation Loss after Epoch {epoch+1}: {val_loss:.4f}")
    print(f"Validation Accuracy after Epoch {epoch+1}: {val_accuracy * 100:.2f}%\n")

    # 조기 종료 조건 확인
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # 모델 저장 (최적의 검증 성능)
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered.")  # 조기 종료 메시지 출력
            break

# 저장된 모델 로드 (최적의 모델 사용)
model.load_state_dict(torch.load('best_model.pth'))

# 테스트 데이터로 평가
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)  # 예측된 클래스
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
test_accuracy = correct / total  # 테스트 정확도 계산
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# 분류 보고서 출력 (정밀도, 재현율, F1-스코어 등)
print("\nClassification Report:")
target_names = ['Negative', 'Neutral', 'Positive']
print(classification_report(all_labels, all_preds, target_names=target_names))

# 예측 함수 정의 및 테스트
def predict(text):
    model.eval()
    with torch.no_grad():
        text = preprocess_text(text)  # 입력 문장 전처리
        tokens = tokenizer(text)  # 토큰화
        token_ids = [vocab[token] for token in tokens]  # 토큰 인덱스로 변환
        # 시퀀스 길이 조정
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
        outputs = model(text_tensor)
        probabilities = nn.functional.softmax(outputs, dim=1)  # 소프트맥스 적용
        predicted_class = torch.argmax(probabilities, dim=1).item()  # 예측된 클래스
        class_names = ['Negative', 'Neutral', 'Positive']
        return class_names[predicted_class], probabilities.squeeze().cpu().numpy()

# 예시 문장 예측
sample_text = "The company's profits have increased significantly this quarter."
label, probs = predict(sample_text)
print(f"\nSentence: {sample_text}")
print(f"Predicted Sentiment: {label}")
print(f"Probabilities: Negative {probs[0]*100:.2f}%, Neutral {probs[1]*100:.2f}%, Positive {probs[2]*100:.2f}%")


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, GloVe
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import numpy as np

# 하이퍼파라미터 설정
BATCH_SIZE = 32
EMBEDDING_DIM = 100  # GloVe 임베딩 차원과 일치시킴
NUM_CLASSES = 3  # 클래스 수 (negative, neutral, positive)
NUM_EPOCHS = 20
LEARNING_RATE = 5e-4
MAX_VOCAB_SIZE = 20000
MAX_SEQ_LEN = 256

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 데이터셋 로드
dataset = load_dataset('financial_phrasebank', 'sentences_allagree')

# 데이터프레임으로 변환
data = dataset['train'].to_pandas()

# 클래스 이름 가져오기
label_names = dataset['train'].features['label'].names

# 데이터 분할
train_data, temp_data = train_test_split(
    data, test_size=0.2, stratify=data['label'], random_state=42)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

# 어휘 사전 구축
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for sentence in data_iter:
        yield tokenizer(sentence)

vocab = build_vocab_from_iterator(
    yield_tokens(train_data['sentence']),
    max_tokens=MAX_VOCAB_SIZE,
    specials=['<pad>', '<unk>']
)
vocab.set_default_index(vocab['<unk>'])

# 사전 학습된 GloVe 임베딩 로드
glove = GloVe(name='6B', dim=EMBEDDING_DIM)

# 임베딩 매트릭스 생성
embedding_matrix = torch.zeros(len(vocab), EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove.stoi:
        embedding_matrix[i] = glove[token]
    else:
        embedding_matrix[i] = torch.randn(EMBEDDING_DIM)

# 데이터셋 클래스 정의
class FinancialPhraseBankDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data.reset_index(drop=True)
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data.loc[idx, 'sentence']
        label = self.data.loc[idx, 'label']
        tokens = self.tokenizer(sentence)
        token_ids = [self.vocab[token] for token in tokens]
        # 시퀀스 길이 조정 및 패딩
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [self.vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return text_tensor, label_tensor

# 데이터셋 및 데이터 로더 생성
train_dataset = FinancialPhraseBankDataset(train_data, vocab, tokenizer)
val_dataset = FinancialPhraseBankDataset(val_data, vocab, tokenizer)
test_dataset = FinancialPhraseBankDataset(test_data, vocab, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 클래스 가중치 계산
class_weights = compute_class_weight(
    class_weight='balanced', classes=np.unique(train_data['label']), y=train_data['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# 모델 정의
class CNNTransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
        super(CNNTransformerModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
        )
        self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

        # CNN 인코더
        self.cnn_encoder = nn.Conv1d(
            embedding_dim, embedding_dim, kernel_size=3, padding=1, stride=2
        )
        self.cnn_encoder_residual = nn.Conv1d(
            embedding_dim, embedding_dim, kernel_size=1, stride=2
        )

        # 트랜스포머 인코더 레이어
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim, nhead=4, dropout=0.1, activation='relu', batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=1, norm=nn.LayerNorm(embedding_dim)
        )

        # CNN 디코더
        self.cnn_decoder = nn.ConvTranspose1d(
            embedding_dim, embedding_dim, kernel_size=3, padding=1, stride=2, output_padding=1
        )
        self.cnn_decoder_residual = nn.ConvTranspose1d(
            embedding_dim, embedding_dim, kernel_size=1, stride=2, output_padding=1
        )

        # 출력 레이어
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        batch_size, seq_len = x.size()
        x = self.embedding(x)
        positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)
        x = x + self.position_embedding(positions)
        x = x.permute(0, 2, 1)
        # CNN 인코더와 잔차 연결
        residual = self.cnn_encoder_residual(x)
        x = self.cnn_encoder(x)
        x = nn.ReLU()(x + residual)
        x = x.permute(0, 2, 1)
        # 패딩 마스크 생성
        src_key_padding_mask = (x.abs().sum(dim=2) == 0)
        # 트랜스포머 인코더
        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)
        x = x.permute(0, 2, 1)
        # CNN 디코더와 잔차 연결
        residual = self.cnn_decoder_residual(x)
        x = self.cnn_decoder(x)
        x = nn.ReLU()(x + residual)
        # 글로벌 평균 풀링
        x = x.mean(dim=2)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits

# 모델 초기화
model = CNNTransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

# 손실 함수와 옵티마이저 설정
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# 학습 루프
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for texts, labels in progress_bar:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.4f}")

    # 검증 데이터로 평가
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_accuracy = correct / total
    print(f"Validation Accuracy after Epoch {epoch+1}: {val_accuracy * 100:.2f}%\n")

# 테스트 데이터로 평가
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# 분류 보고서 출력
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=label_names))

torch.save(model.state_dict(), 'cnn_transformer_model-finance.pth')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


financial_phrasebank.py:   0%|          | 0.00/6.04k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

The repository for financial_phrasebank contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/financial_phrasebank.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


FinancialPhraseBank-v1.0.zip:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

.vector_cache/glove.6B.zip: 862MB [02:39, 5.39MB/s]                           
100%|█████████▉| 399999/400000 [00:20<00:00, 19515.26it/s]
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Epoch 1/20: 100%|██████████| 57/57 [00:01<00:00, 47.30it/s, loss=1.09]
  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


Epoch [1/20], Average Loss: 1.1096
Validation Accuracy after Epoch 1: 25.22%



Epoch 2/20: 100%|██████████| 57/57 [00:00<00:00, 81.35it/s, loss=1.04]


Epoch [2/20], Average Loss: 1.1118
Validation Accuracy after Epoch 2: 41.59%



Epoch 3/20: 100%|██████████| 57/57 [00:00<00:00, 83.97it/s, loss=1.05]


Epoch [3/20], Average Loss: 0.9779
Validation Accuracy after Epoch 3: 60.18%



Epoch 4/20: 100%|██████████| 57/57 [00:00<00:00, 82.72it/s, loss=0.613]


Epoch [4/20], Average Loss: 0.8668
Validation Accuracy after Epoch 4: 72.57%



Epoch 5/20: 100%|██████████| 57/57 [00:00<00:00, 82.23it/s, loss=0.687]


Epoch [5/20], Average Loss: 0.7452
Validation Accuracy after Epoch 5: 66.37%



Epoch 6/20: 100%|██████████| 57/57 [00:00<00:00, 83.38it/s, loss=0.55]


Epoch [6/20], Average Loss: 0.6787
Validation Accuracy after Epoch 6: 72.12%



Epoch 7/20: 100%|██████████| 57/57 [00:00<00:00, 81.72it/s, loss=0.81]


Epoch [7/20], Average Loss: 0.6293
Validation Accuracy after Epoch 7: 73.45%



Epoch 8/20: 100%|██████████| 57/57 [00:00<00:00, 83.04it/s, loss=0.405]


Epoch [8/20], Average Loss: 0.5889
Validation Accuracy after Epoch 8: 74.34%



Epoch 9/20: 100%|██████████| 57/57 [00:00<00:00, 81.29it/s, loss=0.614]


Epoch [9/20], Average Loss: 0.5510
Validation Accuracy after Epoch 9: 76.11%



Epoch 10/20: 100%|██████████| 57/57 [00:00<00:00, 82.08it/s, loss=0.423]


Epoch [10/20], Average Loss: 0.4763
Validation Accuracy after Epoch 10: 74.34%



Epoch 11/20: 100%|██████████| 57/57 [00:00<00:00, 80.71it/s, loss=0.167]


Epoch [11/20], Average Loss: 0.4260
Validation Accuracy after Epoch 11: 83.19%



Epoch 12/20: 100%|██████████| 57/57 [00:00<00:00, 83.90it/s, loss=0.682]


Epoch [12/20], Average Loss: 0.3038
Validation Accuracy after Epoch 12: 83.19%



Epoch 13/20: 100%|██████████| 57/57 [00:00<00:00, 82.11it/s, loss=0.183]


Epoch [13/20], Average Loss: 0.2566
Validation Accuracy after Epoch 13: 85.84%



Epoch 14/20: 100%|██████████| 57/57 [00:00<00:00, 83.65it/s, loss=0.247]


Epoch [14/20], Average Loss: 0.1972
Validation Accuracy after Epoch 14: 85.84%



Epoch 15/20: 100%|██████████| 57/57 [00:00<00:00, 81.61it/s, loss=0.235]


Epoch [15/20], Average Loss: 0.1757
Validation Accuracy after Epoch 15: 84.51%



Epoch 16/20: 100%|██████████| 57/57 [00:00<00:00, 77.92it/s, loss=0.0282]


Epoch [16/20], Average Loss: 0.1205
Validation Accuracy after Epoch 16: 83.63%



Epoch 17/20: 100%|██████████| 57/57 [00:00<00:00, 84.05it/s, loss=0.471]


Epoch [17/20], Average Loss: 0.1613
Validation Accuracy after Epoch 17: 87.17%



Epoch 18/20: 100%|██████████| 57/57 [00:00<00:00, 82.54it/s, loss=0.0642]


Epoch [18/20], Average Loss: 0.1404
Validation Accuracy after Epoch 18: 84.51%



Epoch 19/20: 100%|██████████| 57/57 [00:00<00:00, 82.95it/s, loss=0.0954]


Epoch [19/20], Average Loss: 0.1172
Validation Accuracy after Epoch 19: 79.20%



Epoch 20/20: 100%|██████████| 57/57 [00:00<00:00, 84.37it/s, loss=0.245]


Epoch [20/20], Average Loss: 0.0956
Validation Accuracy after Epoch 20: 81.86%

Test Accuracy: 79.74%

Classification Report:
              precision    recall  f1-score   support

    negative       0.73      0.77      0.75        31
     neutral       0.97      0.80      0.87       139
    positive       0.58      0.81      0.68        57

    accuracy                           0.80       227
   macro avg       0.76      0.79      0.77       227
weighted avg       0.84      0.80      0.81       227



In [None]:
# 예측 함수 정의
def predict(text):
    model.eval()
    with torch.no_grad():
        tokens = tokenizer(text)
        token_ids = [vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
        outputs = model(text_tensor)
        probabilities = nn.functional.softmax(outputs, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        class_names = label_names
        return class_names[predicted_class], probabilities.squeeze().cpu().numpy()

# 예시 문장 예측
sample_text = "The company's profits have increased significantly this quarter."
label, probs = predict(sample_text)
print(f"\nSentence: {sample_text}")
print(f"Predicted Sentiment: {label}")
print(f"Probabilities: Negative {probs[0]*100:.2f}%, Neutral {probs[1]*100:.2f}%, Positive {probs[2]*100:.2f}%")



Sentence: The company's profits have increased significantly this quarter.
Predicted Sentiment: positive
Probabilities: Negative 0.28%, Neutral 0.13%, Positive 99.59%


In [None]:
# 기존 데이터셋 로드
dataset1 = load_dataset('financial_phrasebank', 'sentences_allagree')

# 새로운 데이터셋 로드
dataset2 = load_dataset('TimKoornstra/financial-tweets-sentiment')


README.md:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/38091 [00:00<?, ? examples/s]

In [None]:
# 데이터프레임으로 변환
data1 = dataset1['train'].to_pandas()
data2 = dataset2['train'].to_pandas()


In [None]:
data2['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,17368
0,12181
2,8542


In [None]:
data2.columns

Index(['tweet', 'sentiment', 'url'], dtype='object')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm
import re
from collections import Counter

# 하이퍼파라미터 설정
BATCH_SIZE = 32
EMBEDDING_DIM = 200  # GloVe-Twitter 임베딩 차원
NUM_CLASSES = 3  # 클래스 수 (Negative, Neutral, Positive)
NUM_EPOCHS = 20
LEARNING_RATE = 1e-4
MAX_VOCAB_SIZE = 20000
MAX_SEQ_LEN = 256

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 데이터셋 로드
dataset1 = load_dataset('financial_phrasebank', 'sentences_allagree')
dataset2 = load_dataset('TimKoornstra/financial-tweets-sentiment')

# 데이터프레임으로 변환
data1 = dataset1['train'].to_pandas()
data2 = dataset2['train'].to_pandas()

# dataset2의 레이블 매핑
label_mapping2 = {
    0: 1,  # Neutral -> Neutral (1)
    1: 2,  # Positive -> Positive (2)
    2: 0   # Negative -> Negative (0)
}

data2['label'] = data2['sentiment'].map(label_mapping2)
data2 = data2.rename(columns={'tweet': 'sentence'})
data2 = data2[['sentence', 'label']]  # 필요한 열만 선택

# 데이터 결합
combined_data = pd.concat([data1, data2], ignore_index=True)

# 데이터 분할
train_data, temp_data = train_test_split(
    combined_data, test_size=0.2, stratify=combined_data['label'], random_state=42)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

# 클래스 분포 출력
print('Train dataset shape %s' % Counter(train_data['label']))
print('Validation dataset shape %s' % Counter(val_data['label']))
print('Test dataset shape %s' % Counter(test_data['label']))

# 데이터 전처리 함수
def preprocess_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower().strip()
    return text

# 어휘 사전 구축
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for sentence in data_iter:
        yield tokenizer(preprocess_text(str(sentence)))

vocab = build_vocab_from_iterator(
    yield_tokens(train_data['sentence']),
    max_tokens=MAX_VOCAB_SIZE,
    specials=['<pad>', '<unk>']
)
vocab.set_default_index(vocab['<unk>'])

# 사전 학습된 GloVe-Twitter 임베딩 로드
glove = GloVe(name='twitter.27B', dim=EMBEDDING_DIM)

# 임베딩 매트릭스 생성
embedding_matrix = torch.zeros(len(vocab), EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove.stoi:
        embedding_matrix[i] = glove[token]
    else:
        embedding_matrix[i] = torch.randn(EMBEDDING_DIM)

# 데이터셋 클래스 정의
class FinancialDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data.reset_index(drop=True)
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = preprocess_text(str(self.data.loc[idx, 'sentence']))
        label = self.data.loc[idx, 'label']
        tokens = self.tokenizer(sentence)
        token_ids = [self.vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return text_tensor, label_tensor

# 데이터셋 및 데이터 로더 생성
train_dataset = FinancialDataset(train_data, vocab, tokenizer)
val_dataset = FinancialDataset(val_data, vocab, tokenizer)
test_dataset = FinancialDataset(test_data, vocab, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 클래스 가중치 계산
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_data['label']),
    y=train_data['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# 모델 정의
class CNNTransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
        super(CNNTransformerModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
        )
        self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

        # CNN 인코더
        self.cnn_encoder = nn.Conv1d(
            embedding_dim, embedding_dim, kernel_size=3, padding=1, stride=1
        )
        self.cnn_encoder_residual = nn.Conv1d(
            embedding_dim, embedding_dim, kernel_size=1, stride=1
        )

        # 트랜스포머 인코더 레이어
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim, nhead=10, dropout=0.1, activation='relu', batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=1, norm=nn.LayerNorm(embedding_dim)
        )

        # CNN 디코더
        self.cnn_decoder = nn.Conv1d(
            embedding_dim, embedding_dim, kernel_size=3, padding=1
        )
        self.cnn_decoder_residual = nn.Conv1d(
            embedding_dim, embedding_dim, kernel_size=1
        )

        # 출력 레이어
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        batch_size, seq_len = x.size()
        x = self.embedding(x)
        positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)
        x = x + self.position_embedding(positions)
        x = x.permute(0, 2, 1)  # [batch_size, embedding_dim, seq_len]

        # CNN 인코더와 잔차 연결
        residual = self.cnn_encoder_residual(x)
        x = self.cnn_encoder(x)
        x = nn.ReLU()(x + residual)

        x = x.permute(0, 2, 1)  # [batch_size, seq_len, embedding_dim]

        # 패딩 마스크 생성
        src_key_padding_mask = (x.abs().sum(dim=2) == 0)

        # 트랜스포머 인코더
        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

        x = x.permute(0, 2, 1)  # [batch_size, embedding_dim, seq_len]

        # CNN 디코더와 잔차 연결
        residual = self.cnn_decoder_residual(x)
        x = self.cnn_decoder(x)
        x = nn.ReLU()(x + residual)

        # 글로벌 평균 풀링
        x = x.mean(dim=2)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits

# 모델 초기화
model = CNNTransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

# 손실 함수와 옵티마이저 설정
criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# 학습 루프
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for texts, labels in progress_bar:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.4f}")

    # 검증 데이터로 평가
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_accuracy = correct / total
    print(f"Validation Accuracy after Epoch {epoch+1}: {val_accuracy * 100:.2f}%\n")

# 테스트 데이터로 평가
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# 분류 보고서 출력
print("\nClassification Report:")
target_names = ['Negative', 'Neutral', 'Positive']
print(classification_report(all_labels, all_preds, target_names=target_names))

# 모델 저장
torch.save(model.state_dict(), 'cnn_transformer_model_finance.pth')

# 예측 함수 정의 및 테스트
def predict(text):
    model.eval()
    with torch.no_grad():
        text = preprocess_text(text)
        tokens = tokenizer(text)
        token_ids = [vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
        outputs = model(text_tensor)
        probabilities = nn.functional.softmax(outputs, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        class_names = ['Negative', 'Neutral', 'Positive']
        return class_names[predicted_class], probabilities.squeeze().cpu().numpy()

# 예시 문장 예측
sample_text = "The company's profits have increased significantly this quarter."
label, probs = predict(sample_text)
print(f"\nSentence: {sample_text}")
print(f"Predicted Sentiment: {label}")
print(f"Probabilities: Negative {probs[0]*100:.2f}%, Neutral {probs[1]*100:.2f}%, Positive {probs[2]*100:.2f}%")


Train dataset shape Counter({2: 14350, 1: 10858, 0: 7076})
Validation dataset shape Counter({2: 1794, 1: 1357, 0: 884})
Test dataset shape Counter({2: 1794, 1: 1357, 0: 885})


.vector_cache/glove.twitter.27B.zip: 1.52GB [04:45, 5.32MB/s]                            
100%|█████████▉| 1193513/1193514 [01:46<00:00, 11197.49it/s]
Epoch 1/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.16it/s, loss=0.987]


Epoch [1/20], Average Loss: 1.0695
Validation Accuracy after Epoch 1: 48.43%



Epoch 2/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.35it/s, loss=0.633]


Epoch [2/20], Average Loss: 0.9161
Validation Accuracy after Epoch 2: 64.01%



Epoch 3/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.64it/s, loss=0.739]


Epoch [3/20], Average Loss: 0.8091
Validation Accuracy after Epoch 3: 70.66%



Epoch 4/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.81it/s, loss=0.767]


Epoch [4/20], Average Loss: 0.7563
Validation Accuracy after Epoch 4: 70.33%



Epoch 5/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.67it/s, loss=0.869]


Epoch [5/20], Average Loss: 0.7138
Validation Accuracy after Epoch 5: 73.73%



Epoch 6/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.75it/s, loss=0.693]


Epoch [6/20], Average Loss: 0.6805
Validation Accuracy after Epoch 6: 72.96%



Epoch 7/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.75it/s, loss=0.572]


Epoch [7/20], Average Loss: 0.6481
Validation Accuracy after Epoch 7: 71.50%



Epoch 8/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.82it/s, loss=0.736]


Epoch [8/20], Average Loss: 0.6184
Validation Accuracy after Epoch 8: 73.43%



Epoch 9/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.83it/s, loss=0.761]


Epoch [9/20], Average Loss: 0.5904
Validation Accuracy after Epoch 9: 73.58%



Epoch 10/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.84it/s, loss=0.568]


Epoch [10/20], Average Loss: 0.5663
Validation Accuracy after Epoch 10: 72.86%



Epoch 11/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.80it/s, loss=0.653]


Epoch [11/20], Average Loss: 0.5429
Validation Accuracy after Epoch 11: 72.14%



Epoch 12/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.84it/s, loss=0.435]


Epoch [12/20], Average Loss: 0.5204
Validation Accuracy after Epoch 12: 72.27%



Epoch 13/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.69it/s, loss=0.555]


Epoch [13/20], Average Loss: 0.5003
Validation Accuracy after Epoch 13: 73.53%



Epoch 14/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.54it/s, loss=0.307]


Epoch [14/20], Average Loss: 0.4813
Validation Accuracy after Epoch 14: 74.20%



Epoch 15/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.76it/s, loss=0.483]


Epoch [15/20], Average Loss: 0.4644
Validation Accuracy after Epoch 15: 73.09%



Epoch 16/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.85it/s, loss=0.479]


Epoch [16/20], Average Loss: 0.4563
Validation Accuracy after Epoch 16: 73.51%



Epoch 17/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.77it/s, loss=0.378]


Epoch [17/20], Average Loss: 0.4422
Validation Accuracy after Epoch 17: 72.81%



Epoch 18/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.82it/s, loss=0.445]


Epoch [18/20], Average Loss: 0.4354
Validation Accuracy after Epoch 18: 71.80%



Epoch 19/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.78it/s, loss=0.44]


Epoch [19/20], Average Loss: 0.4248
Validation Accuracy after Epoch 19: 72.71%



Epoch 20/20: 100%|██████████| 1009/1009 [00:22<00:00, 44.81it/s, loss=0.437]


Epoch [20/20], Average Loss: 0.4154
Validation Accuracy after Epoch 20: 72.71%

Test Accuracy: 73.22%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.64      0.68      0.66       885
     Neutral       0.74      0.75      0.75      1357
    Positive       0.78      0.74      0.76      1794

    accuracy                           0.73      4036
   macro avg       0.72      0.72      0.72      4036
weighted avg       0.73      0.73      0.73      4036


Sentence: The company's profits have increased significantly this quarter.
Predicted Sentiment: Positive
Probabilities: Negative 7.53%, Neutral 11.00%, Positive 81.47%


In [None]:
#CNN 레이어 구조 변경 및 하이퍼파라미터 수정 코드

# 하이퍼파라미터 설정
BATCH_SIZE = 64
EMBEDDING_DIM = 200  # GloVe-Twitter 임베딩 차원
NUM_CLASSES = 3
NUM_EPOCHS = 20
LEARNING_RATE = 5e-4
MAX_VOCAB_SIZE = 20000
MAX_SEQ_LEN = 256


# 모델 정의
class CNNTransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
        super(CNNTransformerModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
        )
        self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

        # 멀티채널 CNN 인코더
        self.conv1 = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=4, padding=2)
        self.conv3 = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=5, padding=2)
        self.bn1 = nn.BatchNorm1d(embedding_dim)
        self.bn2 = nn.BatchNorm1d(embedding_dim)
        self.bn3 = nn.BatchNorm1d(embedding_dim)

        # 트랜스포머 인코더 레이어
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim, nhead=10, dropout=0.2, activation='relu', batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=2, norm=nn.LayerNorm(embedding_dim)
        )

        # 출력 레이어
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(embedding_dim * 4, num_classes)  # CNN 3채널 + 트랜스포머 출력

    def forward(self, x):
        batch_size, seq_len = x.size()
        x = self.embedding(x)
        positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)
        x = x + self.position_embedding(positions)
        x = x.permute(0, 2, 1)

        # 멀티채널 CNN 인코더
        x1 = nn.ReLU()(self.bn1(self.conv1(x)))
        x2 = nn.ReLU()(self.bn2(self.conv2(x)))
        x3 = nn.ReLU()(self.bn3(self.conv3(x)))

        # 풀링 및 결합
        x1 = nn.functional.max_pool1d(x1, kernel_size=x1.size(2)).squeeze(2)
        x2 = nn.functional.max_pool1d(x2, kernel_size=x2.size(2)).squeeze(2)
        x3 = nn.functional.max_pool1d(x3, kernel_size=x3.size(2)).squeeze(2)
        x_cnn = torch.cat((x1, x2, x3), dim=1)

        # 트랜스포머 인코더
        x = x.permute(0, 2, 1)
        src_key_padding_mask = (x.abs().sum(dim=2) == 0)
        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

        # 트랜스포머 출력 풀링
        x_transformer = x.mean(dim=1)

        # CNN과 트랜스포머 출력 결합
        x = torch.cat((x_cnn, x_transformer), dim=1)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits

# 모델 초기화
model = CNNTransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

# 손실 함수와 옵티마이저 설정
criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# 학습 루프
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for texts, labels in progress_bar:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.4f}")

    # 검증 데이터로 평가
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_accuracy = correct / total
    print(f"Validation Accuracy after Epoch {epoch+1}: {val_accuracy * 100:.2f}%\n")

# 테스트 데이터로 평가
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# 분류 보고서 출력
print("\nClassification Report:")
target_names = ['Negative', 'Neutral', 'Positive']
print(classification_report(all_labels, all_preds, target_names=target_names))

# 모델 저장
torch.save(model.state_dict(), 'cnn_transformer_model_finance_v0.2.pth')

# 예측 함수 정의 및 테스트
def predict(text):
    model.eval()
    with torch.no_grad():
        text = preprocess_text(text)
        tokens = tokenizer(text)
        token_ids = [vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
        outputs = model(text_tensor)
        probabilities = nn.functional.softmax(outputs, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        class_names = ['Negative', 'Neutral', 'Positive']
        return class_names[predicted_class], probabilities.squeeze().cpu().numpy()

# 예시 문장 예측
sample_text = "The company's profits have increased significantly this quarter."
label, probs = predict(sample_text)
print(f"\nSentence: {sample_text}")
print(f"Predicted Sentiment: {label}")
print(f"Probabilities: Negative {probs[0]*100:.2f}%, Neutral {probs[1]*100:.2f}%, Positive {probs[2]*100:.2f}%")


Epoch 1/20: 100%|██████████| 1009/1009 [00:37<00:00, 26.92it/s, loss=0.967]


Epoch [1/20], Average Loss: 1.2577
Validation Accuracy after Epoch 1: 39.01%



Epoch 2/20: 100%|██████████| 1009/1009 [00:37<00:00, 27.27it/s, loss=0.73]


Epoch [2/20], Average Loss: 0.9581
Validation Accuracy after Epoch 2: 61.04%



Epoch 3/20: 100%|██████████| 1009/1009 [00:37<00:00, 27.26it/s, loss=0.99]


Epoch [3/20], Average Loss: 0.8118
Validation Accuracy after Epoch 3: 72.76%



Epoch 4/20: 100%|██████████| 1009/1009 [00:37<00:00, 27.03it/s, loss=0.437]


Epoch [4/20], Average Loss: 0.7090
Validation Accuracy after Epoch 4: 71.47%



Epoch 5/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.34it/s, loss=0.617]


Epoch [5/20], Average Loss: 0.6354
Validation Accuracy after Epoch 5: 73.11%



Epoch 6/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.38it/s, loss=0.64]


Epoch [6/20], Average Loss: 0.5846
Validation Accuracy after Epoch 6: 74.80%



Epoch 7/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.36it/s, loss=0.624]


Epoch [7/20], Average Loss: 0.5382
Validation Accuracy after Epoch 7: 73.46%



Epoch 8/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.45it/s, loss=0.641]


Epoch [8/20], Average Loss: 0.5039
Validation Accuracy after Epoch 8: 73.41%



Epoch 9/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.43it/s, loss=0.512]


Epoch [9/20], Average Loss: 0.4731
Validation Accuracy after Epoch 9: 73.18%



Epoch 10/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.46it/s, loss=0.438]


Epoch [10/20], Average Loss: 0.4498
Validation Accuracy after Epoch 10: 71.55%



Epoch 11/20: 100%|██████████| 1009/1009 [00:37<00:00, 27.01it/s, loss=0.492]


Epoch [11/20], Average Loss: 0.4288
Validation Accuracy after Epoch 11: 72.91%



Epoch 12/20: 100%|██████████| 1009/1009 [00:37<00:00, 27.20it/s, loss=0.436]


Epoch [12/20], Average Loss: 0.4137
Validation Accuracy after Epoch 12: 72.39%



Epoch 13/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.38it/s, loss=0.5]


Epoch [13/20], Average Loss: 0.4017
Validation Accuracy after Epoch 13: 71.50%



Epoch 14/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.44it/s, loss=0.384]


Epoch [14/20], Average Loss: 0.3973
Validation Accuracy after Epoch 14: 71.45%



Epoch 15/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.45it/s, loss=0.378]


Epoch [15/20], Average Loss: 0.3872
Validation Accuracy after Epoch 15: 70.41%



Epoch 16/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.47it/s, loss=0.386]


Epoch [16/20], Average Loss: 0.3822
Validation Accuracy after Epoch 16: 71.82%



Epoch 17/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.54it/s, loss=0.461]


Epoch [17/20], Average Loss: 0.3757
Validation Accuracy after Epoch 17: 71.55%



Epoch 18/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.53it/s, loss=0.31]


Epoch [18/20], Average Loss: 0.3741
Validation Accuracy after Epoch 18: 71.87%



Epoch 19/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.51it/s, loss=0.442]


Epoch [19/20], Average Loss: 0.3708
Validation Accuracy after Epoch 19: 70.76%



Epoch 20/20: 100%|██████████| 1009/1009 [00:36<00:00, 27.52it/s, loss=0.518]


Epoch [20/20], Average Loss: 0.3657
Validation Accuracy after Epoch 20: 70.46%

Test Accuracy: 69.40%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.57      0.66      0.61       885
     Neutral       0.74      0.64      0.69      1357
    Positive       0.73      0.75      0.74      1794

    accuracy                           0.69      4036
   macro avg       0.68      0.68      0.68      4036
weighted avg       0.70      0.69      0.69      4036


Sentence: The company's profits have increased significantly this quarter.
Predicted Sentiment: Positive
Probabilities: Negative 4.89%, Neutral 3.73%, Positive 91.38%


성능 향상을 위한 수정

1.1 CNN 레이어 구조 변경

1.1.1 필터 수 및 커널 크기 조정
필터 수 증가: CNN 레이어의 출력 채널 수를 늘려 특징 표현력을 향상시킵니다.
예: embedding_dim을 유지하면서, 추가적인 CNN 레이어를 추가하거나, 채널 수를 증가시킵니다.
커널 크기 다양화: 다양한 커널 크기를 사용하는 멀티채널 CNN 구조를 도입하여, 다양한 n-그램(n-gram) 특징을 포착합니다.
예: 커널 크기 [3, 4, 5]를 사용하는 여러 CNN 레이어를 병렬로 적용하고, 결과를 결합합니다.

1.1.2 CNN 레이어 추가
CNN 인코더에 추가적인 레이어를 쌓아 깊이를 늘립니다.
각 레이어마다 활성화 함수와 정규화 층을 추가하여 학습 안정성을 높입니다.

1.2 트랜스포머 모델 고도화

1.2.1 레이어 수 및 헤드 수 조정
트랜스포머 인코더 레이어 수 증가: num_layers를 1에서 2 또는 3으로 늘려 모델의 표현력을 높입니다.
어텐션 헤드 수 조정: nhead를 늘려 모델이 다양한 표현을 학습할 수 있도록 합니다.
단, embedding_dim이 nhead로 나누어 떨어져야 합니다.
예: embedding_dim=200, nhead=20

1.2.2 드롭아웃 비율 조정
드롭아웃 비율을 조정하여 과적합을 방지하고 일반화 성능을 향상시킵니다.
예: dropout=0.1에서 dropout=0.2로 증가

1.3 하이퍼파라미터 튜닝

1.3.1 학습률 조정
학습률을 약간 높여 학습 속도를 향상시키고, 지역 최솟값에 빠지는 것을 방지합니다.
예: LEARNING_RATE = 1e-4에서 LEARNING_RATE = 5e-4로 증가

1.3.2 배치 크기 조정
배치 크기를 늘려 학습의 안정성을 높입니다.
예: BATCH_SIZE = 32에서 BATCH_SIZE = 64로 증가

1.4 정규화 및 최적화 기법 적용

1.4.1 배치 정규화
CNN 레이어나 트랜스포머 레이어 사이에 **배치 정규화(Batch Normalization)**를 적용하여 학습을 안정화시킵니다.

1.4.2 옵티마이저 변경
AdamW에서 RAdam이나 Lookahead 옵티마이저를 사용하여 학습의 안정성을 높입니다.

1.5 임베딩 층 고정 및 미세조정
현재 임베딩 층은 학습 가능한 상태입니다.
**임베딩 층을 고정(freeze)**하여 학습 파라미터 수를 줄이고, 모델의 수렴을 빠르게 할 수 있습니다.
또는, **임베딩 층을 미세조정(fine-tuning)**하여 모델의 표현력을 높일 수 있습니다.

1.6 추가적인 데이터 전처리
텍스트 정규화를 더 강화하여 노이즈를 줄입니다.
예: 표제어 추출(lemmatization), 불용어 제거(stopword removal)
데이터 증강을 통해 데이터의 다양성을 높입니다.
예: 역번역(Back Translation), 동의어 치환

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, build_vocab_from_iterator  # 수정된 부분
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm
import re

# 하이퍼파라미터 설정
BATCH_SIZE = 32
EMBEDDING_DIM = 200  # GloVe-Twitter 임베딩 차원
NUM_CLASSES = 3
NUM_EPOCHS = 20
LEARNING_RATE = 1e-4
MAX_VOCAB_SIZE = 20000
MAX_SEQ_LEN = 256

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 데이터셋 로드
dataset1 = load_dataset('financial_phrasebank', 'sentences_allagree')
dataset2 = load_dataset('TimKoornstra/financial-tweets-sentiment')

# 데이터프레임으로 변환
data1 = dataset1['train'].to_pandas()
data2 = dataset2['train'].to_pandas()

# dataset2의 레이블 매핑
label_mapping2 = {
    0: 1,  # Neutral -> Neutral (1)
    1: 2,  # Positive -> Positive (2)
    2: 0   # Negative -> Negative (0)
}

data2['label'] = data2['sentiment'].map(label_mapping2)
data2 = data2.rename(columns={'tweet': 'sentence'})
data2 = data2[['sentence', 'label']]  # 필요한 열만 선택

# 데이터 결합
combined_data = pd.concat([data1[['sentence', 'label']], data2], ignore_index=True)

# 데이터 분할
train_data, temp_data = train_test_split(
    combined_data, test_size=0.2, stratify=combined_data['label'], random_state=42)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

# 데이터 전처리 함수
def preprocess_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower().strip()
    return text

# 전처리 적용
train_data['sentence'] = train_data['sentence'].apply(preprocess_text)
val_data['sentence'] = val_data['sentence'].apply(preprocess_text)
test_data['sentence'] = test_data['sentence'].apply(preprocess_text)

# 어휘 사전 구축
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for sentence in data_iter:
        yield tokenizer(str(sentence))

vocab = build_vocab_from_iterator(
    yield_tokens(train_data['sentence']),
    max_tokens=MAX_VOCAB_SIZE,
    specials=['<pad>', '<unk>']
)
vocab.set_default_index(vocab['<unk>'])

# 사전 학습된 GloVe-Twitter 임베딩 로드
glove = GloVe(name='twitter.27B', dim=EMBEDDING_DIM)

# 임베딩 매트릭스 생성
embedding_matrix = torch.zeros(len(vocab), EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove.stoi:
        embedding_matrix[i] = glove[token]
    else:
        embedding_matrix[i] = torch.randn(EMBEDDING_DIM)

# 데이터셋 클래스 정의
class FinancialDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data.reset_index(drop=True)
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = str(self.data.loc[idx, 'sentence'])
        label = self.data.loc[idx, 'label']
        tokens = self.tokenizer(sentence)
        token_ids = [self.vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [self.vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return text_tensor, label_tensor

# 데이터셋 및 데이터 로더 생성
train_dataset = FinancialDataset(train_data, vocab, tokenizer)
val_dataset = FinancialDataset(val_data, vocab, tokenizer)
test_dataset = FinancialDataset(test_data, vocab, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 클래스 가중치 계산
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_data['label']),
    y=train_data['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# 모델 정의
class CNNTransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
        super(CNNTransformerModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
        )
        self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

        # 단일 CNN 인코더
        self.cnn = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=3, padding=1)
        self.layer_norm_cnn = nn.LayerNorm(embedding_dim)

        # 트랜스포머 인코더 레이어
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim, nhead=10, dropout=0.1, activation='relu', batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=1, norm=nn.LayerNorm(embedding_dim)
        )

        # 출력 레이어
        self.dropout = nn.Dropout(0.6)
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        batch_size, seq_len = x.size()
        x = self.embedding(x)
        positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)
        x = x + self.position_embedding(positions)
        x = x.permute(0, 2, 1)

        # CNN 인코더
        x = self.cnn(x)
        x = x.permute(0, 2, 1)
        x = self.layer_norm_cnn(x)
        x = nn.ReLU()(x)

        # 패딩 마스크 생성
        src_key_padding_mask = (x.abs().sum(dim=2) == 0)

        # 트랜스포머 인코더
        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

        # 글로벌 평균 풀링
        x = x.mean(dim=1)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits

# 모델 초기화
model = CNNTransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

# 손실 함수와 옵티마이저 설정
criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# 조기 종료 설정
early_stopping_patience = 3
best_val_loss = float('inf')
patience_counter = 0

# 학습 루프
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for texts, labels in progress_bar:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.4f}")

    # 검증 손실 및 정확도 계산
    val_loss = 0
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_loss /= len(val_loader)
    val_accuracy = correct / total
    print(f"Validation Loss after Epoch {epoch+1}: {val_loss:.4f}")
    print(f"Validation Accuracy after Epoch {epoch+1}: {val_accuracy * 100:.2f}%\n")

    # 조기 종료 체크
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # 모델 저장
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered.")
            break

# 저장된 모델 로드
model.load_state_dict(torch.load('best_model.pth'))

# 테스트 데이터로 평가
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# 분류 보고서 출력
print("\nClassification Report:")
target_names = ['Negative', 'Neutral', 'Positive']
print(classification_report(all_labels, all_preds, target_names=target_names))

# 예측 함수 정의 및 테스트
def predict(text):
    model.eval()
    with torch.no_grad():
        text = preprocess_text(text)
        tokens = tokenizer(text)
        token_ids = [vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
        outputs = model(text_tensor)
        probabilities = nn.functional.softmax(outputs, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        class_names = ['Negative', 'Neutral', 'Positive']
        return class_names[predicted_class], probabilities.squeeze().cpu().numpy()

# 예시 문장 예측
sample_text = "The company's profits have increased significantly this quarter."
label, probs = predict(sample_text)
print(f"\nSentence: {sample_text}")
print(f"Predicted Sentiment: {label}")
print(f"Probabilities: Negative {probs[0]*100:.2f}%, Neutral {probs[1]*100:.2f}%, Positive {probs[2]*100:.2f}%")


Epoch 1/20: 100%|██████████| 1009/1009 [00:20<00:00, 48.22it/s, loss=0.997]
  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


Epoch [1/20], Average Loss: 1.0850
Validation Loss after Epoch 1: 0.9682
Validation Accuracy after Epoch 1: 59.08%



Epoch 2/20: 100%|██████████| 1009/1009 [00:20<00:00, 49.12it/s, loss=0.839]


Epoch [2/20], Average Loss: 0.9152
Validation Loss after Epoch 2: 0.8515
Validation Accuracy after Epoch 2: 67.78%



Epoch 3/20: 100%|██████████| 1009/1009 [00:20<00:00, 48.93it/s, loss=0.745]


Epoch [3/20], Average Loss: 0.8163
Validation Loss after Epoch 3: 0.8126
Validation Accuracy after Epoch 3: 69.67%



Epoch 4/20: 100%|██████████| 1009/1009 [00:20<00:00, 48.67it/s, loss=0.866]


Epoch [4/20], Average Loss: 0.7654
Validation Loss after Epoch 4: 0.7879
Validation Accuracy after Epoch 4: 72.47%



Epoch 5/20: 100%|██████████| 1009/1009 [00:20<00:00, 48.73it/s, loss=0.882]


Epoch [5/20], Average Loss: 0.7238
Validation Loss after Epoch 5: 0.8282
Validation Accuracy after Epoch 5: 71.08%



Epoch 6/20: 100%|██████████| 1009/1009 [00:20<00:00, 49.12it/s, loss=0.462]


Epoch [6/20], Average Loss: 0.6884
Validation Loss after Epoch 6: 0.7920
Validation Accuracy after Epoch 6: 71.70%



Epoch 7/20: 100%|██████████| 1009/1009 [00:20<00:00, 49.13it/s, loss=0.752]


Epoch [7/20], Average Loss: 0.6595
Validation Loss after Epoch 7: 0.7701
Validation Accuracy after Epoch 7: 73.56%



Epoch 8/20: 100%|██████████| 1009/1009 [00:20<00:00, 49.15it/s, loss=0.675]


Epoch [8/20], Average Loss: 0.6315
Validation Loss after Epoch 8: 0.8044
Validation Accuracy after Epoch 8: 74.18%



Epoch 9/20: 100%|██████████| 1009/1009 [00:20<00:00, 49.03it/s, loss=0.463]


Epoch [9/20], Average Loss: 0.6058
Validation Loss after Epoch 9: 0.7979
Validation Accuracy after Epoch 9: 73.18%



Epoch 10/20: 100%|██████████| 1009/1009 [00:20<00:00, 49.10it/s, loss=0.682]


Epoch [10/20], Average Loss: 0.5820
Validation Loss after Epoch 10: 0.8357
Validation Accuracy after Epoch 10: 74.42%

Early stopping triggered.
Test Accuracy: 73.04%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.59      0.74      0.66       885
     Neutral       0.76      0.73      0.75      1357
    Positive       0.80      0.72      0.76      1794

    accuracy                           0.73      4036
   macro avg       0.72      0.73      0.72      4036
weighted avg       0.74      0.73      0.73      4036


Sentence: The company's profits have increased significantly this quarter.
Predicted Sentiment: Positive
Probabilities: Negative 10.43%, Neutral 7.26%, Positive 82.31%


In [3]:
#세가지 모델 테스트 중 hybrid

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, build_vocab_from_iterator
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm
import re

# 하이퍼파라미터 설정
BATCH_SIZE = 32
EMBEDDING_DIM = 200  # GloVe-Twitter 임베딩 차원
NUM_CLASSES = 3
NUM_EPOCHS = 20
LEARNING_RATE = 1e-4
MAX_VOCAB_SIZE = 20000
MAX_SEQ_LEN = 256

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 모델 타입 선택: 'hybrid', 'cnn', 'transformer'
model_type = 'hybrid'  # 'cnn' 또는 'transformer'로 변경하여 모델 변경

# 데이터셋 로드
dataset1 = load_dataset('financial_phrasebank', 'sentences_allagree')
dataset2 = load_dataset('TimKoornstra/financial-tweets-sentiment')

# 데이터프레임으로 변환
data1 = dataset1['train'].to_pandas()
data2 = dataset2['train'].to_pandas()

# dataset2의 레이블 매핑
label_mapping2 = {
    0: 1,  # Neutral -> Neutral (1)
    1: 2,  # Positive -> Positive (2)
    2: 0   # Negative -> Negative (0)
}

data2['label'] = data2['sentiment'].map(label_mapping2)
data2 = data2.rename(columns={'tweet': 'sentence'})
data2 = data2[['sentence', 'label']]  # 필요한 열만 선택

# 데이터 결합
combined_data = pd.concat([data1[['sentence', 'label']], data2], ignore_index=True)

# 데이터 분할
train_data, temp_data = train_test_split(
    combined_data, test_size=0.2, stratify=combined_data['label'], random_state=42)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

# 데이터 전처리 함수
def preprocess_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower().strip()
    return text

# 전처리 적용
train_data['sentence'] = train_data['sentence'].apply(preprocess_text)
val_data['sentence'] = val_data['sentence'].apply(preprocess_text)
test_data['sentence'] = test_data['sentence'].apply(preprocess_text)

# 어휘 사전 구축
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for sentence in data_iter:
        yield tokenizer(str(sentence))

vocab = build_vocab_from_iterator(
    yield_tokens(train_data['sentence']),
    max_tokens=MAX_VOCAB_SIZE,
    specials=['<pad>', '<unk>']
)
vocab.set_default_index(vocab['<unk>'])

# 사전 학습된 GloVe-Twitter 임베딩 로드
glove = GloVe(name='twitter.27B', dim=EMBEDDING_DIM)

# 임베딩 매트릭스 생성
embedding_matrix = torch.zeros(len(vocab), EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove.stoi:
        embedding_matrix[i] = glove[token]
    else:
        embedding_matrix[i] = torch.randn(EMBEDDING_DIM)

# 데이터셋 클래스 정의
class FinancialDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data.reset_index(drop=True)
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = str(self.data.loc[idx, 'sentence'])
        label = self.data.loc[idx, 'label']
        tokens = self.tokenizer(sentence)
        token_ids = [self.vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [self.vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return text_tensor, label_tensor

# 데이터셋 및 데이터 로더 생성
train_dataset = FinancialDataset(train_data, vocab, tokenizer)
val_dataset = FinancialDataset(val_data, vocab, tokenizer)
test_dataset = FinancialDataset(test_data, vocab, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 클래스 가중치 계산
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_data['label']),
    y=train_data['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# 모델 정의
if model_type == 'hybrid':
    class CNNTransformerModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
            super(CNNTransformerModel, self).__init__()
            self.embedding = nn.Embedding.from_pretrained(
                embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
            )
            self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

            # CNN 인코더
            self.cnn = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=3, padding=1)
            self.layer_norm_cnn = nn.LayerNorm(embedding_dim)

            # 트랜스포머 인코더 레이어
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=embedding_dim, nhead=10, dropout=0.1, activation='relu', batch_first=True
            )
            self.transformer_encoder = nn.TransformerEncoder(
                encoder_layer, num_layers=1, norm=nn.LayerNorm(embedding_dim)
            )

            # 출력 레이어
            self.dropout = nn.Dropout(0.6)
            self.fc = nn.Linear(embedding_dim, num_classes)

        def forward(self, x):
            batch_size, seq_len = x.size()
            x = self.embedding(x)
            positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)
            x = x + self.position_embedding(positions)
            x = x.permute(0, 2, 1)

            # CNN 인코더
            x = self.cnn(x)
            x = x.permute(0, 2, 1)
            x = self.layer_norm_cnn(x)
            x = nn.ReLU()(x)

            # 패딩 마스크 생성
            src_key_padding_mask = (x.abs().sum(dim=2) == 0)

            # 트랜스포머 인코더
            x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

            # 글로벌 평균 풀링
            x = x.mean(dim=1)
            x = self.dropout(x)
            logits = self.fc(x)
            return logits
    model = CNNTransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

elif model_type == 'cnn':
    class CNNModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
            super(CNNModel, self).__init__()
            self.embedding = nn.Embedding.from_pretrained(
                embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
            )

            # 여러 개의 CNN 레이어를 사용하여 다양한 커널 크기 적용
            self.convs = nn.ModuleList([
                nn.Conv1d(embedding_dim, 128, kernel_size=3, padding=1),
                nn.Conv1d(embedding_dim, 128, kernel_size=4, padding=2),
                nn.Conv1d(embedding_dim, 128, kernel_size=5, padding=2)
            ])
            self.dropout = nn.Dropout(0.5)
            self.fc = nn.Linear(128 * len(self.convs), num_classes)

        def forward(self, x):
            x = self.embedding(x)  # [batch_size, seq_len, embedding_dim]
            x = x.permute(0, 2, 1)  # [batch_size, embedding_dim, seq_len]
            x = [nn.ReLU()(conv(x)) for conv in self.convs]
            x = [nn.functional.max_pool1d(feature_map, kernel_size=feature_map.shape[2]).squeeze(2) for feature_map in x]
            x = torch.cat(x, dim=1)
            x = self.dropout(x)
            logits = self.fc(x)
            return logits
    model = CNNModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

elif model_type == 'transformer':
    class TransformerModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
            super(TransformerModel, self).__init__()
            self.embedding = nn.Embedding.from_pretrained(
                embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
            )
            self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

            # 트랜스포머 인코더 레이어
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=embedding_dim, nhead=8, dropout=0.1, activation='relu', batch_first=True
            )
            self.transformer_encoder = nn.TransformerEncoder(
                encoder_layer, num_layers=2, norm=nn.LayerNorm(embedding_dim)
            )

            # 출력 레이어
            self.dropout = nn.Dropout(0.5)
            self.fc = nn.Linear(embedding_dim, num_classes)

        def forward(self, x):
            batch_size, seq_len = x.size()
            x = self.embedding(x)
            positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)
            x = x + self.position_embedding(positions)

            # 패딩 마스크 생성
            src_key_padding_mask = (x.abs().sum(dim=2) == 0)

            # 트랜스포머 인코더
            x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

            # 글로벌 평균 풀링
            x = x.mean(dim=1)
            x = self.dropout(x)
            logits = self.fc(x)
            return logits
    model = TransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

# 손실 함수와 옵티마이저 설정
criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# 조기 종료 설정
early_stopping_patience = 3
best_val_loss = float('inf')
patience_counter = 0

# 학습 루프
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for texts, labels in progress_bar:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.4f}")

    # 검증 손실 및 정확도 계산
    val_loss = 0
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_loss /= len(val_loader)
    val_accuracy = correct / total
    print(f"Validation Loss after Epoch {epoch+1}: {val_loss:.4f}")
    print(f"Validation Accuracy after Epoch {epoch+1}: {val_accuracy * 100:.2f}%\n")

    # 조기 종료 체크
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # 모델 저장
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered.")
            break

# 저장된 모델 로드
model.load_state_dict(torch.load('best_model.pth'))

# 테스트 데이터로 평가
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# 분류 보고서 출력
print("\nClassification Report:")
target_names = ['Negative', 'Neutral', 'Positive']
print(classification_report(all_labels, all_preds, target_names=target_names))

# 예측 함수 정의 및 테스트
def predict(text):
    model.eval()
    with torch.no_grad():
        text = preprocess_text(text)
        tokens = tokenizer(text)
        token_ids = [vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
        outputs = model(text_tensor)
        probabilities = nn.functional.softmax(outputs, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        class_names = ['Negative', 'Neutral', 'Positive']
        return class_names[predicted_class], probabilities.squeeze().cpu().numpy()

# 예시 문장 예측
sample_text = "The company's profits have increased significantly this quarter."
label, probs = predict(sample_text)
print(f"\nSentence: {sample_text}")
print(f"Predicted Sentiment: {label}")
print(f"Probabilities: Negative {probs[0]*100:.2f}%, Neutral {probs[1]*100:.2f}%, Positive {probs[2]*100:.2f}%")


Epoch 1/20: 100%|██████████| 1009/1009 [00:22<00:00, 45.20it/s, loss=0.972]


Epoch [1/20], Average Loss: 1.0969
Validation Loss after Epoch 1: 1.0110
Validation Accuracy after Epoch 1: 48.48%



Epoch 2/20: 100%|██████████| 1009/1009 [00:22<00:00, 45.67it/s, loss=0.937]


Epoch [2/20], Average Loss: 0.9313
Validation Loss after Epoch 2: 0.8735
Validation Accuracy after Epoch 2: 60.62%



Epoch 3/20: 100%|██████████| 1009/1009 [00:21<00:00, 47.09it/s, loss=0.909]


Epoch [3/20], Average Loss: 0.8333
Validation Loss after Epoch 3: 0.8148
Validation Accuracy after Epoch 3: 68.35%



Epoch 4/20: 100%|██████████| 1009/1009 [00:21<00:00, 47.22it/s, loss=0.725]


Epoch [4/20], Average Loss: 0.7685
Validation Loss after Epoch 4: 0.7945
Validation Accuracy after Epoch 4: 71.57%



Epoch 5/20: 100%|██████████| 1009/1009 [00:21<00:00, 47.41it/s, loss=0.727]


Epoch [5/20], Average Loss: 0.7301
Validation Loss after Epoch 5: 0.7904
Validation Accuracy after Epoch 5: 71.67%



Epoch 6/20: 100%|██████████| 1009/1009 [00:21<00:00, 47.28it/s, loss=0.832]


Epoch [6/20], Average Loss: 0.6904
Validation Loss after Epoch 6: 0.8062
Validation Accuracy after Epoch 6: 73.21%



Epoch 7/20: 100%|██████████| 1009/1009 [00:21<00:00, 47.37it/s, loss=0.638]


Epoch [7/20], Average Loss: 0.6606
Validation Loss after Epoch 7: 0.7906
Validation Accuracy after Epoch 7: 70.66%



Epoch 8/20: 100%|██████████| 1009/1009 [00:21<00:00, 47.30it/s, loss=0.745]


Epoch [8/20], Average Loss: 0.6315
Validation Loss after Epoch 8: 0.7847
Validation Accuracy after Epoch 8: 74.47%



Epoch 9/20: 100%|██████████| 1009/1009 [00:21<00:00, 47.52it/s, loss=0.72]


Epoch [9/20], Average Loss: 0.6061
Validation Loss after Epoch 9: 0.7906
Validation Accuracy after Epoch 9: 72.02%



Epoch 10/20: 100%|██████████| 1009/1009 [00:21<00:00, 47.32it/s, loss=0.651]


Epoch [10/20], Average Loss: 0.5807
Validation Loss after Epoch 10: 0.8173
Validation Accuracy after Epoch 10: 72.74%



Epoch 11/20: 100%|██████████| 1009/1009 [00:21<00:00, 47.58it/s, loss=0.375]


Epoch [11/20], Average Loss: 0.5572
Validation Loss after Epoch 11: 0.8161
Validation Accuracy after Epoch 11: 73.98%

Early stopping triggered.
Test Accuracy: 74.28%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.67      0.64      0.65       885
     Neutral       0.74      0.77      0.76      1357
    Positive       0.78      0.77      0.78      1794

    accuracy                           0.74      4036
   macro avg       0.73      0.73      0.73      4036
weighted avg       0.74      0.74      0.74      4036


Sentence: The company's profits have increased significantly this quarter.
Predicted Sentiment: Positive
Probabilities: Negative 16.78%, Neutral 5.53%, Positive 77.70%


In [4]:
#세가지 모델 테스트 중 CNN

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, build_vocab_from_iterator
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm
import re

# 하이퍼파라미터 설정
BATCH_SIZE = 32
EMBEDDING_DIM = 200  # GloVe-Twitter 임베딩 차원
NUM_CLASSES = 3
NUM_EPOCHS = 20
LEARNING_RATE = 1e-4
MAX_VOCAB_SIZE = 20000
MAX_SEQ_LEN = 256

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 모델 타입 선택: 'hybrid', 'cnn', 'transformer'
model_type = 'cnn'  # 'cnn' 또는 'transformer'로 변경하여 모델 변경

# 데이터셋 로드
dataset1 = load_dataset('financial_phrasebank', 'sentences_allagree')
dataset2 = load_dataset('TimKoornstra/financial-tweets-sentiment')

# 데이터프레임으로 변환
data1 = dataset1['train'].to_pandas()
data2 = dataset2['train'].to_pandas()

# dataset2의 레이블 매핑
label_mapping2 = {
    0: 1,  # Neutral -> Neutral (1)
    1: 2,  # Positive -> Positive (2)
    2: 0   # Negative -> Negative (0)
}

data2['label'] = data2['sentiment'].map(label_mapping2)
data2 = data2.rename(columns={'tweet': 'sentence'})
data2 = data2[['sentence', 'label']]  # 필요한 열만 선택

# 데이터 결합
combined_data = pd.concat([data1[['sentence', 'label']], data2], ignore_index=True)

# 데이터 분할
train_data, temp_data = train_test_split(
    combined_data, test_size=0.2, stratify=combined_data['label'], random_state=42)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

# 데이터 전처리 함수
def preprocess_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower().strip()
    return text

# 전처리 적용
train_data['sentence'] = train_data['sentence'].apply(preprocess_text)
val_data['sentence'] = val_data['sentence'].apply(preprocess_text)
test_data['sentence'] = test_data['sentence'].apply(preprocess_text)

# 어휘 사전 구축
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for sentence in data_iter:
        yield tokenizer(str(sentence))

vocab = build_vocab_from_iterator(
    yield_tokens(train_data['sentence']),
    max_tokens=MAX_VOCAB_SIZE,
    specials=['<pad>', '<unk>']
)
vocab.set_default_index(vocab['<unk>'])

# 사전 학습된 GloVe-Twitter 임베딩 로드
glove = GloVe(name='twitter.27B', dim=EMBEDDING_DIM)

# 임베딩 매트릭스 생성
embedding_matrix = torch.zeros(len(vocab), EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove.stoi:
        embedding_matrix[i] = glove[token]
    else:
        embedding_matrix[i] = torch.randn(EMBEDDING_DIM)

# 데이터셋 클래스 정의
class FinancialDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data.reset_index(drop=True)
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = str(self.data.loc[idx, 'sentence'])
        label = self.data.loc[idx, 'label']
        tokens = self.tokenizer(sentence)
        token_ids = [self.vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [self.vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return text_tensor, label_tensor

# 데이터셋 및 데이터 로더 생성
train_dataset = FinancialDataset(train_data, vocab, tokenizer)
val_dataset = FinancialDataset(val_data, vocab, tokenizer)
test_dataset = FinancialDataset(test_data, vocab, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 클래스 가중치 계산
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_data['label']),
    y=train_data['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# 모델 정의
if model_type == 'hybrid':
    class CNNTransformerModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
            super(CNNTransformerModel, self).__init__()
            self.embedding = nn.Embedding.from_pretrained(
                embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
            )
            self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

            # CNN 인코더
            self.cnn = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=3, padding=1)
            self.layer_norm_cnn = nn.LayerNorm(embedding_dim)

            # 트랜스포머 인코더 레이어
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=embedding_dim, nhead=10, dropout=0.1, activation='relu', batch_first=True
            )
            self.transformer_encoder = nn.TransformerEncoder(
                encoder_layer, num_layers=1, norm=nn.LayerNorm(embedding_dim)
            )

            # 출력 레이어
            self.dropout = nn.Dropout(0.6)
            self.fc = nn.Linear(embedding_dim, num_classes)

        def forward(self, x):
            batch_size, seq_len = x.size()
            x = self.embedding(x)
            positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)
            x = x + self.position_embedding(positions)
            x = x.permute(0, 2, 1)

            # CNN 인코더
            x = self.cnn(x)
            x = x.permute(0, 2, 1)
            x = self.layer_norm_cnn(x)
            x = nn.ReLU()(x)

            # 패딩 마스크 생성
            src_key_padding_mask = (x.abs().sum(dim=2) == 0)

            # 트랜스포머 인코더
            x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

            # 글로벌 평균 풀링
            x = x.mean(dim=1)
            x = self.dropout(x)
            logits = self.fc(x)
            return logits
    model = CNNTransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

elif model_type == 'cnn':
    class CNNModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
            super(CNNModel, self).__init__()
            self.embedding = nn.Embedding.from_pretrained(
                embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
            )

            # 여러 개의 CNN 레이어를 사용하여 다양한 커널 크기 적용
            self.convs = nn.ModuleList([
                nn.Conv1d(embedding_dim, 128, kernel_size=3, padding=1),
                nn.Conv1d(embedding_dim, 128, kernel_size=4, padding=2),
                nn.Conv1d(embedding_dim, 128, kernel_size=5, padding=2)
            ])
            self.dropout = nn.Dropout(0.5)
            self.fc = nn.Linear(128 * len(self.convs), num_classes)

        def forward(self, x):
            x = self.embedding(x)  # [batch_size, seq_len, embedding_dim]
            x = x.permute(0, 2, 1)  # [batch_size, embedding_dim, seq_len]
            x = [nn.ReLU()(conv(x)) for conv in self.convs]
            x = [nn.functional.max_pool1d(feature_map, kernel_size=feature_map.shape[2]).squeeze(2) for feature_map in x]
            x = torch.cat(x, dim=1)
            x = self.dropout(x)
            logits = self.fc(x)
            return logits
    model = CNNModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

elif model_type == 'transformer':
    class TransformerModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
            super(TransformerModel, self).__init__()
            self.embedding = nn.Embedding.from_pretrained(
                embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
            )
            self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

            # 트랜스포머 인코더 레이어
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=embedding_dim, nhead=8, dropout=0.1, activation='relu', batch_first=True
            )
            self.transformer_encoder = nn.TransformerEncoder(
                encoder_layer, num_layers=2, norm=nn.LayerNorm(embedding_dim)
            )

            # 출력 레이어
            self.dropout = nn.Dropout(0.5)
            self.fc = nn.Linear(embedding_dim, num_classes)

        def forward(self, x):
            batch_size, seq_len = x.size()
            x = self.embedding(x)
            positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)
            x = x + self.position_embedding(positions)

            # 패딩 마스크 생성
            src_key_padding_mask = (x.abs().sum(dim=2) == 0)

            # 트랜스포머 인코더
            x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

            # 글로벌 평균 풀링
            x = x.mean(dim=1)
            x = self.dropout(x)
            logits = self.fc(x)
            return logits
    model = TransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

# 손실 함수와 옵티마이저 설정
criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# 조기 종료 설정
early_stopping_patience = 3
best_val_loss = float('inf')
patience_counter = 0

# 학습 루프
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for texts, labels in progress_bar:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.4f}")

    # 검증 손실 및 정확도 계산
    val_loss = 0
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_loss /= len(val_loader)
    val_accuracy = correct / total
    print(f"Validation Loss after Epoch {epoch+1}: {val_loss:.4f}")
    print(f"Validation Accuracy after Epoch {epoch+1}: {val_accuracy * 100:.2f}%\n")

    # 조기 종료 체크
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # 모델 저장
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered.")
            break

# 저장된 모델 로드
model.load_state_dict(torch.load('best_model.pth'))

# 테스트 데이터로 평가
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# 분류 보고서 출력
print("\nClassification Report:")
target_names = ['Negative', 'Neutral', 'Positive']
print(classification_report(all_labels, all_preds, target_names=target_names))

# 예측 함수 정의 및 테스트
def predict(text):
    model.eval()
    with torch.no_grad():
        text = preprocess_text(text)
        tokens = tokenizer(text)
        token_ids = [vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
        outputs = model(text_tensor)
        probabilities = nn.functional.softmax(outputs, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        class_names = ['Negative', 'Neutral', 'Positive']
        return class_names[predicted_class], probabilities.squeeze().cpu().numpy()

# 예시 문장 예측
sample_text = "The company's profits have increased significantly this quarter."
label, probs = predict(sample_text)
print(f"\nSentence: {sample_text}")
print(f"Predicted Sentiment: {label}")
print(f"Probabilities: Negative {probs[0]*100:.2f}%, Neutral {probs[1]*100:.2f}%, Positive {probs[2]*100:.2f}%")


Epoch 1/20: 100%|██████████| 1009/1009 [00:09<00:00, 101.71it/s, loss=0.946]


Epoch [1/20], Average Loss: 1.0159
Validation Loss after Epoch 1: 0.9316
Validation Accuracy after Epoch 1: 61.14%



Epoch 2/20: 100%|██████████| 1009/1009 [00:09<00:00, 102.61it/s, loss=0.859]


Epoch [2/20], Average Loss: 0.8995
Validation Loss after Epoch 2: 0.8675
Validation Accuracy after Epoch 2: 66.86%



Epoch 3/20: 100%|██████████| 1009/1009 [00:09<00:00, 103.65it/s, loss=0.679]


Epoch [3/20], Average Loss: 0.8239
Validation Loss after Epoch 3: 0.8217
Validation Accuracy after Epoch 3: 69.02%



Epoch 4/20: 100%|██████████| 1009/1009 [00:09<00:00, 103.76it/s, loss=0.806]


Epoch [4/20], Average Loss: 0.7672
Validation Loss after Epoch 4: 0.7944
Validation Accuracy after Epoch 4: 70.33%



Epoch 5/20: 100%|██████████| 1009/1009 [00:09<00:00, 103.75it/s, loss=0.703]


Epoch [5/20], Average Loss: 0.7227
Validation Loss after Epoch 5: 0.7834
Validation Accuracy after Epoch 5: 72.89%



Epoch 6/20: 100%|██████████| 1009/1009 [00:09<00:00, 103.34it/s, loss=0.557]


Epoch [6/20], Average Loss: 0.6880
Validation Loss after Epoch 6: 0.7730
Validation Accuracy after Epoch 6: 72.99%



Epoch 7/20: 100%|██████████| 1009/1009 [00:09<00:00, 103.18it/s, loss=0.683]


Epoch [7/20], Average Loss: 0.6571
Validation Loss after Epoch 7: 0.7632
Validation Accuracy after Epoch 7: 73.71%



Epoch 8/20: 100%|██████████| 1009/1009 [00:09<00:00, 102.82it/s, loss=0.664]


Epoch [8/20], Average Loss: 0.6293
Validation Loss after Epoch 8: 0.7565
Validation Accuracy after Epoch 8: 74.40%



Epoch 9/20: 100%|██████████| 1009/1009 [00:09<00:00, 103.26it/s, loss=0.575]


Epoch [9/20], Average Loss: 0.6056
Validation Loss after Epoch 9: 0.7617
Validation Accuracy after Epoch 9: 75.04%



Epoch 10/20: 100%|██████████| 1009/1009 [00:09<00:00, 103.35it/s, loss=0.634]


Epoch [10/20], Average Loss: 0.5851
Validation Loss after Epoch 10: 0.7551
Validation Accuracy after Epoch 10: 74.32%



Epoch 11/20: 100%|██████████| 1009/1009 [00:09<00:00, 103.38it/s, loss=0.494]


Epoch [11/20], Average Loss: 0.5645
Validation Loss after Epoch 11: 0.7595
Validation Accuracy after Epoch 11: 75.14%



Epoch 12/20: 100%|██████████| 1009/1009 [00:09<00:00, 103.72it/s, loss=0.647]


Epoch [12/20], Average Loss: 0.5495
Validation Loss after Epoch 12: 0.7585
Validation Accuracy after Epoch 12: 75.14%



Epoch 13/20: 100%|██████████| 1009/1009 [00:09<00:00, 102.59it/s, loss=0.575]


Epoch [13/20], Average Loss: 0.5344
Validation Loss after Epoch 13: 0.7609
Validation Accuracy after Epoch 13: 74.92%

Early stopping triggered.
Test Accuracy: 75.30%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.64      0.76      0.69       885
     Neutral       0.75      0.80      0.77      1357
    Positive       0.84      0.71      0.77      1794

    accuracy                           0.75      4036
   macro avg       0.74      0.76      0.75      4036
weighted avg       0.76      0.75      0.75      4036


Sentence: The company's profits have increased significantly this quarter.
Predicted Sentiment: Positive
Probabilities: Negative 35.95%, Neutral 7.24%, Positive 56.80%


In [5]:
#세가지 모델 테스트 중 Transformer

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, build_vocab_from_iterator
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm
import re

# 하이퍼파라미터 설정
BATCH_SIZE = 32
EMBEDDING_DIM = 200  # GloVe-Twitter 임베딩 차원
NUM_CLASSES = 3
NUM_EPOCHS = 20
LEARNING_RATE = 1e-4
MAX_VOCAB_SIZE = 20000
MAX_SEQ_LEN = 256

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 모델 타입 선택: 'hybrid', 'cnn', 'transformer'
model_type = 'transformer'  # 'cnn' 또는 'transformer'로 변경하여 모델 변경

# 데이터셋 로드
dataset1 = load_dataset('financial_phrasebank', 'sentences_allagree')
dataset2 = load_dataset('TimKoornstra/financial-tweets-sentiment')

# 데이터프레임으로 변환
data1 = dataset1['train'].to_pandas()
data2 = dataset2['train'].to_pandas()

# dataset2의 레이블 매핑
label_mapping2 = {
    0: 1,  # Neutral -> Neutral (1)
    1: 2,  # Positive -> Positive (2)
    2: 0   # Negative -> Negative (0)
}

data2['label'] = data2['sentiment'].map(label_mapping2)
data2 = data2.rename(columns={'tweet': 'sentence'})
data2 = data2[['sentence', 'label']]  # 필요한 열만 선택

# 데이터 결합
combined_data = pd.concat([data1[['sentence', 'label']], data2], ignore_index=True)

# 데이터 분할
train_data, temp_data = train_test_split(
    combined_data, test_size=0.2, stratify=combined_data['label'], random_state=42)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

# 데이터 전처리 함수
def preprocess_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower().strip()
    return text

# 전처리 적용
train_data['sentence'] = train_data['sentence'].apply(preprocess_text)
val_data['sentence'] = val_data['sentence'].apply(preprocess_text)
test_data['sentence'] = test_data['sentence'].apply(preprocess_text)

# 어휘 사전 구축
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for sentence in data_iter:
        yield tokenizer(str(sentence))

vocab = build_vocab_from_iterator(
    yield_tokens(train_data['sentence']),
    max_tokens=MAX_VOCAB_SIZE,
    specials=['<pad>', '<unk>']
)
vocab.set_default_index(vocab['<unk>'])

# 사전 학습된 GloVe-Twitter 임베딩 로드
glove = GloVe(name='twitter.27B', dim=EMBEDDING_DIM)

# 임베딩 매트릭스 생성
embedding_matrix = torch.zeros(len(vocab), EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove.stoi:
        embedding_matrix[i] = glove[token]
    else:
        embedding_matrix[i] = torch.randn(EMBEDDING_DIM)

# 데이터셋 클래스 정의
class FinancialDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data.reset_index(drop=True)
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = str(self.data.loc[idx, 'sentence'])
        label = self.data.loc[idx, 'label']
        tokens = self.tokenizer(sentence)
        token_ids = [self.vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [self.vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return text_tensor, label_tensor

# 데이터셋 및 데이터 로더 생성
train_dataset = FinancialDataset(train_data, vocab, tokenizer)
val_dataset = FinancialDataset(val_data, vocab, tokenizer)
test_dataset = FinancialDataset(test_data, vocab, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 클래스 가중치 계산
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_data['label']),
    y=train_data['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# 모델 정의
if model_type == 'hybrid':
    class CNNTransformerModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
            super(CNNTransformerModel, self).__init__()
            self.embedding = nn.Embedding.from_pretrained(
                embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
            )
            self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

            # CNN 인코더
            self.cnn = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=3, padding=1)
            self.layer_norm_cnn = nn.LayerNorm(embedding_dim)

            # 트랜스포머 인코더 레이어
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=embedding_dim, nhead=10, dropout=0.1, activation='relu', batch_first=True
            )
            self.transformer_encoder = nn.TransformerEncoder(
                encoder_layer, num_layers=1, norm=nn.LayerNorm(embedding_dim)
            )

            # 출력 레이어
            self.dropout = nn.Dropout(0.6)
            self.fc = nn.Linear(embedding_dim, num_classes)

        def forward(self, x):
            batch_size, seq_len = x.size()
            x = self.embedding(x)
            positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)
            x = x + self.position_embedding(positions)
            x = x.permute(0, 2, 1)

            # CNN 인코더
            x = self.cnn(x)
            x = x.permute(0, 2, 1)
            x = self.layer_norm_cnn(x)
            x = nn.ReLU()(x)

            # 패딩 마스크 생성
            src_key_padding_mask = (x.abs().sum(dim=2) == 0)

            # 트랜스포머 인코더
            x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

            # 글로벌 평균 풀링
            x = x.mean(dim=1)
            x = self.dropout(x)
            logits = self.fc(x)
            return logits
    model = CNNTransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

elif model_type == 'cnn':
    class CNNModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
            super(CNNModel, self).__init__()
            self.embedding = nn.Embedding.from_pretrained(
                embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
            )

            # 여러 개의 CNN 레이어를 사용하여 다양한 커널 크기 적용
            self.convs = nn.ModuleList([
                nn.Conv1d(embedding_dim, 128, kernel_size=3, padding=1),
                nn.Conv1d(embedding_dim, 128, kernel_size=4, padding=2),
                nn.Conv1d(embedding_dim, 128, kernel_size=5, padding=2)
            ])
            self.dropout = nn.Dropout(0.5)
            self.fc = nn.Linear(128 * len(self.convs), num_classes)

        def forward(self, x):
            x = self.embedding(x)  # [batch_size, seq_len, embedding_dim]
            x = x.permute(0, 2, 1)  # [batch_size, embedding_dim, seq_len]
            x = [nn.ReLU()(conv(x)) for conv in self.convs]
            x = [nn.functional.max_pool1d(feature_map, kernel_size=feature_map.shape[2]).squeeze(2) for feature_map in x]
            x = torch.cat(x, dim=1)
            x = self.dropout(x)
            logits = self.fc(x)
            return logits
    model = CNNModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

elif model_type == 'transformer':
    class TransformerModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix):
            super(TransformerModel, self).__init__()
            self.embedding = nn.Embedding.from_pretrained(
                embedding_matrix, freeze=False, padding_idx=vocab['<pad>']
            )
            self.position_embedding = nn.Embedding(MAX_SEQ_LEN, embedding_dim)

            # 트랜스포머 인코더 레이어
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=embedding_dim, nhead=8, dropout=0.1, activation='relu', batch_first=True
            )
            self.transformer_encoder = nn.TransformerEncoder(
                encoder_layer, num_layers=2, norm=nn.LayerNorm(embedding_dim)
            )

            # 출력 레이어
            self.dropout = nn.Dropout(0.5)
            self.fc = nn.Linear(embedding_dim, num_classes)

        def forward(self, x):
            batch_size, seq_len = x.size()
            x = self.embedding(x)
            positions = torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len).to(device)
            x = x + self.position_embedding(positions)

            # 패딩 마스크 생성
            src_key_padding_mask = (x.abs().sum(dim=2) == 0)

            # 트랜스포머 인코더
            x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

            # 글로벌 평균 풀링
            x = x.mean(dim=1)
            x = self.dropout(x)
            logits = self.fc(x)
            return logits
    model = TransformerModel(len(vocab), EMBEDDING_DIM, NUM_CLASSES, embedding_matrix).to(device)

# 손실 함수와 옵티마이저 설정
criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# 조기 종료 설정
early_stopping_patience = 3
best_val_loss = float('inf')
patience_counter = 0

# 학습 루프
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for texts, labels in progress_bar:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.4f}")

    # 검증 손실 및 정확도 계산
    val_loss = 0
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_loss /= len(val_loader)
    val_accuracy = correct / total
    print(f"Validation Loss after Epoch {epoch+1}: {val_loss:.4f}")
    print(f"Validation Accuracy after Epoch {epoch+1}: {val_accuracy * 100:.2f}%\n")

    # 조기 종료 체크
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # 모델 저장
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered.")
            break

# 저장된 모델 로드
model.load_state_dict(torch.load('best_model.pth'))

# 테스트 데이터로 평가
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# 분류 보고서 출력
print("\nClassification Report:")
target_names = ['Negative', 'Neutral', 'Positive']
print(classification_report(all_labels, all_preds, target_names=target_names))

# 예측 함수 정의 및 테스트
def predict(text):
    model.eval()
    with torch.no_grad():
        text = preprocess_text(text)
        tokens = tokenizer(text)
        token_ids = [vocab[token] for token in tokens]
        if len(token_ids) > MAX_SEQ_LEN:
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            token_ids += [vocab['<pad>']] * (MAX_SEQ_LEN - len(token_ids))
        text_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
        outputs = model(text_tensor)
        probabilities = nn.functional.softmax(outputs, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        class_names = ['Negative', 'Neutral', 'Positive']
        return class_names[predicted_class], probabilities.squeeze().cpu().numpy()

# 예시 문장 예측
sample_text = "The company's profits have increased significantly this quarter."
label, probs = predict(sample_text)
print(f"\nSentence: {sample_text}")
print(f"Predicted Sentiment: {label}")
print(f"Probabilities: Negative {probs[0]*100:.2f}%, Neutral {probs[1]*100:.2f}%, Positive {probs[2]*100:.2f}%")


Epoch 1/20: 100%|██████████| 1009/1009 [00:41<00:00, 24.49it/s, loss=1.06]


Epoch [1/20], Average Loss: 1.0701
Validation Loss after Epoch 1: 0.9451
Validation Accuracy after Epoch 1: 53.48%



Epoch 2/20: 100%|██████████| 1009/1009 [00:40<00:00, 24.64it/s, loss=0.8]


Epoch [2/20], Average Loss: 0.8941
Validation Loss after Epoch 2: 0.8417
Validation Accuracy after Epoch 2: 66.39%



Epoch 3/20: 100%|██████████| 1009/1009 [00:41<00:00, 24.57it/s, loss=0.66]


Epoch [3/20], Average Loss: 0.8108
Validation Loss after Epoch 3: 0.7975
Validation Accuracy after Epoch 3: 70.04%



Epoch 4/20: 100%|██████████| 1009/1009 [00:40<00:00, 24.64it/s, loss=0.825]


Epoch [4/20], Average Loss: 0.7591
Validation Loss after Epoch 4: 0.7882
Validation Accuracy after Epoch 4: 71.90%



Epoch 5/20: 100%|██████████| 1009/1009 [00:40<00:00, 24.62it/s, loss=0.426]


Epoch [5/20], Average Loss: 0.7218
Validation Loss after Epoch 5: 0.8127
Validation Accuracy after Epoch 5: 71.95%



Epoch 6/20: 100%|██████████| 1009/1009 [00:41<00:00, 24.60it/s, loss=0.606]


Epoch [6/20], Average Loss: 0.6901
Validation Loss after Epoch 6: 0.7842
Validation Accuracy after Epoch 6: 72.89%



Epoch 7/20: 100%|██████████| 1009/1009 [00:41<00:00, 24.59it/s, loss=0.547]


Epoch [7/20], Average Loss: 0.6605
Validation Loss after Epoch 7: 0.7871
Validation Accuracy after Epoch 7: 74.80%



Epoch 8/20: 100%|██████████| 1009/1009 [00:41<00:00, 24.58it/s, loss=0.683]


Epoch [8/20], Average Loss: 0.6318
Validation Loss after Epoch 8: 0.7819
Validation Accuracy after Epoch 8: 72.29%



Epoch 9/20: 100%|██████████| 1009/1009 [00:40<00:00, 24.64it/s, loss=0.73]


Epoch [9/20], Average Loss: 0.6059
Validation Loss after Epoch 9: 0.7748
Validation Accuracy after Epoch 9: 73.36%



Epoch 10/20: 100%|██████████| 1009/1009 [00:40<00:00, 24.64it/s, loss=0.65]


Epoch [10/20], Average Loss: 0.5838
Validation Loss after Epoch 10: 0.8118
Validation Accuracy after Epoch 10: 72.52%



Epoch 11/20: 100%|██████████| 1009/1009 [00:40<00:00, 24.61it/s, loss=0.596]


Epoch [11/20], Average Loss: 0.5580
Validation Loss after Epoch 11: 0.8139
Validation Accuracy after Epoch 11: 73.78%



Epoch 12/20: 100%|██████████| 1009/1009 [00:40<00:00, 24.64it/s, loss=0.7]


Epoch [12/20], Average Loss: 0.5356
Validation Loss after Epoch 12: 0.8526
Validation Accuracy after Epoch 12: 73.38%

Early stopping triggered.
Test Accuracy: 73.07%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.60      0.73      0.66       885
     Neutral       0.73      0.75      0.74      1357
    Positive       0.82      0.71      0.76      1794

    accuracy                           0.73      4036
   macro avg       0.72      0.73      0.72      4036
weighted avg       0.74      0.73      0.73      4036


Sentence: The company's profits have increased significantly this quarter.
Predicted Sentiment: Neutral
Probabilities: Negative 33.67%, Neutral 39.89%, Positive 26.44%
