# Sparse Embedding

In [None]:
!pip install Korpora

Collecting Korpora
  Downloading Korpora-0.2.0-py3-none-any.whl.metadata (26 kB)
Collecting dataclasses>=0.6 (from Korpora)
  Downloading dataclasses-0.6-py3-none-any.whl.metadata (3.0 kB)
Downloading Korpora-0.2.0-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dataclasses-0.6-py3-none-any.whl (14 kB)
Installing collected packages: dataclasses, Korpora
Successfully installed Korpora-0.2.0 dataclasses-0.6


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import tensorflow_datasets as tfds
from sklearn.metrics import f1_score

## TfidfVectorizer

In [None]:
# Load the IMDB reviews corpus as (text, label) pairs. Both the train and the
# test split are requested in a single call; input files are read in shuffled order.
ds_train, ds_test = tfds.load('imdb_reviews', split=['train', 'test'], as_supervised=True, shuffle_files=True)



[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\wjl01\tensorflow_datasets\imdb_reviews\plain_text\1.0.0...[0m


  from .autonotebook import tqdm as notebook_tqdm
Dl Size...: 100%|██████████| 80/80 [00:17<00:00,  4.66 MiB/s]rl]
Dl Completed...: 100%|██████████| 1/1 [00:17<00:00, 17.17s/ url]

                                                                        

[1mDataset imdb_reviews downloaded and prepared to C:\Users\wjl01\tensorflow_datasets\imdb_reviews\plain_text\1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
# Convert a supervised TFDS dataset into (text array, label array)
def ds_to_xy(ds):
    """Materialise a (text, label) dataset as two NumPy arrays.

    Each element yielded by ``tfds.as_numpy(ds)`` is unpacked as
    ``(text, label)``; the text is decoded from UTF-8 bytes and the label is
    cast to ``int``.

    Returns:
        A tuple ``(texts, labels)`` of NumPy arrays in iteration order.
    """
    pairs = [(raw.decode('utf-8'), int(lbl)) for raw, lbl in tfds.as_numpy(ds)]
    texts = [text for text, _ in pairs]
    labels = [lbl for _, lbl in pairs]
    return np.array(texts), np.array(labels)

# Materialise both splits as arrays of raw review strings and integer labels.
X_train_en, y_train_en = ds_to_xy(ds_train)
X_val_en, y_val_en = ds_to_xy(ds_test)

# 1. Fit the TF-IDF vocabulary on the training reviews only, then reuse it for
#    the validation reviews (transform without re-fitting).
vectorizer = TfidfVectorizer(max_features=10000)
# Cast the sparse matrix to float32 *before* densifying: the model only ever
# consumes float32, so this avoids allocating a dense float64 matrix of twice
# the size that would immediately be copied and discarded.
X_train_vec = vectorizer.fit_transform(X_train_en).astype(np.float32).toarray()
X_val_vec = vectorizer.transform(X_val_en).astype(np.float32).toarray()

# 2. NumPy -> PyTorch tensors. from_numpy wraps the existing buffer instead of
#    copying it; labels get a trailing axis so they match the (batch, 1) logits.
X_train_tensor = torch.from_numpy(X_train_vec)
y_train_tensor = torch.tensor(y_train_en, dtype=torch.float32).unsqueeze(1)

X_val_tensor = torch.from_numpy(X_val_vec)
y_val_tensor = torch.tensor(y_val_en, dtype=torch.float32).unsqueeze(1)

# 3. Datasets and loaders (only the training loader shuffles).
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# 4. Model definition
class SentimentClassifier(nn.Module):
    """Two-layer MLP producing one raw logit per input row.

    Architecture: Linear(input_dim, 128) -> ReLU -> Dropout(0.4) -> Linear(128, 1).
    No sigmoid is applied here; the output is a logit for binary classification.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_dim = 128
        # Layers are created in forward order so parameter initialisation
        # consumes the RNG in the same sequence as a literal nn.Sequential(...).
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.4),
            nn.Linear(hidden_dim, 1),  # single logit: binary classification
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.model(x)
        return logits

# 5. Device, model, loss and optimiser
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Size the input layer from the actual feature matrix rather than repeating the
# literal 10000: max_features is only an upper bound on the vocabulary size, so
# the two numbers can silently disagree on a smaller corpus.
# The model is moved to the device before the optimiser is built from its parameters.
model = SentimentClassifier(input_dim=X_train_tensor.shape[1]).to(device)

criterion = nn.BCEWithLogitsLoss()  # applies the sigmoid internally; model outputs raw logits
optimizer = optim.Adam(model.parameters(), lr=5e-4)

# 6. Training loop: five passes over the training loader, each followed by a
#    full pass over the validation loader. Loss, accuracy and F1 are printed
#    for both phases every epoch.
for epoch in range(5):
    # ---------- Training ----------
    model.train()  # enables dropout for the updates below
    total_loss = 0
    all_preds = []
    all_labels = []

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        logits = model(X_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Training metrics reuse the logits from the update step, so they are
        # computed with dropout active and before this batch's weight update.
        probs = torch.sigmoid(logits)
        preds = (probs >= 0.5).float()
        all_preds.extend(preds.cpu().numpy().astype(int))
        all_labels.extend(y_batch.cpu().numpy().astype(int))

    # Average of the per-batch losses (not weighted by batch size).
    avg_loss = total_loss / len(train_loader)
    acc = (np.array(all_preds) == np.array(all_labels)).mean()
    f1 = f1_score(all_labels, all_preds)

    print(f"[Train] Epoch {epoch+1} - Loss: {avg_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f}")

    # ---------- Validation ----------
    model.eval()  # disables dropout
    val_loss = 0
    val_preds = []
    val_labels = []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(X_batch)
            loss = criterion(logits, y_batch)
            val_loss += loss.item()

            # Threshold the sigmoid probability at 0.5 to get hard 0/1 predictions.
            probs = torch.sigmoid(logits)
            preds = (probs >= 0.5).float()
            val_preds.extend(preds.cpu().numpy().astype(int))
            val_labels.extend(y_batch.cpu().numpy().astype(int))

    val_avg_loss = val_loss / len(val_loader)
    val_acc = (np.array(val_preds) == np.array(val_labels)).mean()
    val_f1 = f1_score(val_labels, val_preds)

    print(f"[Val]   Epoch {epoch+1} - Loss: {val_avg_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")

[Train] Epoch 1 - Loss: 0.4897 | Acc: 0.8464 | F1: 0.8470
[Val]   Epoch 1 - Loss: 0.3314 | Acc: 0.8806 | F1: 0.8815
[Train] Epoch 2 - Loss: 0.2527 | Acc: 0.9091 | F1: 0.9097
[Val]   Epoch 2 - Loss: 0.2766 | Acc: 0.8864 | F1: 0.8840
[Train] Epoch 3 - Loss: 0.1939 | Acc: 0.9303 | F1: 0.9305
[Val]   Epoch 3 - Loss: 0.2689 | Acc: 0.8880 | F1: 0.8877
[Train] Epoch 4 - Loss: 0.1609 | Acc: 0.9430 | F1: 0.9431
[Val]   Epoch 4 - Loss: 0.2828 | Acc: 0.8816 | F1: 0.8781
[Train] Epoch 5 - Loss: 0.1373 | Acc: 0.9534 | F1: 0.9535
[Val]   Epoch 5 - Loss: 0.2969 | Acc: 0.8768 | F1: 0.8737


### 한국어 데이터로 test

영어 말뭉치로 학습(fit)한 vectorizer로 한국어 데이터를 transform 하면, 어휘 사전에 한국어 토큰이 거의 없기 때문에 대부분의 문서가 0 벡터가 된다.

In [None]:
from Korpora import Korpora
from sklearn.metrics import accuracy_score, f1_score

# 1. Load NSMC (Naver sentiment movie corpus); labels arrive as strings.
corpus = Korpora.load("nsmc")
X_nsmc = corpus.get_all_texts()
y_nsmc = list(map(int, corpus.get_all_labels()))  # str -> int

# 2. Vectorise with the English-fitted TF-IDF vocabulary (transform only, no fit).
#    The result is kept as a SciPy sparse matrix: calling .toarray() on the whole
#    corpus would allocate n_reviews x n_features dense float64 values at once
#    (NSMC is on the order of 200k reviews - verify), which can exhaust RAM.
X_nsmc_vec = vectorizer.transform(X_nsmc)

# 3. Score the corpus in fixed-size row chunks, densifying only the current chunk.
NSMC_BATCH_SIZE = 64

model.eval()
all_preds = []
all_labels = list(y_nsmc)  # row order is preserved, so labels need no batching

with torch.no_grad():
    for start in range(0, X_nsmc_vec.shape[0], NSMC_BATCH_SIZE):
        chunk = X_nsmc_vec[start:start + NSMC_BATCH_SIZE]
        X_batch = torch.from_numpy(chunk.astype(np.float32).toarray()).to(device)

        logits = model(X_batch)
        preds = (torch.sigmoid(logits) >= 0.5).float()

        # Flatten (batch, 1) predictions to a plain list of 0/1 ints.
        all_preds.extend(preds.squeeze(1).cpu().numpy().astype(int).tolist())

# 4. Accuracy and F1
acc = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print(f"NSMC Accuracy (영어로 학습된 모델 → 한국어 평가): {acc:.4f} | F1: {f1:.4f}")


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/



[nsmc] download ratings_train.txt: 14.6MB [00:00, 35.3MB/s]                            
[nsmc] download ratings_test.txt: 4.90MB [00:00, 22.9MB/s]                            


NSMC Accuracy (영어로 학습된 모델 → 한국어 평가): 0.5028 | F1: 0.0253


In [None]:
# Print the first five raw NSMC reviews, numbered from 1, to eyeball the text
# that was just vectorised.
for i, review in enumerate(X_nsmc[:5]):
    print(f"리뷰 {i+1}: {review}\n")

리뷰 1: 아 더빙.. 진짜 짜증나네요 목소리

리뷰 2: 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나

리뷰 3: 너무재밓었다그래서보는것을추천한다

리뷰 4: 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정

리뷰 5: 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다



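한국어 데이터셋에 대해 별도의 TF-IDF vectorizer를 새로 학습(fit)한 뒤, 영어로 학습된 분류기를 그대로 적용해 평가한다. (특성 차원 수는 같지만 각 차원이 가리키는 단어가 달라, 의미 있는 예측은 기대하기 어렵다.)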

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score
import torch

# 1. Load the NSMC corpus; labels are converted from str to int.
corpus = Korpora.load("nsmc")
X_nsmc = corpus.get_all_texts()
y_nsmc = list(map(int, corpus.get_all_labels()))

# 2. Fit a new TF-IDF vectorizer on the Korean texts and keep the result as a
#    sparse matrix (no toarray() here).
vectorizer_ko = TfidfVectorizer(max_features=10000)
X_nsmc_vec = vectorizer_ko.fit_transform(X_nsmc)  # stays sparse (scipy sparse matrix)

# 3. Custom Dataset: rows stay sparse in memory and are densified one at a time
class SparseDataset(Dataset):
    """Row-wise view over a sparse feature matrix and a parallel label sequence.

    Args:
        X_sparse: 2-D sparse matrix supporting row indexing and ``.toarray()``.
        y: Indexable sequence of labels, one per row of ``X_sparse``.

    Each item is ``(features, label)`` where ``features`` is a float32 tensor
    of shape ``(n_features,)`` and ``label`` is a float32 tensor of shape ``(1,)``.
    """

    def __init__(self, X_sparse, y):
        self.X_sparse, self.y = X_sparse, y

    def __len__(self):
        n_rows = self.X_sparse.shape[0]
        return n_rows

    def __getitem__(self, idx):
        dense_row = self.X_sparse[idx].toarray()  # (1, n_features) dense array
        features = torch.tensor(dense_row, dtype=torch.float32).squeeze(0)
        # Add a length-1 axis so batched labels line up with (batch, 1) logits.
        label = torch.tensor(self.y[idx], dtype=torch.float32).unsqueeze(0)
        return features, label

# 4. Wrap the sparse Korean features in the Dataset/DataLoader (order preserved).
nsmc_dataset = SparseDataset(X_nsmc_vec, y_nsmc)
nsmc_loader = DataLoader(nsmc_dataset, batch_size=64, shuffle=False)

# 5. Inference with the existing classifier.
# NOTE(review): `model` here is the classifier trained on features from the
# English-fitted `vectorizer`. The columns produced by `vectorizer_ko` index a
# different vocabulary, so column j does not mean the same word to the model -
# confirm this mismatch is the intended point of the experiment.
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in nsmc_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        logits = model(X_batch)
        # Threshold the sigmoid probability at 0.5 for hard 0/1 predictions.
        preds = (torch.sigmoid(logits) >= 0.5).float()

        all_preds.extend(preds.cpu().numpy().astype(int))
        all_labels.extend(y_batch.cpu().numpy().astype(int))

# 6. Report accuracy and F1.
acc = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)
print(f"NSMC Accuracy (한국어 TF-IDF + batch 희소→dense 변환): {acc:.4f} | F1: {f1:.4f}")



    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\wjl01\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\wjl

영어의 단어 중요도(빈도와 희귀도)를 기준으로 학습한 모델은 한국어의 단어 분포와 맞지 않아 분류 성능이 크게 떨어졌다. (영어 vectorizer를 그대로 적용한 NSMC 평가: Accuracy 0.5028, F1 0.0253 — 사실상 무작위 수준)

즉, 영어로 학습된 모델은 영어 단어들의 분포와 중요도(TF-IDF 벡터 공간)에 최적화되어 있고, 한국어 데이터셋은 전혀 다른 분포와 단어 체계를 갖고 있기 때문에, 동일한 모델 구조로는 일반화가 어렵다.

# Dense Embedding

## bert-base-multilingual-cased

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset
from Korpora import Korpora
import tensorflow_datasets as tfds
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load IMDB from TFDS (as_supervised=True yields (text, label) tuples).
train_ds, test_ds = tfds.load('imdb_reviews', split=['train', 'test'], as_supervised=True)

# 2. Convert a TFDS dataset to Python lists (bytes -> str, label -> int)
def tfds_to_list(dataset):
    """Materialise a supervised TFDS dataset as ``(texts, labels)`` lists.

    Every ``(text, label)`` element from ``tfds.as_numpy(dataset)`` is decoded
    from UTF-8 and cast to ``int`` respectively; iteration order is preserved.
    """
    decoded = [(raw.decode('utf-8'), int(lbl)) for raw, lbl in tfds.as_numpy(dataset)]
    if not decoded:
        return [], []
    texts, labels = map(list, zip(*decoded))
    return texts, labels

# IMDB train split for fitting the head; the test split serves as validation.
X_train_en, y_train_en = tfds_to_list(train_ds)
X_val_en, y_val_en = tfds_to_list(test_ds)

# 3. Load the multilingual BERT tokenizer and encoder. The encoder is put in
#    eval mode because it is used for inference only in this cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained("bert-base-multilingual-cased").to(device)
model.eval()

# 4. CLS-embedding extraction (batched)
def extract_cls_embeddings(texts, batch_size=64, max_length=None):
    """Embed texts with the module-level ``tokenizer``/``model`` pair.

    Args:
        texts: Sequence of strings to embed (a single string is treated as a
            one-element batch).
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Optional cap on tokens per text. ``None`` (the default)
            leaves the limit to the tokenizer, matching the previous behaviour.

    Returns:
        A CPU tensor with one row per input text, taken from position 0 of
        ``last_hidden_state`` (the [CLS] token for BERT). For empty input the
        result has shape ``(0, hidden_size)``.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)  # the tokenizer expects a list, not an array or generator

    # torch.cat raises on an empty list, so answer empty input explicitly.
    if not texts:
        return torch.empty((0, model.config.hidden_size))

    embeddings = []
    with torch.no_grad():  # inference only: no autograd graph is needed
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            enc = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            ).to(device)
            output = model(**enc).last_hidden_state[:, 0]  # first-token embedding
            # Move each batch off the device right away to bound GPU memory.
            embeddings.append(output.cpu())
    return torch.cat(embeddings)

# 5. Extract one embedding per review for both splits (runs the encoder over
#    every review, so this is the expensive step of the cell).
X_train_vec = extract_cls_embeddings(X_train_en)
X_val_vec = extract_cls_embeddings(X_val_en)

# 6. Build TensorDatasets and DataLoaders. Labels become float tensors with a
#    trailing axis of size 1. Note that `train_ds` is rebound here from the TFDS
#    dataset to a TensorDataset.
train_ds = TensorDataset(X_train_vec, torch.tensor(y_train_en).unsqueeze(1).float())
val_ds = TensorDataset(X_val_vec, torch.tensor(y_val_en).unsqueeze(1).float())

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.H5PIH7_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.H5PIH7_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.H5PIH7_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [None]:
# 7. Classifier head definition
class SentimentClassifier(nn.Module):
    """Small MLP head mapping a sentence embedding to one raw logit.

    Architecture: Linear(input_dim, 128) -> ReLU -> Dropout(0.2) -> Linear(128, 1).
    ``input_dim`` defaults to 768.
    """

    def __init__(self, input_dim=768):
        super().__init__()
        hidden_dim = 128
        # Built in forward order so parameter initialisation draws from the RNG
        # in the same sequence as a literal nn.Sequential(...) definition.
        stack = (
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(hidden_dim, 1),
        )
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        logits = self.model(x)
        return logits

# Despite the name, `bert_model` is only the MLP head; it is trained on the
# embeddings that were precomputed into train_loader / val_loader.
bert_model = SentimentClassifier(input_dim=768).to(device)

# 8. Loss and optimiser (the loss takes raw logits).
criterion_bert = nn.BCEWithLogitsLoss()
optimizer_bert = optim.Adam(bert_model.parameters(), lr=1e-3)

# 9. Training loop: each epoch trains on train_loader, then evaluates on
#    val_loader, printing loss / accuracy / F1 for both.
epochs = 30
for epoch in range(epochs):
    bert_model.train()  # dropout active
    total_loss = 0
    all_preds = []
    all_labels = []

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer_bert.zero_grad()
        logits = bert_model(X_batch)
        loss = criterion_bert(logits, y_batch)
        loss.backward()
        optimizer_bert.step()

        total_loss += loss.item()
        # Training metrics reuse the logits from the update step (dropout on).
        probs = torch.sigmoid(logits)
        preds = (probs >= 0.5).float()

        all_preds.extend(preds.cpu().numpy().astype(int))
        all_labels.extend(y_batch.cpu().numpy().astype(int))

    # Average of the per-batch losses (not weighted by batch size).
    avg_loss = total_loss / len(train_loader)
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"[Train] Epoch {epoch+1} - Loss: {avg_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f}")

    # Validation
    bert_model.eval()  # dropout off
    val_loss = 0
    val_preds = []
    val_labels = []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            logits = bert_model(X_batch)
            loss = criterion_bert(logits, y_batch)
            val_loss += loss.item()

            # Threshold the sigmoid probability at 0.5 for hard predictions.
            probs = torch.sigmoid(logits)
            preds = (probs >= 0.5).float()

            val_preds.extend(preds.cpu().numpy().astype(int))
            val_labels.extend(y_batch.cpu().numpy().astype(int))

    val_avg_loss = val_loss / len(val_loader)
    val_acc = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds)

    print(f"[Val]   Epoch {epoch+1} - Loss: {val_avg_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")

[Train] Epoch 1 - Loss: 0.5490 | Acc: 0.7200 | F1: 0.7163
[Val]   Epoch 1 - Loss: 0.5065 | Acc: 0.7528 | F1: 0.7632
[Train] Epoch 2 - Loss: 0.4939 | Acc: 0.7639 | F1: 0.7599
[Val]   Epoch 2 - Loss: 0.4873 | Acc: 0.7644 | F1: 0.7726
[Train] Epoch 3 - Loss: 0.4795 | Acc: 0.7697 | F1: 0.7658
[Val]   Epoch 3 - Loss: 0.4842 | Acc: 0.7670 | F1: 0.7448
[Train] Epoch 4 - Loss: 0.4706 | Acc: 0.7759 | F1: 0.7726
[Val]   Epoch 4 - Loss: 0.4718 | Acc: 0.7736 | F1: 0.7559
[Train] Epoch 5 - Loss: 0.4591 | Acc: 0.7848 | F1: 0.7809
[Val]   Epoch 5 - Loss: 0.5048 | Acc: 0.7506 | F1: 0.7793
[Train] Epoch 6 - Loss: 0.4541 | Acc: 0.7842 | F1: 0.7819
[Val]   Epoch 6 - Loss: 0.4582 | Acc: 0.7849 | F1: 0.7821
[Train] Epoch 7 - Loss: 0.4486 | Acc: 0.7905 | F1: 0.7878
[Val]   Epoch 7 - Loss: 0.4550 | Acc: 0.7861 | F1: 0.7791
[Train] Epoch 8 - Loss: 0.4401 | Acc: 0.7956 | F1: 0.7932
[Val]   Epoch 8 - Loss: 0.4530 | Acc: 0.7871 | F1: 0.7784
[Train] Epoch 9 - Loss: 0.4374 | Acc: 0.7971 | F1: 0.7945
[Val]   Epoch 

### 한국어 데이터로 test

In [None]:
import random
from torch.utils.data import Subset

# 1. Load the NSMC corpus; labels are converted from str to int.
corpus = Korpora.load("nsmc")
X_nsmc = corpus.get_all_texts()
y_nsmc = list(map(int, corpus.get_all_labels()))

# 2. Fix the seed so the same subset is drawn on every run.
random.seed(42)

# 3. Draw a 10% sample of row indices without replacement.
total_len = len(X_nsmc)
sample_size = int(total_len * 0.1)
all_indices = list(range(total_len))
sample_indices = random.sample(all_indices, sample_size)

# 4. Keep only the sampled texts and their labels (same index order for both).
X_nsmc_sampled = [X_nsmc[i] for i in sample_indices]
y_nsmc_sampled = [y_nsmc[i] for i in sample_indices]

# 5. Embed the sampled reviews with the multilingual encoder.
X_nsmc_vec = extract_cls_embeddings(X_nsmc_sampled)

# 6. Tensors and DataLoader. extract_cls_embeddings already returns a
#    torch.Tensor, so it is cast in place: wrapping it in torch.tensor(...)
#    again makes a redundant copy and emits a copy-construct UserWarning.
X_nsmc_tensor = X_nsmc_vec.to(dtype=torch.float32)
y_nsmc_tensor = torch.tensor(y_nsmc_sampled, dtype=torch.float32).unsqueeze(1)

nsmc_dataset = TensorDataset(X_nsmc_tensor, y_nsmc_tensor)
nsmc_loader = DataLoader(nsmc_dataset, batch_size=64)

# 7. Evaluate the English-trained head on the Korean embeddings.
bert_model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in nsmc_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        logits = bert_model(X_batch)
        probs = torch.sigmoid(logits)
        preds = (probs >= 0.5).float()  # hard 0/1 predictions at threshold 0.5

        all_preds.extend(preds.cpu().numpy().astype(int))
        all_labels.extend(y_batch.cpu().numpy().astype(int))

acc_nsmc = accuracy_score(all_labels, all_preds)
f1_nsmc = f1_score(all_labels, all_preds)
print(f"NSMC Accuracy (한국어 평가, 10% 샘플): {acc_nsmc:.4f} | F1: {f1_nsmc:.4f}")


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/



[nsmc] download ratings_train.txt: 14.6MB [00:31, 471kB/s]                              
[nsmc] download ratings_test.txt: 4.90MB [00:00, 9.28MB/s]                            
  X_nsmc_tensor = torch.tensor(X_nsmc_vec, dtype=torch.float32)


NSMC Accuracy (한국어 평가, 10% 샘플): 0.6163 | F1: 0.5730


## xlm-roberta-base

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset
from Korpora import Korpora
import tensorflow_datasets as tfds

# 1. Load IMDB from TFDS (as_supervised=True yields (text, label) tuples).
train_ds, test_ds = tfds.load('imdb_reviews', split=['train', 'test'], as_supervised=True)

# 2. Convert a TFDS dataset to Python lists (bytes -> str, label -> int)
def tfds_to_list(dataset):
    """Return ``(texts, labels)`` lists built from a supervised TFDS dataset.

    Elements of ``tfds.as_numpy(dataset)`` are unpacked as ``(text, label)``;
    text is UTF-8 decoded and label is cast to ``int``. Order is preserved.
    """
    records = [(raw.decode('utf-8'), int(lbl)) for raw, lbl in tfds.as_numpy(dataset)]
    texts = [text for text, _ in records]
    labels = [lbl for _, lbl in records]
    return texts, labels

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# IMDB train split for fitting the head; the test split serves as validation.
X_train_en, y_train_en = tfds_to_list(train_ds)
X_val_en, y_val_en = tfds_to_list(test_ds)

# 3. Load the XLM-RoBERTa tokenizer and encoder; the encoder is put in eval
#    mode because it is used for inference only in this cell.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModel.from_pretrained("xlm-roberta-base").to(device)
model.eval()

# 4. First-token embedding extraction (batched)
def extract_cls_embeddings(texts, batch_size=64, max_length=None):
    """Embed texts with the module-level ``tokenizer``/``model`` pair.

    Args:
        texts: Sequence of strings to embed (a single string is treated as a
            one-element batch).
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Optional cap on tokens per text. ``None`` (the default)
            leaves the limit to the tokenizer, matching the previous behaviour.

    Returns:
        A CPU tensor with one row per input text, taken from position 0 of
        ``last_hidden_state`` (the sequence-start token, used here in the
        [CLS] role). For empty input the result has shape ``(0, hidden_size)``.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)  # the tokenizer expects a list, not an array or generator

    # torch.cat raises on an empty list, so answer empty input explicitly.
    if not texts:
        return torch.empty((0, model.config.hidden_size))

    embeddings = []
    with torch.no_grad():  # inference only: no autograd graph is needed
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            enc = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            ).to(device)
            output = model(**enc).last_hidden_state[:, 0]  # first-token embedding
            # Move each batch off the device right away to bound GPU memory.
            embeddings.append(output.cpu())
    return torch.cat(embeddings)

# 5. Extract one embedding per review for both splits (runs the encoder over
#    every review, so this is the expensive step of the cell).
X_train_vec = extract_cls_embeddings(X_train_en)
X_val_vec = extract_cls_embeddings(X_val_en)

# 6. Build TensorDatasets and DataLoaders. Labels become float tensors with a
#    trailing axis of size 1. Note that `train_ds` is rebound here from the TFDS
#    dataset to a TensorDataset.
train_ds = TensorDataset(X_train_vec, torch.tensor(y_train_en).unsqueeze(1).float())
val_ds = TensorDataset(X_val_vec, torch.tensor(y_val_en).unsqueeze(1).float())

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.XXZ9Y3_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.XXZ9Y3_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.XXZ9Y3_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [None]:
# 7. Classifier head definition
class SentimentClassifier(nn.Module):
    """MLP head that turns a fixed-size embedding into a single raw logit.

    Layers, in order: Linear(input_dim, 128), ReLU, Dropout(0.2), Linear(128, 1).
    ``input_dim`` defaults to 768. No sigmoid is applied to the output.
    """

    def __init__(self, input_dim=768):
        super().__init__()
        width = 128
        # Modules are instantiated in forward order, keeping RNG consumption
        # during initialisation the same as an inline nn.Sequential(...) call.
        blocks = []
        blocks.append(nn.Linear(input_dim, width))
        blocks.append(nn.ReLU())
        blocks.append(nn.Dropout(0.2))
        blocks.append(nn.Linear(width, 1))
        self.model = nn.Sequential(*blocks)

    def forward(self, x):
        return self.model.forward(x) if False else self.model(x)

# `bert_model` is the MLP head only (the name is kept from the mBERT section);
# it is trained on the embeddings precomputed into train_loader / val_loader.
bert_model = SentimentClassifier(input_dim=768).to(device)

# 8. Loss and optimiser (the loss takes raw logits).
criterion_bert = nn.BCEWithLogitsLoss()
optimizer_bert = optim.Adam(bert_model.parameters(), lr=1e-3)

# 9. Training loop: each epoch trains on train_loader, then evaluates on
#    val_loader, printing loss / accuracy / F1 for both.
epochs = 30
for epoch in range(epochs):
    bert_model.train()  # dropout active
    total_loss = 0
    all_preds = []
    all_labels = []

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer_bert.zero_grad()
        logits = bert_model(X_batch)
        loss = criterion_bert(logits, y_batch)
        loss.backward()
        optimizer_bert.step()

        total_loss += loss.item()
        # Training metrics reuse the logits from the update step (dropout on).
        probs = torch.sigmoid(logits)
        preds = (probs >= 0.5).float()

        all_preds.extend(preds.cpu().numpy().astype(int))
        all_labels.extend(y_batch.cpu().numpy().astype(int))

    # Average of the per-batch losses (not weighted by batch size).
    avg_loss = total_loss / len(train_loader)
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"[Train] Epoch {epoch+1} - Loss: {avg_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f}")

    # Validation
    bert_model.eval()  # dropout off
    val_loss = 0
    val_preds = []
    val_labels = []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            logits = bert_model(X_batch)
            loss = criterion_bert(logits, y_batch)
            val_loss += loss.item()

            # Threshold the sigmoid probability at 0.5 for hard predictions.
            probs = torch.sigmoid(logits)
            preds = (probs >= 0.5).float()

            val_preds.extend(preds.cpu().numpy().astype(int))
            val_labels.extend(y_batch.cpu().numpy().astype(int))

    val_avg_loss = val_loss / len(val_loader)
    val_acc = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds)

    print(f"[Val]   Epoch {epoch+1} - Loss: {val_avg_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")

[Train] Epoch 1 - Loss: 0.5553 | Acc: 0.7346 | F1: 0.7308
[Val]   Epoch 1 - Loss: 0.4559 | Acc: 0.7846 | F1: 0.7456
[Train] Epoch 2 - Loss: 0.3895 | Acc: 0.8321 | F1: 0.8311
[Val]   Epoch 2 - Loss: 0.3683 | Acc: 0.8424 | F1: 0.8507
[Train] Epoch 3 - Loss: 0.3540 | Acc: 0.8494 | F1: 0.8483
[Val]   Epoch 3 - Loss: 0.3536 | Acc: 0.8448 | F1: 0.8314
[Train] Epoch 4 - Loss: 0.3346 | Acc: 0.8586 | F1: 0.8578
[Val]   Epoch 4 - Loss: 0.3299 | Acc: 0.8594 | F1: 0.8525
[Train] Epoch 5 - Loss: 0.3287 | Acc: 0.8618 | F1: 0.8608
[Val]   Epoch 5 - Loss: 0.3180 | Acc: 0.8662 | F1: 0.8664
[Train] Epoch 6 - Loss: 0.3262 | Acc: 0.8636 | F1: 0.8628
[Val]   Epoch 6 - Loss: 0.3130 | Acc: 0.8672 | F1: 0.8656
[Train] Epoch 7 - Loss: 0.3191 | Acc: 0.8672 | F1: 0.8667
[Val]   Epoch 7 - Loss: 0.3115 | Acc: 0.8680 | F1: 0.8646
[Train] Epoch 8 - Loss: 0.3128 | Acc: 0.8698 | F1: 0.8694
[Val]   Epoch 8 - Loss: 0.3065 | Acc: 0.8697 | F1: 0.8672
[Train] Epoch 9 - Loss: 0.3126 | Acc: 0.8688 | F1: 0.8680
[Val]   Epoch 

### 한국어 데이터로 test

In [None]:
import random
from torch.utils.data import Subset

# 1. Load the NSMC corpus; labels are converted from str to int.
corpus = Korpora.load("nsmc")
X_nsmc = corpus.get_all_texts()
y_nsmc = list(map(int, corpus.get_all_labels()))

# 2. Fix the seed so the same subset is drawn on every run.
random.seed(42)

# 3. Draw a 10% sample of row indices without replacement.
total_len = len(X_nsmc)
sample_size = int(total_len * 0.1)
all_indices = list(range(total_len))
sample_indices = random.sample(all_indices, sample_size)

# 4. Keep only the sampled texts and their labels (same index order for both).
X_nsmc_sampled = [X_nsmc[i] for i in sample_indices]
y_nsmc_sampled = [y_nsmc[i] for i in sample_indices]

# 5. Embed the sampled reviews with the XLM-RoBERTa encoder.
X_nsmc_vec = extract_cls_embeddings(X_nsmc_sampled)

# 6. Tensors and DataLoader. extract_cls_embeddings already returns a
#    torch.Tensor, so it is cast in place: wrapping it in torch.tensor(...)
#    again makes a redundant copy and emits a copy-construct UserWarning.
X_nsmc_tensor = X_nsmc_vec.to(dtype=torch.float32)
y_nsmc_tensor = torch.tensor(y_nsmc_sampled, dtype=torch.float32).unsqueeze(1)

nsmc_dataset = TensorDataset(X_nsmc_tensor, y_nsmc_tensor)
nsmc_loader = DataLoader(nsmc_dataset, batch_size=64)

# 7. Evaluate the English-trained head on the Korean embeddings.
bert_model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in nsmc_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        logits = bert_model(X_batch)
        probs = torch.sigmoid(logits)
        preds = (probs >= 0.5).float()  # hard 0/1 predictions at threshold 0.5

        all_preds.extend(preds.cpu().numpy().astype(int))
        all_labels.extend(y_batch.cpu().numpy().astype(int))

acc_nsmc = accuracy_score(all_labels, all_preds)
f1_nsmc = f1_score(all_labels, all_preds)
print(f"NSMC Accuracy (한국어 평가, 10% 샘플): {acc_nsmc:.4f} | F1: {f1_nsmc:.4f}")


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/



[nsmc] download ratings_train.txt: 14.6MB [00:00, 64.2MB/s]                            
[nsmc] download ratings_test.txt: 4.90MB [00:00, 36.2MB/s]                           
  X_nsmc_tensor = torch.tensor(X_nsmc_vec, dtype=torch.float32)


NSMC Accuracy (한국어 평가, 10% 샘플): 0.7398 | F1: 0.6978


# Result (Accuracy, F1, AUROC)

## Multi lingual direct

In [None]:
import tensorflow_datasets as tfds
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tqdm import tqdm  # ✅ tqdm import

# 1. Device selection: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Load the IMDB training split from TFDS and decode it into Python lists
#    (UTF-8 text, int label).
ds_train = tfds.load('imdb_reviews', split='train', as_supervised=True)
X_train_imdb, y_train_imdb = [], []
for text, label in tfds.as_numpy(ds_train):
    X_train_imdb.append(text.decode('utf-8'))
    y_train_imdb.append(int(label))

print(f"Loaded IMDB train samples: {len(X_train_imdb)}")

# 3. Load the validation set from a CSV with 'text' and 'label' columns.
# NOTE(review): the path is an absolute Colab location; the file must be
# uploaded there before this cell runs.
df_valid = pd.read_csv("/content/Valid_target.csv")
X_valid = df_valid['text'].tolist()
y_valid = df_valid['label'].tolist()
print(f"Loaded validation samples: {len(X_valid)}")

# 4. Tokenizer and pretrained encoder. Every encoder parameter is frozen, so
#    no gradients are computed for it.
MODEL_NAME = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert = AutoModel.from_pretrained(MODEL_NAME)
for param in bert.parameters():
    param.requires_grad = False  # the encoder is not fine-tuned

# 5. Dataset definition: every text is padded/truncated to this many tokens.
MAX_LEN = 128

class TextDataset(Dataset):
    """Text classification dataset that tokenises all of its texts up front.

    The constructor runs the module-level ``tokenizer`` over ``texts`` once,
    truncating and padding every sequence to ``MAX_LEN``. Indexing returns a
    dict holding one tensor per tokenizer output field plus a float ``labels``
    tensor (float because the loss used with it is a BCE loss).
    """

    def __init__(self, texts, labels):
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
        )

    def __len__(self):
        # One label per example, so the label count is the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)  # BCE loss target
        return item

# Tokenise both splits up front: IMDB reviews for training, the CSV-loaded set for validation.
train_dataset = TextDataset(X_train_imdb, y_train_imdb)
valid_dataset = TextDataset(X_valid, y_valid)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64)

# 6. Classifier definition (frozen encoder + trainable classifier head)
class SentimentClassifier(nn.Module):
    """Binary sentiment classifier: a frozen encoder followed by a small MLP head.

    The encoder is used purely as a feature extractor (its forward pass runs
    under ``torch.no_grad()``); only ``self.classifier`` is meant to be trained.

    Args:
        bert: encoder module exposing ``config.hidden_size`` and returning an
            object with a ``last_hidden_state`` attribute.
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.classifier = nn.Sequential(
            nn.Linear(bert.config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 1)
        )

    def train(self, mode=True):
        """Switch the head between train/eval mode while pinning the encoder to eval.

        ``nn.Module.train()`` recurses into every submodule, which would turn
        the frozen encoder's dropout back on and feed the head randomly
        perturbed features on every step. The encoder is never updated, so it
        is kept in eval mode regardless of ``mode``.
        """
        super().train(mode)
        self.bert.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return one raw logit per example (shape ``(batch,)``)."""
        with torch.no_grad():  # the encoder is not fine-tuned
            output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            # Hidden state at sequence position 0 serves as the sentence representation.
            cls = output.last_hidden_state[:, 0]
        return self.classifier(cls).squeeze(1)

# Wrap the encoder in the classifier and move the whole module to the selected device.
model = SentimentClassifier(bert).to(device)

# 7. Loss function and optimizer
# BCEWithLogitsLoss consumes raw logits; only the classifier head's parameters are handed to Adam.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=2e-4)

# 8. Evaluation function
def evaluate(model, loader):
    """Score ``model`` on every batch of ``loader``.

    Args:
        model: callable as ``model(input_ids, attention_mask)`` returning one
            raw logit per example.
        loader: iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        ``(accuracy, f1, auroc)``. Predictions threshold the sigmoid
        probability at 0.5. ``auroc`` is ``nan`` when ``roc_auc_score`` rejects
        the inputs with a ``ValueError``.
    """
    model.eval()
    all_logits, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask)
            all_logits.extend(logits.cpu().tolist())
            # Labels are stored as floats for the BCE loss; the metrics get integer
            # classes. They never need to visit the device.
            all_labels.extend(int(label) for label in batch['labels'].tolist())

    probs = torch.sigmoid(torch.tensor(all_logits)).numpy()
    preds = (probs >= 0.5).astype(int)

    acc = accuracy_score(all_labels, preds)
    f1 = f1_score(all_labels, preds)
    try:
        auroc = roc_auc_score(all_labels, probs)
    except ValueError:
        # Catch only the metric's own rejection; a bare except would also hide
        # KeyboardInterrupt and genuine programming errors.
        auroc = float('nan')
    return acc, f1, auroc

# 9. Training loop: one pass over the training loader per epoch, then validation.
EPOCHS = 5
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0

    print(f"\nEpoch {epoch}/{EPOCHS}")
    train_iter = tqdm(train_loader, desc="Training", leave=False)
    for batch in train_iter:
        # Move the three tensors the step needs onto the training device.
        input_ids, attention_mask, labels = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask), labels)
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running total and the progress bar.
        step_loss = loss.item()
        total_loss += step_loss
        train_iter.set_postfix(loss=step_loss)

    avg_loss = total_loss / len(train_loader)
    acc, f1, auroc = evaluate(model, valid_loader)
    print(f"[Epoch {epoch}] Loss: {avg_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | AUROC: {auroc:.4f}")

Loaded IMDB train samples: 25000
Loaded validation samples: 5000


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]


Epoch 1/5




[Epoch 1] Loss: 0.6724 | Acc: 0.6484 | F1: 0.7343 | AUROC: 0.8699

Epoch 2/5




[Epoch 2] Loss: 0.6213 | Acc: 0.7012 | F1: 0.7624 | AUROC: 0.8852

Epoch 3/5




[Epoch 3] Loss: 0.5740 | Acc: 0.7712 | F1: 0.8004 | AUROC: 0.8903

Epoch 4/5




[Epoch 4] Loss: 0.5421 | Acc: 0.7948 | F1: 0.8147 | AUROC: 0.8949

Epoch 5/5


                                                           

[Epoch 5] Loss: 0.5218 | Acc: 0.7626 | F1: 0.7981 | AUROC: 0.8979




## Translation direct

In [None]:
class ClfDataset(Dataset):
    """Dataset of ``(text, label)`` pairs backed by two DataFrame columns.

    Args:
        df: DataFrame containing the text column ``col`` and a ``'label'`` column.
        col: name of the text column to read (default ``'en'``).
    """

    def __init__(self, df, col='en'):
        self.texts = df[col]
        self.labels = df['label']

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # A DataLoader passes positions 0..len-1. Plain ``series[idx]`` looks up
        # by index *label*, which raises or returns the wrong row once the frame
        # has been filtered, sampled or concatenated without an index reset.
        # ``.iloc`` is always positional.
        return self.texts.iloc[idx], self.labels.iloc[idx]

def collate_fn_clf(batch):
    """Collate ``(text, label)`` pairs into a list of texts and a long label tensor.

    Texts are left as raw strings (they are not tokenised here); labels are
    stacked into a single ``torch.long`` tensor.
    """
    text_column, label_column = map(list, zip(*batch))
    label_tensor = torch.tensor(label_column, dtype=torch.long)
    return text_column, label_tensor

# Train on the 'ko' column of imdb_df and validate on the 'text' column of naver.
# NOTE(review): imdb_df and naver are not created in this cell; they must already
# exist in the kernel — confirm which earlier cell defines them.
train_dataset = ClfDataset(imdb_df, col='ko')
val_dataset = ClfDataset(naver, col='text')
# Batches carry raw strings plus a label tensor (see collate_fn_clf); only training shuffles.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn_clf)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn_clf)

class DirectClassifier(nn.Module):
    """Two-class sentiment classifier that takes raw strings as input.

    The module owns both the ``klue/bert-base`` encoder and its tokenizer, so
    ``forward`` accepts a list of texts, tokenises them, and returns logits of
    shape ``(batch, 2)`` from a three-layer MLP applied to the hidden state at
    sequence position 0.

    Args:
        hidden_dim: width of the two hidden layers of the MLP head.
    """

    def __init__(self,hidden_dim=128):
        super().__init__()
        checkpoint = "klue/bert-base"
        self.encoder = AutoModel.from_pretrained(checkpoint)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        encoder_dim = self.encoder.config.hidden_size
        head_layers = [
            nn.Linear(encoder_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, texts):
        encoded = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        # Tokenizer output is moved to whichever device the module's weights are on.
        target_device = next(self.parameters()).device
        encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}
        hidden_states = self.encoder(**encoded).last_hidden_state
        first_token = hidden_states[:, 0]  # CLS token
        return self.classifier(first_token)

import os

num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DirectClassifier().to(device)
criterion = nn.CrossEntropyLoss()
# NOTE(review): 1e-4 is a high learning rate for updating every encoder weight;
# the other fine-tuning cells in this notebook use 2e-5. Left unchanged here so
# the experiment definition stays the same — consider lowering it.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
save_path = "models/classifier_direct.pth"
# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
best_val_acc = 0.0
best_model_state = None

for epoch in range(num_epochs):
    # ---- training pass ----
    total_loss, correct, total = 0, 0, 0
    model.train()
    for texts, labels in tqdm(train_loader, desc=f"Epoch {epoch+1} - Train"):
        labels = labels.to(device)
        logits = model(texts)
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        pred = logits.argmax(dim=1)
        correct += (pred == labels).sum().item()
        total += labels.size(0)

    train_acc = correct / total
    # Report the mean per-batch loss so train and validation numbers are
    # comparable even though the two loaders have different batch counts.
    train_loss = total_loss / len(train_loader)
    print(f"[Train] Epoch {epoch+1}, Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")

    # ---- validation pass ----
    model.eval()
    val_correct, val_total = 0, 0
    val_loss = 0

    with torch.no_grad():
        for texts, labels in tqdm(val_loader, desc=f"Epoch {epoch+1} - Valid"):
            labels = labels.to(device)
            logits = model(texts)
            loss = criterion(logits, labels)
            val_loss += loss.item()

            pred = logits.argmax(dim=1)
            val_correct += (pred == labels).sum().item()
            val_total += labels.size(0)

    val_acc = val_correct / val_total
    val_loss = val_loss / len(val_loader)
    print(f"[Valid] Epoch {epoch+1}, Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # state_dict() hands back references to the live parameter tensors, which
        # later optimizer steps keep mutating. Clone them so this really is a
        # snapshot of the best epoch rather than an alias of the current weights.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
        torch.save(best_model_state, save_path)
        print(f">> Saved new best model with val acc {best_val_acc:.4f}")

# Restore the best checkpoint; skipped if no epoch ever beat the initial 0.0.
if best_model_state is not None:
    model.load_state_dict(best_model_state)

Epoch 1 - Train: 100%|██████████| 611/611 [05:20<00:00,  1.91it/s]


[Train] Epoch 1, Loss: 199.0415, Acc: 0.8613


Epoch 1 - Valid: 100%|██████████| 313/313 [00:05<00:00, 62.03it/s]


[Valid] Epoch 1, Loss: 107.9800, Acc: 0.8474
>> Saved new best model with val acc 0.8474


Epoch 2 - Train: 100%|██████████| 611/611 [05:21<00:00,  1.90it/s]


[Train] Epoch 2, Loss: 113.9854, Acc: 0.9313


Epoch 2 - Valid: 100%|██████████| 313/313 [00:05<00:00, 61.92it/s]


[Valid] Epoch 2, Loss: 121.7105, Acc: 0.8282


Epoch 3 - Train: 100%|██████████| 611/611 [05:19<00:00,  1.91it/s]


[Train] Epoch 3, Loss: 72.4248, Acc: 0.9589


Epoch 3 - Valid: 100%|██████████| 313/313 [00:05<00:00, 61.82it/s]


[Valid] Epoch 3, Loss: 150.2647, Acc: 0.8122


Epoch 4 - Train: 100%|██████████| 611/611 [05:21<00:00,  1.90it/s]


[Train] Epoch 4, Loss: 47.3205, Acc: 0.9750


Epoch 4 - Valid: 100%|██████████| 313/313 [00:05<00:00, 61.87it/s]


[Valid] Epoch 4, Loss: 209.8513, Acc: 0.8010


Epoch 5 - Train: 100%|██████████| 611/611 [05:20<00:00,  1.91it/s]


[Train] Epoch 5, Loss: 40.1986, Acc: 0.9802


Epoch 5 - Valid: 100%|██████████| 313/313 [00:05<00:00, 61.87it/s]


[Valid] Epoch 5, Loss: 170.4178, Acc: 0.8258


Epoch 6 - Train: 100%|██████████| 611/611 [05:20<00:00,  1.90it/s]


[Train] Epoch 6, Loss: 67.6818, Acc: 0.9571


Epoch 6 - Valid: 100%|██████████| 313/313 [00:05<00:00, 61.90it/s]


[Valid] Epoch 6, Loss: 224.9315, Acc: 0.6226


Epoch 7 - Train: 100%|██████████| 611/611 [05:20<00:00,  1.91it/s]


[Train] Epoch 7, Loss: 424.1522, Acc: 0.5175


Epoch 7 - Valid: 100%|██████████| 313/313 [00:05<00:00, 61.78it/s]


[Valid] Epoch 7, Loss: 219.0720, Acc: 0.4998


Epoch 8 - Train:  18%|█▊        | 111/611 [00:59<04:27,  1.87it/s]


KeyboardInterrupt: 

## Rich-resource case

In [None]:
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import random
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 2. Load the data
train_df = pd.read_csv("Train_target.csv")
valid_df = pd.read_csv("Valid_target.csv")
print(f"Original Train samples: {len(train_df)}, Valid samples: {len(valid_df)}")
# 3. Fix the random seeds and sample 20,000 rows from the training data
random.seed(42)
np.random.seed(42)
# PyTorch has its own generator: without this the classifier-head
# initialisation, dropout masks and DataLoader shuffle order are unseeded even
# though the Python and NumPy seeds are fixed.
torch.manual_seed(42)
if len(train_df) > 20000:
    train_df = train_df.sample(n=20000, random_state=42).reset_index(drop=True)
print(f"Sampled Train samples: {len(train_df)}")
# 4. Initialise the tokenizer and the pretrained encoder (klue/bert-base)
MODEL_NAME = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME)
# 5. Dataset class definition: sequence length used for truncation and padding
MAX_LEN = 128
class TextDataset(Dataset):
    """Pre-tokenised dataset built from objects exposing ``.tolist()`` (e.g. DataFrame columns).

    Both inputs are converted to plain lists. The texts are tokenised once in
    the constructor with the module-level ``tokenizer``, truncated and padded to
    ``MAX_LEN``. Each item is a dict of tensors: one per tokenizer output field
    plus a float ``labels`` entry.
    """

    def __init__(self, texts, labels):
        raw_texts = texts.tolist()
        self.encodings = tokenizer(
            raw_texts,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
        )
        self.labels = labels.tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = dict(
            (field, torch.tensor(values[idx]))
            for field, values in self.encodings.items()
        )
        # Float target, as required by BCEWithLogitsLoss.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
# Tokenise the 'text' column of each frame and pair it with the 'label' column.
train_dataset = TextDataset(train_df['text'], train_df['label'])
valid_dataset = TextDataset(valid_df['text'], valid_df['label'])
# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64)
# 6. Classifier definition (encoder CLS output -> classifier head)
class SentimentClassifier(nn.Module):
    """Encoder plus a dropout-regularised MLP head producing one logit per example.

    Unlike a frozen-encoder setup, the encoder forward pass here runs with
    gradients enabled, so the encoder is trained together with the head
    whenever its parameters are given to the optimizer.

    Args:
        bert_model: encoder exposing ``config.hidden_size`` and returning an
            object with ``last_hidden_state``.
        dropout: dropout probability used before each linear layer of the head.
    """

    def __init__(self, bert_model, dropout=0.2):
        super().__init__()
        self.bert = bert_model
        head_layers = [
            nn.Dropout(dropout),
            nn.Linear(self.bert.config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        hidden_states = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        # Position 0 holds the CLS token; drop the trailing singleton dim of the logit.
        return self.classifier(hidden_states[:, 0]).squeeze(1)
# Wrap the encoder in the classifier and move the whole module to the selected device.
model = SentimentClassifier(bert_model).to(device)
# 7. Loss function and optimizer
# BCEWithLogitsLoss consumes raw logits; AdamW receives every parameter of the
# model, i.e. the encoder is fine-tuned together with the head.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
# 8. Evaluation function
def evaluate(model, loader):
    """Score ``model`` on every batch of ``loader``.

    Args:
        model: callable as ``model(input_ids, attention_mask)`` returning one
            raw logit per example.
        loader: iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        ``(accuracy, f1, auroc)``. Predictions threshold the sigmoid
        probability at 0.5. ``auroc`` is ``nan`` when ``roc_auc_score`` rejects
        the inputs with a ``ValueError``.
    """
    model.eval()
    all_logits, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            logits = model(input_ids, attention_mask)
            all_logits.extend(logits.cpu().tolist())
            # Labels are stored as floats for the BCE loss; the metrics get integer
            # classes. They never need to visit the device.
            all_labels.extend(int(label) for label in batch['labels'].tolist())
    probs = torch.sigmoid(torch.tensor(all_logits)).numpy()
    preds = (probs >= 0.5).astype(int)
    acc = accuracy_score(all_labels, preds)
    f1 = f1_score(all_labels, preds)
    try:
        auroc = roc_auc_score(all_labels, probs)
    except ValueError:
        # Catch only the metric's own rejection; a bare except would also hide
        # KeyboardInterrupt and genuine programming errors.
        auroc = float('nan')
    return acc, f1, auroc
# 9. Training loop: fine-tune for a fixed number of epochs, validating after each one.
EPOCHS = 5
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        # Move the three tensors the step needs onto the training device.
        input_ids, attention_mask, labels = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask', 'labels')
        )
        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask), labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean training loss per batch for this epoch.
    avg_loss = total_loss / len(train_loader)
    acc, f1, auroc = evaluate(model, valid_loader)
    print(f"[Epoch {epoch}] Loss: {avg_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | AUROC: {auroc:.4f}")








Original Train samples: 180000, Valid samples: 5000
Sampled Train samples: 20000


100%|██████████| 625/625 [06:29<00:00,  1.60it/s]


[Epoch 1] Loss: 0.2344 | Acc: 0.9316 | F1: 0.9327 | AUROC: 0.9771


100%|██████████| 625/625 [06:29<00:00,  1.60it/s]


[Epoch 2] Loss: 0.1571 | Acc: 0.9302 | F1: 0.9322 | AUROC: 0.9769


  3%|▎         | 16/625 [00:10<06:43,  1.51it/s]


KeyboardInterrupt: 

## Low-resource case

In [None]:
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import random
import numpy as np

# 1. Device selection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Load the data
train_df = pd.read_csv("/content/Train_target.csv")
valid_df = pd.read_csv("/content/Valid_target.csv")

print(f"Original Train samples: {len(train_df)}, Valid samples: {len(valid_df)}")

# 3. Fix the random seeds and sample 2,000 rows from the training data
random.seed(42)
np.random.seed(42)
# PyTorch has its own generator: without this the classifier-head
# initialisation, dropout masks and DataLoader shuffle order are unseeded even
# though the Python and NumPy seeds are fixed.
torch.manual_seed(42)

if len(train_df) > 2000:
    train_df = train_df.sample(n=2000, random_state=42).reset_index(drop=True)
print(f"Sampled Train samples: {len(train_df)}")

# 4. Initialise the tokenizer and the pretrained encoder (klue/bert-base)
MODEL_NAME = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME)

# 5. Dataset class definition: sequence length used for truncation and padding
MAX_LEN = 128
class TextDataset(Dataset):
    """Dataset of pre-tokenised texts with float labels.

    ``texts`` and ``labels`` must expose ``.tolist()``. The texts are tokenised
    a single time in the constructor using the module-level ``tokenizer``, with
    truncation and padding to ``MAX_LEN``; ``__getitem__`` then only converts
    the stored values for one example to tensors.
    """

    def __init__(self, texts, labels):
        text_list = texts.tolist()
        label_list = labels.tolist()
        self.encodings = tokenizer(text_list, truncation=True, padding='max_length', max_length=MAX_LEN)
        self.labels = label_list

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        # Float target, as required by BCEWithLogitsLoss.
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        item['labels'] = target
        return item

# Tokenise the 'text' column of each frame and pair it with the 'label' column.
train_dataset = TextDataset(train_df['text'], train_df['label'])
valid_dataset = TextDataset(valid_df['text'], valid_df['label'])

# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64)

# 6. Classifier definition (encoder CLS output -> classifier head)
class SentimentClassifier(nn.Module):
    """Sentiment model: encoder followed by a two-layer head that emits a single logit.

    The encoder call is not wrapped in ``no_grad``, so gradients flow into it
    and it is updated along with the head when its parameters are optimised.

    Args:
        bert_model: encoder exposing ``config.hidden_size`` and returning an
            object with ``last_hidden_state``.
        dropout: dropout probability applied before both linear layers.
    """

    def __init__(self, bert_model, dropout=0.2):
        super().__init__()
        self.bert = bert_model
        encoder_width = self.bert.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(encoder_width, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
        )

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sentence_vector = encoded.last_hidden_state[:, 0]  # CLS token
        # The head yields shape (batch, 1); squeeze to (batch,) to match the label tensor.
        return self.classifier(sentence_vector).squeeze(1)

# Wrap the encoder in the classifier and move the whole module to the selected device.
model = SentimentClassifier(bert_model).to(device)

# 7. Loss function and optimizer
# BCEWithLogitsLoss consumes raw logits; AdamW receives every parameter of the
# model, i.e. the encoder is fine-tuned together with the head.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# 8. Evaluation function
def evaluate(model, loader):
    """Score ``model`` on every batch of ``loader``.

    Args:
        model: callable as ``model(input_ids, attention_mask)`` returning one
            raw logit per example.
        loader: iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        ``(accuracy, f1, auroc)``. Predictions threshold the sigmoid
        probability at 0.5. ``auroc`` is ``nan`` when ``roc_auc_score`` rejects
        the inputs with a ``ValueError``.
    """
    model.eval()
    all_logits, all_labels = [], []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask)
            all_logits.extend(logits.cpu().tolist())
            # Labels are stored as floats for the BCE loss; the metrics get integer
            # classes. They never need to visit the device.
            all_labels.extend(int(label) for label in batch['labels'].tolist())

    probs = torch.sigmoid(torch.tensor(all_logits)).numpy()
    preds = (probs >= 0.5).astype(int)

    acc = accuracy_score(all_labels, preds)
    f1 = f1_score(all_labels, preds)
    try:
        auroc = roc_auc_score(all_labels, probs)
    except ValueError:
        # Catch only the metric's own rejection; a bare except would also hide
        # KeyboardInterrupt and genuine programming errors.
        auroc = float('nan')
    return acc, f1, auroc

# 9. Training loop: fine-tune for a fixed number of epochs, validating after each one.
EPOCHS = 5
for epoch in range(1, EPOCHS + 1):
    model.train()
    batch_losses = []

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask), labels)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Mean training loss per batch, accumulated in the order the batches were seen.
    total_loss = sum(batch_losses)
    avg_loss = total_loss / len(train_loader)
    acc, f1, auroc = evaluate(model, valid_loader)
    print(f"[Epoch {epoch}] Loss: {avg_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | AUROC: {auroc:.4f}")

Original Train samples: 180000, Valid samples: 5000
Sampled Train samples: 2000
[Epoch 1] Loss: 0.3728 | Acc: 0.9228 | F1: 0.9227 | AUROC: 0.9663
[Epoch 2] Loss: 0.1883 | Acc: 0.9156 | F1: 0.9172 | AUROC: 0.9679
[Epoch 3] Loss: 0.1197 | Acc: 0.9240 | F1: 0.9239 | AUROC: 0.9684
[Epoch 4] Loss: 0.0932 | Acc: 0.9218 | F1: 0.9224 | AUROC: 0.9663
[Epoch 5] Loss: 0.0731 | Acc: 0.9180 | F1: 0.9178 | AUROC: 0.9668
