In [1]:
import torch
from torch import nn

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import f1_score

from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm
2025-04-01 17:30:54.430805: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-01 17:30:56.254887: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-01 17:30:56.590499: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-12.4/lib64:
2025-04-01 17:30

In [2]:
# Read the raw competition splits. UTF-8 is spelled out for both files; it is
# also read_csv's default decoding, so the two reads behave the same way.
test = pd.read_csv('data/test.csv', encoding='utf8')
train = pd.read_csv('data/train.csv', encoding='utf8')

In [3]:
# One encoder per categorical column. Categories that were not seen during
# fit are encoded as an all-zero row (handle_unknown='ignore').
keyword_ohe, location_ohe = (
    OneHotEncoder(sparse_output=False, handle_unknown='ignore') for _ in range(2)
)

# --- Train split: fit the encoders, then add hand-crafted text features ---
# Missing keyword/location values are replaced by the literal "unknown" first.
train_keyword_ohe = keyword_ohe.fit_transform(train[['keyword']].fillna(value="unknown"))
train_location_ohe = location_ohe.fit_transform(train[['location']].fillna(value="unknown"))
train['has_url'] = train['text'].map(lambda tweet: int('http' in tweet))
train['char_count'] = train['text'].map(len)
train['word_count'] = train['text'].map(lambda tweet: len(tweet.split()))
train_feats = train[['char_count', 'word_count', 'has_url']].to_numpy()

# --- Test split: reuse the encoders fitted on train (transform only) ---
test_keyword_ohe = keyword_ohe.transform(test[['keyword']].fillna(value="unknown"))
test_ohe_location = location_ohe.transform(test[['location']].fillna(value="unknown"))
test['char_count'] = test['text'].map(len)
test['word_count'] = test['text'].map(lambda tweet: len(tweet.split()))
test['has_url'] = test['text'].map(lambda tweet: int('http' in tweet))
test_feats = test[['char_count', 'word_count', 'has_url']].to_numpy()

In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score
from tqdm import tqdm

# ✅ 1. Load the data
# Both CSVs are decoded with 'utf-8-sig'.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8-sig')
    for csv_path in ('data/train.csv', 'data/test.csv')
)

# ✅ 2. Feature engineering
def generate_features(df):
    """Build simple hand-crafted numeric features from the ``text`` column.

    Args:
        df: DataFrame with a ``text`` column. The frame is not modified.
            Missing entries (NaN/None) are treated as empty strings instead
            of raising, so they yield ``[0, 0, 0]``.

    Returns:
        Integer array of shape ``(len(df), 3)`` whose columns are, in order:
        ``char_count`` (number of characters), ``word_count`` (number of
        whitespace-separated tokens) and ``has_url`` (1 if the substring
        ``'http'`` occurs in the text, else 0).
    """
    # Normalise once up front so every feature sees a real string.
    text = df['text'].fillna('').astype(str)

    feats = pd.DataFrame(
        {
            'char_count': text.str.len().astype('int64'),
            'word_count': text.str.split().str.len().astype('int64'),
            # regex=False: plain substring test, same as `'http' in x`.
            'has_url': text.str.contains('http', regex=False).astype('int64'),
        },
        index=df.index,
    )
    return feats.to_numpy()

# ✅ Split first, then build the features (the order matters): anything that is
# estimated from data below is computed on the training split only.
X = train['text']
y = train['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# ✅ Build the hand-crafted features for each split
train_feats = generate_features(X_train.to_frame(name='text'))
val_feats = generate_features(X_val.to_frame(name='text'))
test_feats = generate_features(test[['text']])

# Standardize the features with statistics of the TRAIN split only (no leakage
# from validation/test). The raw character/word counts are orders of magnitude
# larger than the BERT [CLS] activations they are concatenated with, which
# would let them dominate the linear head at the start of fine-tuning.
feat_mean = train_feats.mean(axis=0)
feat_std = train_feats.std(axis=0)
feat_std[feat_std == 0] = 1.0  # constant column: avoid division by zero
train_feats = (train_feats - feat_mean) / feat_std
val_feats = (val_feats - feat_mean) / feat_std
test_feats = (test_feats - feat_mean) / feat_std

# ✅ 3. Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# ✅ 4. Dataset definition
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: pandas Series of raw strings (its index is reset internally).
        labels: Optional pandas Series of integer class labels aligned with
            ``texts``; omit for unlabeled (test) data.
        extra_feats: Optional indexable of per-example numeric feature rows
            aligned with ``texts`` (e.g. a 2-D array).
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=max_len, return_tensors='pt')`` and
            returning a mapping of tensors with a leading batch axis of 1.
            Required despite the ``None`` default.
        max_len: Sequence length every example is padded/truncated to.

    Raises:
        ValueError: If ``tokenizer`` is missing, or if ``labels`` /
            ``extra_feats`` do not have the same length as ``texts``.
    """

    def __init__(self, texts, labels=None, extra_feats=None, tokenizer=None, max_len=128):
        # Fail at construction time rather than with an opaque
        # "'NoneType' object is not callable" inside a DataLoader worker.
        if tokenizer is None:
            raise ValueError("TweetDataset requires a tokenizer")

        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True) if labels is not None else None
        self.extra_feats = extra_feats
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Misaligned inputs would otherwise pair texts with the wrong
        # label/feature row, or only fail part-way through an epoch.
        n_texts = len(self.texts)
        if self.labels is not None and len(self.labels) != n_texts:
            raise ValueError(
                f"labels has {len(self.labels)} rows but texts has {n_texts}"
            )
        if self.extra_feats is not None and len(self.extra_feats) != n_texts:
            raise ValueError(
                f"extra_feats has {len(self.extra_feats)} rows but texts has {n_texts}"
            )

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``.

        The dict holds the tokenizer outputs with the batch axis squeezed
        away, plus ``extra_feats`` (float) and ``labels`` (long) when those
        were supplied to the constructor.
        """
        if torch.is_tensor(idx):
            idx = idx.item()

        encoded = self.tokenizer(
            self.texts.iloc[idx],  # positional access, so negative indices work too
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        # The tokenizer returns (1, max_len) tensors; drop the batch axis so the
        # DataLoader can stack items into (batch, max_len).
        item = {key: val.squeeze(0) for key, val in encoded.items()}

        if self.extra_feats is not None:
            item['extra_feats'] = torch.tensor(self.extra_feats[idx], dtype=torch.float)

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)

        return item

# ✅ 5. Model definition
class CustomBERTClassifier(nn.Module):
    """BERT encoder plus a linear head over ``[CLS]`` and extra features.

    The hidden state at sequence position 0 is concatenated with the
    per-example extra features and passed through a single linear layer that
    produces two logits.

    Args:
        hidden_size: Width of the encoder's hidden states. Must equal the
            loaded checkpoint's ``config.hidden_size``.
        extra_feat_dim: Number of extra features per example.
        model_name: Checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to the previously hard-coded ``"bert-base-uncased"``.

    Raises:
        ValueError: If ``hidden_size`` does not match the loaded encoder.
    """

    def __init__(self, hidden_size=768, extra_feat_dim=3, model_name="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)

        # Catch a mismatched head here instead of as a shape error on the
        # first forward pass.
        encoder_width = self.bert.config.hidden_size
        if hidden_size != encoder_width:
            raise ValueError(
                f"hidden_size={hidden_size} does not match the hidden size of "
                f"'{model_name}' ({encoder_width})"
            )

        self.classifier = nn.Linear(hidden_size + extra_feat_dim, 2)

    def forward(self, input_ids, attention_mask, extra_feats, **kwargs):
        """Return logits with 2 columns, one row per example.

        Any additional keyword arguments (for instance other tensors emitted
        by the tokenizer) are forwarded unchanged to the BERT encoder.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        cls_output = outputs.last_hidden_state[:, 0, :]  # hidden state of the first ([CLS]) token
        combined = torch.cat((cls_output, extra_feats), dim=1)
        logits = self.classifier(combined)
        return logits

# ✅ 6. Build the datasets and data loaders
# Seed torch before anything random happens (classifier-head initialization,
# dropout, shuffled batch order) so repeated runs start from the same state;
# 42 matches the random_state used for the train/validation split. Some GPU
# kernels may still be non-deterministic.
torch.manual_seed(42)

train_dataset = TweetDataset(X_train, y_train, train_feats, tokenizer)
val_dataset = TweetDataset(X_val, y_val, val_feats, tokenizer)
test_dataset = TweetDataset(test['text'], labels=None, extra_feats=test_feats, tokenizer=tokenizer)

# Only the training loader shuffles; validation/test keep the row order so the
# predictions line up with the original rows.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# ✅ 7. Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomBERTClassifier().to(device)
# AdamW is the torch.optim implementation imported in the notebook's first cell.
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# ✅ 8. Training loop: four passes over the training loader
for epoch in range(4):
    model.train()  # switch to training mode at the start of every epoch
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in loop:
        # Move the whole batch to the device in one pass, then separate the
        # two entries that are not plain BERT inputs.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        labels = batch.pop('labels')
        extra_feats = batch.pop('extra_feats')

        outputs = model(extra_feats=extra_feats, **batch)
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Apply the update, then clear gradients ready for the next batch.
        optimizer.step()
        optimizer.zero_grad()
        loop.set_postfix(loss=loss.item())

# ✅ 9. Validation: score the held-out split with F1
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in val_loader:
        labels = batch.pop('labels')  # never moved to the device; only used for scoring
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        extra_feats = batch.pop('extra_feats')

        outputs = model(extra_feats=extra_feats, **batch)
        # Predicted class = index of the larger of the two logits.
        preds = outputs.argmax(dim=1).cpu().numpy()

        all_labels += list(labels.numpy())
        all_preds += list(preds)

f1 = f1_score(y_true=all_labels, y_pred=all_preds)
print(f"\n✅ BERT 기반 분류기 검증 F1 Score: {round(f1, 4)}")

# ✅ 10. Predict on the test set and write the submission file
# Set eval mode here as well, so this section gives deterministic predictions
# even when it is run without the validation section (e.g. right after
# training, when the model is still in train mode).
model.eval()
test_preds = []
with torch.no_grad():
    for batch in test_loader:
        extra_feats = batch.pop('extra_feats').to(device)
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(extra_feats=extra_feats, **batch)
        # Predicted class = index of the larger of the two logits.
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        test_preds.extend(preds)

# test_loader does not shuffle, so predictions are in the same order as `test`.
submission = pd.DataFrame({
    'id': test['id'],
    'target': test_preds
})
submission.to_csv("submission.csv", index=False)


ImportError: cannot import name 'AdamW' from 'transformers' (/mnt/d/workspace/TIL/.venv/lib/python3.10/site-packages/transformers/__init__.py)