In [1]:
!pip install --upgrade torch torchvision torchaudio transformers
!pip install datasets transformers scikit-learn torch pandas

[0m

In [2]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report
import re
from transformers import pipeline
import random

In [3]:
# 1. 데이터셋 로드 

print("Loading AG News dataset...")
dataset = load_dataset("ag_news")

# 클래스 라벨 정의 (AG News 기준)
# 0: World, 1: Sports, 2: Business, 3: Sci/Tech
labels = ["World", "Sports", "Business", "Sci/Tech"]

test_dataset = dataset['test'].shuffle(seed=42).select(range(1000))

print(f"Test data size: {len(test_dataset)}")
print(f"Example data: {test_dataset[0]}")

Loading AG News dataset...
Test data size: 1000
Example data: {'text': 'Indian board plans own telecast of Australia series The Indian cricket board said on Wednesday it was making arrangements on its own to broadcast next month #39;s test series against Australia, which is under threat because of a raging TV rights dispute.', 'label': 1}


In [4]:
# 1. 데이터셋 전처리

def preprocess_text(data):
    text = data['text']
    # 1. 소문자 변환
    text = text.lower()
    
    # 2. 불필요한 특수문자 제거 (정규표현식 사용)
    # 알파벳, 숫자, 공백만 남기고 다 제거
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # 3. 다중 공백 제거 (공백이 여러 개면 하나로 줄임)
    text = re.sub(r'\s+', ' ', text).strip()
    
    return {"text": text, "label": data['label']}


print("Applying preprocessing...")

test_dataset = test_dataset.map(preprocess_text)

print(f"Preprocessed Example: {test_dataset[0]['text']}")

Applying preprocessing...
Preprocessed Example: indian board plans own telecast of australia series the indian cricket board said on wednesday it was making arrangements on its own to broadcast next month 39s test series against australia which is under threat because of a raging tv rights dispute


In [5]:
# 2. 나이브 베이스라인 (Keyword-based Classifier)

def naive_baseline(text):
    text = text.lower()
    
    # 각 카테고리별 대표 키워드 정의 (단순 규칙)
    keywords = {
        0: ["war", "president", "minister", "politic", "country", "police", "gov", "crime", "world"],         # World
        1: ["game", "cup", "team", "score", "win", "season", "olympic", "defeat", "sports"],                  # Sports
        2: ["stock", "market", "company", "price", "profit", "corp", "bank", "econom", "business"],           # Business
        3: ["computer", "software", "microsoft", "internet", "space", "chip", "service","science", "tech"]    # Sci/Tech
    }
    
    # 키워드 매칭 점수 계산
    scores = {0: 0, 1: 0, 2: 0, 3: 0}
    for label, words in keywords.items():
        for word in words:
            if word in text:
                scores[label] += 1
    
    # 가장 점수가 높은 클래스 선택 (동점이면 후보들 중에서 무작위로 하나 선택)
    max_score = max(scores.values())
    
    candidates = [label for label, score in scores.items() if score == max_score]
    
    pred = random.choice(candidates)
    
    return pred

In [6]:
# 3. AI 파이프라인 구성 (Pre-trained Model)

# AG News에 대해 이미 학습된 모델 사용 (Fine-tuning 불필요)
model_name = "fabriceyhc/bert-base-uncased-ag_news"

print(f"\nLoading AI Pipeline with model: {model_name}...")

# 파이프라인 생성 (자동으로 토크나이저와 모델을 로드함)
classifier = pipeline("text-classification", model=model_name, tokenizer=model_name, truncation=True)


Loading AI Pipeline with model: fabriceyhc/bert-base-uncased-ag_news...


Device set to use cuda:0


In [7]:
# 4. baseline test
print("\nRunning Naive Baseline...")
baseline_preds = [naive_baseline(item['text']) for item in test_dataset]

# 5. ai test
print("Running AI Pipeline...")

# 데이터에서 텍스트만 추출
texts = test_dataset['text']

# 파이프라인을 통해 예측 수행
ai_results = classifier(list(texts))

# 결과 변환 (모델 출력인 LABEL_0, World 등을 정수 인덱스 0, 1, 2, 3으로 변환)
# 이 모델은 라벨을 "World", "Sports" 등으로 출력함. 이를 정수로 매핑해야 함.
label_map = {"World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3}

ai_preds = []
for result in ai_results:
    # 모델이 출력한 라벨 스트링을 정수로 변환
    pred_label_str = result['label']
    ai_preds.append(int(pred_label_str.split("_")[-1]))


Running Naive Baseline...
Running AI Pipeline...


In [8]:

ground_truth = test_dataset['label']

# 베이스라인 평가
baseline_acc = accuracy_score(ground_truth, baseline_preds)
print(f"=== Baseline Accuracy: {baseline_acc:.4f} ===")
print(classification_report(ground_truth, baseline_preds, target_names=labels))

# AI 파이프라인 평가
ai_acc = accuracy_score(ground_truth, ai_preds)
print(f"=== AI Pipeline Accuracy: {ai_acc:.4f} ===")
print(classification_report(ground_truth, ai_preds, target_names=labels))

=== Baseline Accuracy: 0.5410 ===
              precision    recall  f1-score   support

       World       0.53      0.53      0.53       266
      Sports       0.58      0.71      0.64       246
    Business       0.50      0.53      0.52       246
    Sci/Tech       0.55      0.40      0.46       242

    accuracy                           0.54      1000
   macro avg       0.54      0.54      0.54      1000
weighted avg       0.54      0.54      0.54      1000

=== AI Pipeline Accuracy: 0.8970 ===
              precision    recall  f1-score   support

       World       0.94      0.86      0.90       266
      Sports       0.93      1.00      0.96       246
    Business       0.84      0.89      0.87       246
    Sci/Tech       0.88      0.85      0.86       242

    accuracy                           0.90      1000
   macro avg       0.90      0.90      0.90      1000
weighted avg       0.90      0.90      0.90      1000

