In [8]:
import os

# Set the CUDA-related environment variables for this kernel process.
# Both values are written unconditionally, overriding anything inherited
# from the parent environment.
# NOTE(review): none of the visible cells use a GPU library — confirm this
# pinning to device index "1" is still required.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "1",
    }
)

## 0. 학습 준비
- 데이터셋 로드
- 환경 설정 (패키지 설치 및 로드)

In [9]:
import os

# Default run configuration, exported through environment variables.
# setdefault() never overwrites, so a value that is already present in the
# environment takes precedence over the defaults listed here.
_ENV_DEFAULTS = (
    # Input/output path: the seed CSV file.
    ("CSV_PATH", "schedule_dataset_augmented_epoch_2.csv"),
    # Batch unit: how many items to generate at once.
    ("BATCH_SIZE", "10"),
    # Generation diversity.
    ("TEMPERATURE", "0.8"),
    ("TOP_P", "0.9"),
)

for _env_name, _env_default in _ENV_DEFAULTS:
    os.environ.setdefault(_env_name, _env_default)

print("환경설정 완료")


환경설정 완료


In [10]:

# 환경 설정과 데이터 로드
import re, random, json, types
import numpy as np
import pandas as pd
from pathlib import Path

# Dataset location comes from the CSV_PATH environment variable
# (this is None when the variable has not been set).
CSV_PATH = os.environ.get("CSV_PATH")

# Read the whole CSV into a DataFrame, then show its dimensions and the
# first five rows as plain text.
df = pd.read_csv(CSV_PATH)
print(f"Loaded shape: {df.shape}")
print(df.head(5))

Loaded shape: (2000, 8)
              sentence             domain    task  label  confidence   source  \
0     배경 러프 수정본 여기 있어요  design_production    NONE      0        0.99  teacher   
1  오늘 밤에 색감 체크 가능하신가요?  design_production  CREATE      1        0.92  teacher   
2        싱크표 다시 올려드릴게요  design_production    NONE      0        0.98  teacher   
3   금요일 오후 킥오프 미팅 할까요?  design_production  CREATE      1        0.95  teacher   
4       오늘 콘티 피드백만 주세요  design_production    NONE      0        0.97  teacher   

   seed_id  epoch  
0      0.0      0  
1      1.0      0  
2      2.0      0  
3      3.0      0  
4      4.0      0  


In [6]:
 
# 텍스트/라벨 컬럼 자동 감지 및 분할
from sklearn.model_selection import train_test_split

# Text column: prefer "sentence", otherwise fall back to the first column.
if "sentence" in df.columns:
    text_col = "sentence"
else:
    text_col = df.columns[0]

# Label column: prefer "label", otherwise fall back to the last column.
if "label" in df.columns:
    label_col = "label"
else:
    label_col = df.columns[-1]

print("Using columns -> text:", text_col, "label:", label_col)

# Keep only the two modelling columns, drop rows with a missing value in
# either of them, and renumber the index from zero.
df = df.loc[:, [text_col, label_col]].dropna().reset_index(drop=True)
X = df[text_col].astype(str).values
y = df[label_col].values

# Stratified 80/20 hold-out split with a fixed seed.
# NOTE(review): the input file name ("..._augmented_epoch_2.csv") and the
# seed_id/epoch columns in the loaded frame suggest that several rows may be
# augmented variants of one seed sentence. If so, a purely random split can
# place variants of the same seed in both train and test and inflate the
# scores — confirm, and consider a group-aware split keyed on seed_id.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=38,
    stratify=y,
)
len(X_train), len(X_test)

Using columns -> text: sentence label: label


(1600, 400)

## TF-IDF 분류기

In [7]:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

def _fit_and_score(pipeline):
    """Fit ``pipeline`` on the training split and score it on the test split.

    Uses the notebook-level X_train / y_train / X_test / y_test arrays.

    Returns:
        (predictions, accuracy): the predicted labels for X_test and the
        accuracy of those predictions against y_test.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    return predictions, accuracy_score(y_test, predictions)


# 1) Character n-gram baseline.
# random_state is fixed (same seed as the train/test split) so that repeated
# runs of this cell give the same fitted model; LinearSVC is otherwise
# unseeded.
tfidf_char = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer="char", ngram_range=(2, 5), min_df=3)),
    ("clf", LinearSVC(random_state=38)),
])
pred_char, acc_char = _fit_and_score(tfidf_char)
print("TF‑IDF char Accuracy:", acc_char)
print(classification_report(y_test, pred_char))

# 2) Word-level baseline.
# The token pattern also keeps single-character tokens.
tfidf_word = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer="word", token_pattern=r"(?u)\b\w+\b", min_df=3)),
    ("clf", LinearSVC(random_state=38)),
])
pred_word, acc_word = _fit_and_score(tfidf_word)
print("TF‑IDF word Accuracy:", acc_word)
print(classification_report(y_test, pred_word))

TF‑IDF char Accuracy: 0.9925
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       148
           1       0.99      1.00      0.99       252

    accuracy                           0.99       400
   macro avg       0.99      0.99      0.99       400
weighted avg       0.99      0.99      0.99       400

TF‑IDF word Accuracy: 0.9975
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       148
           1       1.00      1.00      1.00       252

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [11]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, average_precision_score

# Shared 5-fold stratified splitter (shuffled, fixed seed).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=38)

# Metrics collected per fold; results appear under "test_<key>".
# "prauc" is PR-AUC (average precision), useful under class imbalance;
# LinearSVC exposes decision_function, which this scorer can use.
scoring = dict(
    f1="f1",
    precision="precision",
    recall="recall",
    prauc="average_precision",
)


def _print_cv_summary(tag, results):
    """Print the mean fold metrics (and the F1 standard deviation) for one run.

    ``tag`` is the short pipeline name shown in the "[CV-...]" prefix and
    ``results`` is the dict returned by cross_validate for that pipeline.
    """
    f1_per_fold = results["test_f1"]
    print("[CV-%s] F1: %.3f ± %.3f | P: %.3f | R: %.3f | PR-AUC: %.3f" % (
        tag,
        f1_per_fold.mean(),
        f1_per_fold.std(),
        results["test_precision"].mean(),
        results["test_recall"].mean(),
        results["test_prauc"].mean(),
    ))


# Cross-validate the char n-gram pipeline on the training split only.
cv_char = cross_validate(
    tfidf_char,
    X_train,
    y_train,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=False,
)
_print_cv_summary("char", cv_char)

# Cross-validate the word-level pipeline on the training split only.
cv_word = cross_validate(
    tfidf_word,
    X_train,
    y_train,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=False,
)
_print_cv_summary("word", cv_word)


[CV-char] F1: 0.994 ± 0.005 | P: 0.993 | R: 0.994 | PR-AUC: 1.000
[CV-word] F1: 0.988 ± 0.009 | P: 0.988 | R: 0.988 | PR-AUC: 0.999


In [12]:
from sklearn.model_selection import GridSearchCV

# --- Character n-gram candidates ---
# The classifier is seeded (same seed as the split and the CV splitter) so
# that the grid-search ranking is reproducible from run to run; the values
# given to the pipeline here are placeholders that the grid overrides.
pipe_char = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer="char", ngram_range=(2, 5), min_df=3)),
    ("clf", LinearSVC(random_state=38)),
])

param_char = {
    "tfidf__ngram_range": [(2, 4), (2, 5), (3, 5)],
    "tfidf__min_df": [2, 3, 5],
    "clf__C": [0.5, 1.0, 2.0, 5.0],
}

# Exhaustive search scored by F1, reusing the `cv` splitter defined in the
# cross-validation cell.
gscv_char = GridSearchCV(
    pipe_char, param_char, cv=cv, scoring="f1", n_jobs=-1, verbose=1
)
gscv_char.fit(X_train, y_train)
print("Best(char):", gscv_char.best_params_, "F1=%.3f" % gscv_char.best_score_)

# --- Word-level candidates ---
pipe_word = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer="word", token_pattern=r"(?u)\b\w+\b", min_df=3)),
    ("clf", LinearSVC(random_state=38)),
])

param_word = {
    "tfidf__min_df": [2, 3, 5],
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "clf__C": [0.5, 1.0, 2.0, 5.0],
}

gscv_word = GridSearchCV(
    pipe_word, param_word, cv=cv, scoring="f1", n_jobs=-1, verbose=1
)
gscv_word.fit(X_train, y_train)
print("Best(word):", gscv_word.best_params_, "F1=%.3f" % gscv_word.best_score_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best(char): {'clf__C': 1.0, 'tfidf__min_df': 2, 'tfidf__ngram_range': (2, 4)} F1=0.994
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best(word): {'clf__C': 1.0, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)} F1=0.993


In [13]:
from sklearn.metrics import classification_report, average_precision_score

# Pick whichever grid search reached the higher mean CV F1 (a tie goes to
# the char n-gram search).
best_search = gscv_char if gscv_char.best_score_ >= gscv_word.best_score_ else gscv_word

# Both searches were created without a `refit` argument, so GridSearchCV's
# default (refit=True) applies: best_estimator_ has already been refit on
# all of X_train with the winning parameters. No second fit is needed.
best_model = best_search.best_estimator_

# Evaluate once on the held-out test split.
pred = best_model.predict(X_test)

print("=== Hold-out Test ===")
print(classification_report(y_test, pred, digits=3))

# PR-AUC as well, computed from the signed decision_function scores.
scores = best_model.decision_function(X_test)
print("Test PR-AUC:", average_precision_score(y_test, scores))


=== Hold-out Test ===
              precision    recall  f1-score   support

           0      1.000     0.986     0.993       148
           1      0.992     1.000     0.996       252

    accuracy                          0.995       400
   macro avg      0.996     0.993     0.995       400
weighted avg      0.995     0.995     0.995       400

Test PR-AUC: 0.9999687539057618
