In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# 필요한 패키지 불러오기
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import gensim.downloader as api
from tensorflow.keras.callbacks import EarlyStopping

# 데이터 로드
train_df = pd.read_csv('train_for_NLP.csv')
test_df = pd.read_csv('test_cleaned.csv')

# train 데이터에서 특성과 레이블 분리
X_train = train_df['combined_str']
y_train = train_df['target']

# test 데이터의 특성
X_test = test_df['combined_str']

# GloVe 임베딩 로드 (100차원 벡터)
glove_model = api.load("glove-wiki-gigaword-100")

# GloVe로 문서 벡터화
def vectorize_glove(text, model):
    words = text.split()
    word_vectors = [model[word] for word in words if word in model]
    if len(word_vectors) > 0:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(100)  # GloVe 벡터 크기

# 문서를 GloVe 벡터로 변환
X_train_glove = np.array([vectorize_glove(text, glove_model) for text in X_train])
X_test_glove = np.array([vectorize_glove(text, glove_model) for text in X_test])

# 스케일링
scaler = StandardScaler()
X_train_glove_scaled = scaler.fit_transform(X_train_glove)
X_test_glove_scaled = scaler.transform(X_test_glove)

# CNN 입력에 맞게 데이터 차원 변경 (1D CNN에서는 입력이 3차원이어야 함)
X_test_glove_scaled = np.expand_dims(X_test_glove_scaled, axis=-1)

# 1D CNN 모델 생성 함수
def create_cnn_model(optimizer='adam', filters=64, kernel_size=3, pool_size=2):
    model = tf.keras.models.Sequential()

    # 1D Convolutional Layer
    model.add(tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu', input_shape=(X_train_glove_scaled.shape[1], 1)))

    # MaxPooling Layer
    model.add(tf.keras.layers.MaxPooling1D(pool_size=pool_size))

    # Flatten Layer
    model.add(tf.keras.layers.Flatten())

    # Dense Layer
    model.add(tf.keras.layers.Dense(64, activation='relu'))

    # Output Layer (이진 분류)
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    # Compile Model
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model

# K-fold Cross Validation 설정
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
accuracy_scores = []

for train_idx, val_idx in kfold.split(X_train_glove_scaled):
    # 각 Fold에서의 Train, Validation 데이터 설정
    X_train_fold, X_val_fold = X_train_glove_scaled[train_idx], X_train_glove_scaled[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    # CNN 입력에 맞게 데이터 차원 변경
    X_train_fold = np.expand_dims(X_train_fold, axis=-1)
    X_val_fold = np.expand_dims(X_val_fold, axis=-1)

    # 모델 생성
    cnn_model = create_cnn_model()

    # EarlyStopping 콜백 정의
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # 모델 학습
    print(f'Training on fold {fold_no}...')
    history = cnn_model.fit(
        X_train_fold, y_train_fold,
        validation_data=(X_val_fold, y_val_fold),
        epochs=10,
        batch_size=32,
        callbacks=[early_stopping],
        verbose=0  # 학습 과정 출력 제한
    )

    # 각 폴드에서의 정확도 평가
    scores = cnn_model.evaluate(X_val_fold, y_val_fold, verbose=0)
    print(f'Fold {fold_no} - Validation Accuracy: {scores[1]}')
    accuracy_scores.append(scores[1])
    fold_no += 1

# 각 Fold의 평균 정확도 계산
mean_accuracy = np.mean(accuracy_scores)
print(f'Mean K-Fold Validation Accuracy: {mean_accuracy}')

# 최종 모델로 전체 데이터를 학습하고 테스트 데이터 예측
final_model = create_cnn_model()
final_model.fit(
    np.expand_dims(X_train_glove_scaled, axis=-1), y_train,
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=0
)

# 테스트 데이터 예측
y_test_pred_prob_cnn = final_model.predict(X_test_glove_scaled).flatten()
y_test_pred_cnn = (y_test_pred_prob_cnn >= 0.5).astype(int)

# 제출 파일 생성
submission_df = pd.read_csv("sample_submission.csv")
submission_df['target'] = y_test_pred_cnn
submission_df.to_csv("submit_GloVe_1DCNN.csv", index=False)




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training on fold 1...
Fold 1 - Validation Accuracy: 0.7977675795555115
Training on fold 2...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 2 - Validation Accuracy: 0.8003939390182495
Training on fold 3...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 3 - Validation Accuracy: 0.7984241843223572
Training on fold 4...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 4 - Validation Accuracy: 0.7910643815994263
Training on fold 5...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 5 - Validation Accuracy: 0.7739816308021545
Mean K-Fold Validation Accuracy: 0.7923263430595398


  current = self.get_monitor_value(logs)


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
