# In [3]:
# 필요한 패키지 불러오기
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Load the training and test sets.
train_df = pd.read_csv('train_for_NLP.csv')
test_df = pd.read_csv('test_cleaned.csv')

# Separate the training data into the text feature and the label.
# pd.read_csv parses empty fields as NaN, and CountVectorizer raises a
# ValueError on NaN documents, so texts that are empty in the CSV are
# normalised back to '' here. This is a no-op when no value is missing.
X_train = train_df['combined_str'].fillna('')
y_train = train_df['target']

# Text feature of the test set (same NaN -> '' normalisation).
X_test = test_df['combined_str'].fillna('')

# Bag-of-words encoding limited to a vocabulary of at most 5000 terms.
# The vocabulary is learned from the training texts and reused for the test texts.
vectorizer = CountVectorizer(max_features=5000)
X_train_bow = vectorizer.fit_transform(X_train).toarray()
X_test_bow = vectorizer.transform(X_test).toarray()

# Standardise every feature column using statistics of the training matrix.
scaler = StandardScaler().fit(X_train_bow)
X_train_bow_scaled = scaler.transform(X_train_bow)
X_test_bow_scaled = scaler.transform(X_test_bow)

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
# NOTE(review): the vectorizer and scaler above are fitted before this split,
# so validation rows contribute to the vocabulary and scaling statistics;
# validation metrics may therefore be slightly optimistic.
X_train_bow_split, X_val_bow_split, y_train_split, y_val_split = train_test_split(
    X_train_bow_scaled,
    y_train,
    test_size=0.2,
    random_state=42,
)

# A 1D CNN needs 3-D input, so append a trailing axis of size 1 to each matrix.
X_train_bow_split = X_train_bow_split[..., np.newaxis]
X_val_bow_split = X_val_bow_split[..., np.newaxis]
X_test_bow_scaled = X_test_bow_scaled[..., np.newaxis]

# Build the 1D CNN model.
def create_cnn_model(input_shape):
    """Build and compile a small 1D CNN for binary classification.

    Architecture: Conv1D(64 filters, kernel size 3, ReLU) -> MaxPooling1D(2)
    -> Flatten -> Dense(64, ReLU) -> Dense(1, sigmoid). The model is compiled
    with the Adam optimizer, binary cross-entropy loss and accuracy as the
    reported metric.

    Args:
        input_shape: Shape of a single sample without the batch dimension,
            e.g. ``(sequence_length, features)``.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    model = tf.keras.models.Sequential([
        # Declare the input explicitly rather than passing ``input_shape`` to
        # the first layer; Keras 3 warns about the latter in Sequential models.
        tf.keras.layers.Input(shape=input_shape),
        # 1D convolution over the input sequence.
        tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
        # Halve the sequence length.
        tf.keras.layers.MaxPooling1D(pool_size=2),
        # Collapse to a flat vector for the dense head.
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        # Single sigmoid unit: output is the positive-class probability.
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Instantiate the model; each sample has shape (sequence_length, features).
input_shape = (X_train_bow_split.shape[1], 1)
cnn_model = create_cnn_model(input_shape)

# Train for a single epoch in mini-batches of 32, evaluating on the
# held-out validation split at the end of the epoch.
history = cnn_model.fit(
    x=X_train_bow_split,
    y=y_train_split,
    batch_size=32,
    epochs=1,
    validation_data=(X_val_bow_split, y_val_split),
)

# Predict on the test set: collapse the model output to a 1-D array of
# probabilities, then label a sample 1 when its probability is at least 0.5.
y_test_pred_prob_cnn = cnn_model.predict(X_test_bow_scaled).reshape(-1)
y_test_pred_cnn = (y_test_pred_prob_cnn >= 0.5).astype(int)

# Write the submission file: take the sample submission as a template,
# set its 'target' column to the predicted labels and save without the index.
submission_df = pd.read_csv("sample_submission.csv").assign(target=y_test_pred_cnn)
submission_df.to_csv("submit_CountVectorizer_1DCNN.csv", index=False)


