In [1]:
#!pip install tensorflow

In [2]:
import pandas as pd
import numpy as np
import keras
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.preprocessing import TargetEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from scikeras.wrappers import KerasClassifier

2024-07-25 00:57:45.396864: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load the raw training data (the train/validation split happens in a later cell).
train_csv_path = '/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv'
train_origin = pd.read_csv(train_csv_path)

In [6]:
# For a quick smoke run, swap the next assignment for this 1% sample:
#train = train_origin.sample(frac=0.01, random_state = 42).set_index('id').astype(str)
# Index the rows by 'id' and cast every column (the target included) to str.
train = train_origin.set_index('id').astype(str)

# Separate the predictors from the target column.
X = train.drop(['Response'], axis=1)
y = train['Response']

# Hold out 20% of the rows as a validation set (seeded, so the split is repeatable).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Target-encode every feature.
# TargetEncoder.fit_transform encodes the training rows with shuffled internal
# cross fitting, so the encoder is seeded as well; without random_state the
# encoded training features differ from run to run even though the split is fixed.
enc = TargetEncoder(random_state=42)
X_train = pd.DataFrame(enc.fit_transform(X_train, y_train),
                       index=X_train.index, columns=X_train.columns)
# The validation rows are only transformed with the encoder fitted above.
X_valid = pd.DataFrame(enc.transform(X_valid), index=X_valid.index,
                       columns=X_valid.columns)

In [None]:
# Model-building function
def get_model2(meta):
    """Assemble the dense network handed to the classifier wrapper.

    Args:
        meta: Mapping that provides "X_shape_"; everything after its first
            entry is used as the input shape.

    Returns:
        A keras Sequential model (not compiled here) made of Dense layers of
        64, 128, 256 and 128 units, each followed by batch normalisation,
        then a Dense layer of 32 units and a single sigmoid output unit.
    """
    net = keras.models.Sequential()
    net.add(keras.layers.Input(meta["X_shape_"][1:]))

    # Hidden blocks: ReLU Dense layer with He-normal init, then batch norm.
    for width in (64, 128, 256, 128):
        net.add(keras.layers.Dense(width, kernel_initializer='he_normal', activation='relu'))
        net.add(keras.layers.BatchNormalization())

    # Final hidden layer (no batch norm after it) and the sigmoid output unit.
    net.add(keras.layers.Dense(32, kernel_initializer='he_normal', activation='relu'))
    net.add(keras.layers.Dense(1, kernel_initializer='he_normal', activation='sigmoid'))

    return net

# Learning-rate schedule function
def lr_scheduler(epoch, lr, decay_rate=0.96, decay_step=10):
    """Step-decay schedule for the LearningRateScheduler callback.

    The callback passes in the optimizer's *current* learning rate, i.e. the
    value this function returned for the previous epoch. The decay is
    therefore applied exactly once each time a ``decay_step`` boundary is
    crossed, which gives ``initial_lr * decay_rate ** (epoch // decay_step)``.
    Multiplying the incoming rate by ``decay_rate ** (epoch // decay_step)``
    on every call would compound the decay epoch after epoch.

    Args:
        epoch: Zero-based epoch index.
        lr: Learning rate currently in use.
        decay_rate: Multiplicative factor applied at each boundary.
        decay_step: Number of epochs between two decays.

    Returns:
        The learning rate to use for this epoch.
    """
    # Epoch 0 is skipped so the initial rate is used as configured.
    if epoch > 0 and epoch % decay_step == 0:
        new_lr = lr * decay_rate
    else:
        new_lr = lr
    print(f"Epoch {epoch}: Learning rate is {new_lr}")
    return new_lr

# LearningRateScheduler callback driven by lr_scheduler
lr_schedule = LearningRateScheduler(lr_scheduler)

# scikit-learn style wrapper around get_model2; 5% of the rows passed to fit
# are set aside (validation_split) to compute the val_loss shown during training.
keras_model = KerasClassifier(
    get_model2,
    loss="binary_crossentropy",
    optimizer=keras.optimizers.AdamW(learning_rate=2e-03),
    validation_split=0.05,
    batch_size=128,
    validation_batch_size=65536,
    epochs=30,  # upper bound; early stopping may end training sooner
    callbacks=[
        lr_schedule,
        # With patience=10 training continues for up to 10 epochs after the
        # best validation loss; restore_best_weights rolls the model back to
        # the best epoch instead of keeping the later, worse weights.
        keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
    ],
)

In [7]:
# Train the classifier on the encoded training split.
keras_model.fit(X_train, y_train)

# Score the validation split: take column 1 of predict_proba and report ROC AUC.
valid_proba = keras_model.predict_proba(X_valid)
y_pred = valid_proba[:, 1]
valid_auc = roc_auc_score(y_valid, y_pred)
print(f"# AUC: {valid_auc:.5f}")

Epoch 0: Learning rate is 0.0020000000949949026
Epoch 1/30
[1m68310/68310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m670s[0m 10ms/step - loss: 0.2539 - val_loss: 0.2513 - learning_rate: 0.0020
Epoch 1: Learning rate is 0.0020000000949949026
Epoch 2/30
[1m68310/68310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m578s[0m 8ms/step - loss: 0.2515 - val_loss: 0.2510 - learning_rate: 0.0020
Epoch 2: Learning rate is 0.0020000000949949026
Epoch 3/30
[1m68310/68310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m547s[0m 8ms/step - loss: 0.2514 - val_loss: 0.2512 - learning_rate: 0.0020
Epoch 3: Learning rate is 0.0020000000949949026
Epoch 4/30
[1m68310/68310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m671s[0m 10ms/step - loss: 0.2509 - val_loss: 0.2513 - learning_rate: 0.0020
Epoch 4: Learning rate is 0.0020000000949949026
Epoch 5/30
[1m68310/68310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m706s[0m 10ms/step - loss: 0.2511 - val_loss: 0.2519 - learning_rate: 0.0020
Epoch 5

In [13]:
# Load the test data.
test_csv_path = '/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/test.csv'
test_origin = pd.read_csv(test_csv_path)

# Move 'id' into the index and cast every column to str.
X_test = test_origin.set_index('id').astype(str)

# Target-encode the features with the encoder that was already fitted.
encoded_test = enc.transform(X_test)
X_test = pd.DataFrame(encoded_test, index=X_test.index, columns=X_test.columns)

# Generate predictions: keep column 1 of the predict_proba output.
test_proba = keras_model.predict_proba(X_test)
y_test_pred = test_proba[:, 1]

# Build the submission frame with 'id' and 'Response' columns.
submission = pd.DataFrame({'id': X_test.index, 'Response': y_test_pred})
print(submission.head())

# Write the predictions to a CSV file.
submission.to_csv('keras_predictions.csv', index=False)
print("Predictions saved to 'keras_predictions.csv'")

[1m59921/59921[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 4ms/step
         id  Response
0  11504798  0.003773
1  11504799  0.556605
2  11504800  0.261816
3  11504801  0.000034
4  11504802  0.137380
Predictions saved to 'keras_predictions.csv'


In [20]:
# Load the sample submission file and preview its first rows.
sample_csv_path = '/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/sample_submission.csv'
sample = pd.read_csv(sample_csv_path)
sample.head()


Unnamed: 0,id,Response
0,11504798,0.5
1,11504799,0.5
2,11504800,0.5
3,11504801,0.5
4,11504802,0.5
