In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, PReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Load the labelled training table and the unlabelled test features from Google Drive.
train = pd.read_csv('/content/drive/My Drive/Machine Learning/open data/train.csv')
test = pd.read_csv('/content/drive/My Drive/Machine Learning/open data/test_x.csv')

# Report the total number of missing cells in each table (messages are printed in Korean).
print('train 자료의 결측치는 모두 {}개입니다.'.format(sum(train.isnull().sum())))
print('test 자료의 결측치는 모두 {}개입니다.'.format(sum(test.isnull().sum())))

# Columns dropped from both train and test: every Q?E column plus the row index.
drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE',
             'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
             'QkE', 'QlE', 'QmE', 'QnE', 'QoE',
             'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
             'index']

# Target used by every model below: Y = 2 - voted.
Y = 2 - train['voted']
# Remove the unneeded columns and separate the label from the features.
X = train.drop(drop_list + ['voted'], axis=1)

# Columns cast to str so that pd.get_dummies one-hot encodes them.
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}
X = X.astype(replace_dict)
# One-hot encode every string/object column.
X = pd.get_dummies(X)

# Reverse-score the negatively signed statements (domain knowledge): answer -> 6 - answer.
flipping_columns = ["QeA", "QfA", "QkA", "QqA", "QrA"]
# Reverse-score the negatively signed "secret" statements as well (domain knowledge).
flipping_secret_columns = ["QaA", "QdA", "QgA", "QiA", "QnA"]
# The two lists do not overlap, so one vectorized assignment flips each column exactly once.
reversed_columns = flipping_columns + flipping_secret_columns
X[reversed_columns] = 6 - X[reversed_columns]

# Split into train / validation sets BEFORE scaling so that the validation rows
# do not influence the scaler's min/max statistics.
x_train, x_val, y_train, y_val = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y)

# Min-max normalization: statistics come from the training split only and the
# same fitted scaler is then applied to every other split. `scaler` stays at
# module level because later cells apply it to the test features.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_val = scaler.transform(x_val)
# Keep X as a scaled array, as before, in case another cell reads it.
X = scaler.transform(X)

In [None]:
# Apply to the test features the same preprocessing that was applied to the
# training features: drop columns, cast categoricals to str, one-hot encode.
# NOTE: this cell overwrites `test`, so it is meant to run once per kernel session.
test = test.drop(drop_list, axis=1)
test = test.astype(replace_dict)
test = pd.get_dummies(test)

# Reverse-score the negatively signed statements (domain knowledge): answer -> 6 - answer.
flipping_columns = ["QeA", "QfA", "QkA", "QqA", "QrA"]
# Reverse-score the negatively signed "secret" statements as well (domain knowledge).
flipping_secret_columns = ["QaA", "QdA", "QgA", "QiA", "QnA"]
# The two lists do not overlap, so one vectorized assignment flips each column exactly once.
reversed_columns = flipping_columns + flipping_secret_columns
test[reversed_columns] = 6 - test[reversed_columns]

# pd.get_dummies only creates columns for categories present in this table, so
# line the columns up with the layout the scaler was fitted on. Categories that
# never occur in the test set become all-zero columns; categories the training
# data never had are dropped. `feature_names_in_` only exists when the scaler
# was fitted on a DataFrame with a scikit-learn version that records names.
train_columns = getattr(scaler, 'feature_names_in_', None)
if train_columns is not None:
    test = test.reindex(columns=train_columns, fill_value=0)

# Only transform here: refitting the scaler on the test set would scale the
# test features differently from the data the models were trained on.
test = scaler.transform(test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Grid search that was used to pick the hyper-parameters below (kept for reference).
# param_grid = {'max_depth': [10, 15, 20],
#               'n_estimators': [100, 250, 500, 750, 1000],
#               'max_features': ['sqrt', None]}
# grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
# grid.fit(x_train, y_train)

# print(grid.best_params_)
# print(grid.best_score_)

# rf = grid.best_estimator_
# pred_rf = rf.predict(x_val)
# print(pred_rf)

# Random forest with the selected hyper-parameters.
# 'sqrt' is what max_features='auto' meant for classifiers; the 'auto' spelling
# is deprecated and rejected by newer scikit-learn releases.
rf = RandomForestClassifier(max_depth=20, n_estimators=750, max_features='sqrt')
rf.fit(x_train, y_train)
# Hard-label predictions on the validation split.
pred_rf = rf.predict(x_val)
print(pred_rf)

In [None]:
from lightgbm import LGBMClassifier
# Grid search that was used to pick the hyper-parameters below (kept for reference).
# param_grid_lgbm = {'max_depth': [10, 15, 20],
#                    'learning_rate': [0.1, 0.01, 0.05, 0.001],
#                    'n_estimators': [100, 250, 500, 750, 1000, 1500]}
# grid_lgbm = GridSearchCV(LGBMClassifier(), param_grid_lgbm, cv=5)
# grid_lgbm.fit(x_train, y_train)

# print(grid_lgbm.best_params_)
# print(grid_lgbm.best_score_)

# lgbm = grid_lgbm.best_estimator_
# pred_lgbm = lgbm.predict(x_val)
# print(pred_lgbm)

# LightGBM with the selected hyper-parameters.
# n_estimators is the scikit-learn wrapper's own name for the number of boosting
# rounds; `num_boost_round` is only an alias of it and triggers a warning when
# passed to LGBMClassifier.
lgbm = LGBMClassifier(max_depth=15, learning_rate=0.1, n_estimators=100)
lgbm.fit(x_train, y_train)
# Hard-label predictions on the validation split.
pred_lgbm = lgbm.predict(x_val)
print(pred_lgbm)

In [None]:
from sklearn.metrics import classification_report
# Validation reports, printed in order: random forest first, then LightGBM.
for val_pred in (pred_rf, pred_lgbm):
    print(classification_report(y_val, val_pred))

In [None]:
!pip install catboost
from catboost import CatBoostClassifier
cbc = CatBoostClassifier()
cbc.fit(x_train, y_train)

pred_cbc = cbc.predict(x_val)
print(pred_cbc)
print(classification_report(y_val, pred_cbc))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Grid search that was used to pick the hyper-parameters below (kept for reference).
# param_grid_gbc = {'max_depth': [5, 7, 10, 15, 20],
#                   'n_estimators': [250, 500, 750,1000, 1500]}

# grid_gbc = GridSearchCV(GradientBoostingClassifier(), param_grid_gbc, cv=5)
# grid_gbc.fit(x_train, y_train)
# print(grid_gbc.best_params_)
# print(grid_gbc.best_score_)

# gbc=grid_gbc.best_estimator_
# pred_gbc = gbc.predict(x_val)
# print(pred_gbc)
# print(classification_report(y_val, pred_gbc))

# Gradient boosting with the selected hyper-parameters; fit() returns the estimator.
gbc = GradientBoostingClassifier(n_estimators=250, max_depth=5).fit(x_train, y_train)

# Validation predictions, followed by the per-class report.
pred_gbc = gbc.predict(x_val)
for output in (pred_gbc, classification_report(y_val, pred_gbc)):
    print(output)

In [None]:
from tensorflow.keras.metrics import AUC
def build_nn():
    """Build and compile the fully connected binary classifier.

    The network has three hidden stages (2048, 1024 and 512 units). Each stage
    is Dense -> PReLU -> BatchNormalization -> Dropout(0.7). A single sigmoid
    unit produces the output. The input width is read from the module-level
    ``x_train`` at call time.

    Returns:
        A compiled Keras ``Sequential`` model using Adam (learning rate 0.0035),
        binary cross-entropy loss, and binary accuracy plus AUC as metrics.
    """
    n_features = x_train.shape[1]
    # (units, kernel initializer) for each hidden stage, in order.
    hidden_layers = (
        (2048, 'he_uniform'),
        (1024, 'he_normal'),
        (512, 'he_normal'),
    )

    net = Sequential()
    for position, (units, initializer) in enumerate(hidden_layers):
        if position == 0:
            # Passing input_shape on the first Dense layer also defines the input layer.
            dense = Dense(units, kernel_initializer=initializer,
                          input_shape=(n_features,))
        else:
            dense = Dense(units, kernel_initializer=initializer)
        net.add(dense)
        net.add(PReLU())
        net.add(BatchNormalization())
        net.add(Dropout(0.7))

    # One sigmoid unit: the output is a single probability per row.
    net.add(Dense(1, activation='sigmoid', kernel_initializer='he_normal'))

    net.compile(optimizer=Adam(learning_rate=0.0035),
                loss='binary_crossentropy',
                metrics=['binary_accuracy', AUC()])

    return net

In [None]:
# Wrap build_nn in the Keras scikit-learn adapter so the network can be passed
# to scikit-learn utilities; every fit trains for 35 epochs.
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
DNN_clf = KerasClassifier(build_fn=build_nn, epochs=35)
# Sets the private `_estimator_type` attribute by hand.
# NOTE(review): presumably needed so scikit-learn's VotingClassifier accepts
# the wrapper as a classifier — confirm against the installed versions.
DNN_clf._estimator_type="classifier"

In [None]:
from sklearn.ensemble import VotingClassifier
# Members of the soft-voting ensemble: random forest, LightGBM, gradient
# boosting and the Keras network (the CatBoost model is not included).
ensemble_members = [('rf', rf), ('lgbm', lgbm), ('gbc', gbc), ('DNN', DNN_clf)]
voting = VotingClassifier(estimators=ensemble_members, voting='soft')
# fit() returns the fitted ensemble, so the validation prediction is chained.
pred_voting = voting.fit(x_train, y_train).predict(x_val)
print(pred_voting)

In [None]:
# Class-probability matrix from the voting ensemble on the validation split.
prob = voting.predict_proba(x_val)
# Keep the second probability column as the per-row validation score.
# NOTE(review): presumably this is P(Y == 1) — confirm against voting.classes_.
score = prob[:,1]
print(score)

In [None]:
from sklearn.metrics import roc_auc_score
# Optional: hard-label predictions for the test set (disabled).
# voting_result = voting.predict(test)
# print(voting_result)

# Per-class report from the hard predictions and ROC AUC from the probability scores.
validation_report = classification_report(y_val, pred_voting)
validation_auc = roc_auc_score(y_val, score)
print(validation_report)
print(validation_auc)

In [None]:
# Class-probability matrix from the voting ensemble on the preprocessed test features.
test_prob = voting.predict_proba(test)
# Map the second probability column back through the training label transform
# (Y = 2 - voted, hence voted = 2 - Y).
# NOTE(review): assuming `voted` takes the values 1 and 2, this yields a score
# between 1 and 2 that grows with the predicted chance of voted == 2 — confirm
# this is the orientation the submission format expects.
submission_prob = 2-test_prob[:,1]
print(submission_prob)

In [None]:
# Read the submission template, replace its 'voted' column with the ensemble
# scores, and write the result back to Drive without the DataFrame index.
submission = pd.read_csv('/content/drive/My Drive/Machine Learning/open data/submission.csv')
submission = submission.assign(voted=submission_prob)
submission.to_csv(
    '/content/drive/My Drive/Machine Learning/open data/submission_1112_colab_7782.csv',
    index=False,
)