In [None]:
# Mount the user's Google Drive inside the Colab VM at /content/gdrive.
# Later cells change into a folder under this mount point and read the
# CSV files from there, so this cell has to run first (it prompts for
# authorisation the first time it is executed in a session).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Make the course folder on the mounted Drive the working directory so that
# the relative paths used later ('./train.csv', './test.csv', './result.csv')
# resolve against it.
%cd /content/gdrive/MyDrive/2021_2학기/기계학습

In [None]:
# Sanity check: print the current working directory of the Colab VM.
!pwd

In [None]:
# Sanity check: list the working directory; train.csv, test.csv and result.csv
# are read from here by the cells below.
!ls

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Load -------------------------------------------------------------------
# Both CSVs are read relative to the current working directory.
train_data = pd.read_csv('./train.csv', encoding='utf-8')
test_data = pd.read_csv('./test.csv', encoding='utf-8')
test_data.info()

# --- Clean ------------------------------------------------------------------
# Keep a single row per distinct mail text (first occurrence wins).
train_data = train_data.drop_duplicates(subset=['mail'])

# --- Split ------------------------------------------------------------------
mail_data = train_data['mail']
label_data = train_data['label']
mail_test = test_data['mail']

# Hold out 20% of the labelled mails for cross-validation; the split is
# stratified on the label and seeded so it is reproducible.
split_options = dict(test_size=0.2, random_state=0, stratify=label_data)
mail_train, mail_cv, label_train, label_cv = train_test_split(
    mail_data, label_data, **split_options
)

# --- Tokenise ---------------------------------------------------------------
# The vocabulary is learned from the training portion only; the held-out
# mails are encoded with that same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(mail_train)

word_to_index = tokenizer.word_index
# +1 because Keras Tokenizer word indices start at 1 (0 is left for padding).
vocab_size = 1 + len(word_to_index)

mail_train_encoded = tokenizer.texts_to_sequences(mail_train)
mail_cv_encoded = tokenizer.texts_to_sequences(mail_cv)

# --- Pad --------------------------------------------------------------------
# Fixed sequence length fed to the model.
# NOTE(review): 2917 is presumably the longest encoded training mail - confirm.
max_len = 2917
mail_train_padded = pad_sequences(mail_train_encoded, maxlen=max_len)
mail_cv_padded = pad_sequences(mail_cv_encoded, maxlen=max_len)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, MaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# ---------------------------------------------------------------------------
# 1D-CNN binary mail classifier: build, train, evaluate on the held-out CV
# split, then write predictions for the test set to a submission CSV.
#
# Relies on names produced by the data-preparation cell: vocab_size, max_len,
# tokenizer, mail_train_padded, label_train, mail_cv_padded, label_cv,
# mail_test.
# ---------------------------------------------------------------------------

# --- Hyper-parameters -------------------------------------------------------
embedding_dim = 32
dropout_ratio = 0.3
num_filters = 32
kernel_size = 5
# Width of the fully connected layers. The previous width of 1 squeezed all
# pooled features into a single ReLU scalar that dropout zeroed 30% of the
# time, which can leave the network stuck predicting a constant.
hidden_units = 32
num_hidden_layers = 4
decision_threshold = 0.5
checkpoint_path = 'best_model.h5'

# --- Model ------------------------------------------------------------------
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(dropout_ratio))
model.add(Conv1D(num_filters, kernel_size, strides=1, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(dropout_ratio))
for _ in range(num_hidden_layers):
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dropout(dropout_ratio))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# --- Training ---------------------------------------------------------------
# Stop once val_loss has not improved for 10 epochs, and keep the weights of
# the epoch with the highest val_acc on disk.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
mc = ModelCheckpoint(checkpoint_path, monitor='val_acc', mode='max', verbose=1, save_best_only=True)

history = model.fit(
    mail_train_padded,
    label_train,
    epochs=100,
    batch_size=64,
    callbacks=[es, mc],
    validation_split=0.2,
)

# After fit() the in-memory weights belong to the LAST epoch, which can be up
# to `patience` epochs past the best one. Restore the checkpointed weights so
# that evaluation and prediction use the best model rather than the last.
model.load_weights(checkpoint_path)

# --- Evaluation on the held-out CV split ------------------------------------
print("\n cv acc : %.4f" % (model.evaluate(mail_cv_padded, label_cv)[1]))

# --- Prediction on the test set ---------------------------------------------
# Encode and pad the test mails exactly like the training mails (same
# tokenizer, same max_len) so the model sees identically preprocessed input.
mail_test_encoded = tokenizer.texts_to_sequences(mail_test)
mail_test_padded = pad_sequences(mail_test_encoded, maxlen=max_len)

probabilities = model.predict(mail_test_padded, batch_size=64)
# predict() returns shape (n_samples, 1); threshold in one vectorised step and
# flatten to a 1-D integer array suitable for a DataFrame column.
prediction = (probabilities > decision_threshold).astype(int).ravel()

# --- Submission -------------------------------------------------------------
# result.csv is read as the submission template; only its 'label' column is
# overwritten.
submission = pd.read_csv('./result.csv', encoding='utf-8')
assert len(submission) == len(prediction), "submission template and test predictions differ in length"
submission['label'] = prediction
submission.to_csv('./submission_3.csv', index=False)