In [None]:
# Mount Google Drive at /content/gdrive so files stored there are reachable
# from this Colab session.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Change the working directory to the project folder on the mounted Drive;
# the data and result files are opened with paths relative to it.
%cd /content/gdrive/MyDrive/2021_2학기/기계학습

In [None]:
# Print the current working directory (sanity check).
!pwd

In [None]:
# List the contents of the current working directory.
!ls

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the raw CSVs. Training rows whose 'mail' text repeats an earlier row are
# dropped (first occurrence kept, original index labels preserved).
train_data = pd.read_csv('./data/train.csv', encoding='utf-8').drop_duplicates(subset=['mail'])
test_data = pd.read_csv('./data/test.csv', encoding='utf-8')

# Column views used by the rest of the notebook.
mail_data = train_data['mail']
label_data = train_data['label']
mail_test = test_data['mail']

# Hold out 20% of the training rows as a validation ("cv") split, stratified on
# the label and seeded so the split is repeatable.
mail_train, mail_cv, label_train, label_cv = train_test_split(
    mail_data,
    label_data,
    test_size=0.2,
    random_state=0,
    stratify=label_data,
)

######
def vectorize_sequences(sequences, dimension=41000):
    """Turn integer-id sequences into a dense multi-hot matrix.

    Args:
        sequences: Sized iterable of integer-index collections (one per sample).
        dimension: Number of columns in the output; every index in
            ``sequences`` must be a valid column index for this width.

    Returns:
        A ``(len(sequences), dimension)`` float array that is 1.0 at
        ``[i, j]`` when ``j`` occurs in ``sequences[i]`` and 0.0 elsewhere.
        Repeated indices within a sample still yield a single 1.0.

    Raises:
        IndexError: If an index falls outside ``dimension`` columns.
    """
    shape = (len(sequences), dimension)
    multi_hot = np.zeros(shape)
    for row_idx, token_ids in enumerate(sequences):
        # Fancy-index assignment: every listed column of this row becomes 1.
        multi_hot[row_idx, token_ids] = 1.0
    return multi_hot
#####

# Width of the multi-hot vectors built below. vectorize_sequences() raises
# IndexError for any token id >= this width, so the tokenizer is capped to the
# same number of words: ids it emits are then always < NUM_FEATURES. When the
# fitted vocabulary is already smaller than the cap, nothing changes.
NUM_FEATURES = 41000

# Fit the vocabulary on the training mails only; the cv and test mails are
# encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=NUM_FEATURES)
tokenizer.fit_on_texts(mail_train)
word_to_index = tokenizer.word_index
vocab_size = len(word_to_index) + 1

# Integer-id sequences for each split.
mail_train_encoded = tokenizer.texts_to_sequences(mail_train)
mail_cv_encoded = tokenizer.texts_to_sequences(mail_cv)
mail_test_encoded = tokenizer.texts_to_sequences(mail_test)

# Longest sequence per split. Only needed by the pad_sequences variant of this
# pipeline (e.g. pad_sequences(mail_train_encoded, maxlen=max_len)); the dense
# model uses the multi-hot vectors below instead.
max_len = max(len(l) for l in mail_train_encoded)
max_len_test = max(len(l) for l in mail_test_encoded)

# NOTE: the "*_padded" names are kept for the cells that consume them, but the
# arrays hold multi-hot vectors of width NUM_FEATURES, not padded sequences.
mail_train_padded = vectorize_sequences(mail_train_encoded, NUM_FEATURES)
mail_cv_padded = vectorize_sequences(mail_cv_encoded, NUM_FEATURES)
mail_test_padded = vectorize_sequences(mail_test_encoded, NUM_FEATURES)
#####

In [None]:
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, MaxPooling1D, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall of the given tensors: true positives / actual positives.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to [0, 1] and rounded before being summed, so the counts are
    taken over 0/1 values. ``K.epsilon()`` is added to the denominator to
    avoid dividing by zero when there are no positives.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    n_hits = K.sum(hit_mask)
    n_actual = K.sum(actual_mask)
    return n_hits / (n_actual + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of the given tensors: true positives / predicted positives.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to [0, 1] and rounded before being summed, so the counts are
    taken over 0/1 values. ``K.epsilon()`` is added to the denominator to
    avoid dividing by zero when nothing is predicted positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    n_hits = K.sum(hit_mask)
    n_predicted = K.sum(predicted_mask)
    return n_hits / (n_predicted + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` in the denominator keeps the result finite when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

##### model CNN
# embedding_dim = 32
# dropout_ratio = 0.5
# num_filters = 32
# kernel_size = 5

# model = Sequential()
# model.add(Embedding(vocab_size, embedding_dim))
# model.add(Dropout(dropout_ratio))
# model.add(Conv1D(num_filters, kernel_size, strides=1, padding='valid', activation='relu'))
# model.add(GlobalMaxPooling1D())
# model.add(Dropout(dropout_ratio))
# model.add(Dense(1, activation='sigmoid'))
# model.summary()
#####
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
##### model neural network
# Fully connected binary classifier: 41000-wide input -> 8 -> 8 -> 1 (sigmoid).
model = Sequential()
model.add(Dense(8, activation='relu', input_shape=(41000,)))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['acc', f1_m, precision_m, recall_m],
)
# rmsprop, adam

# Stop once val_loss has not improved for 50 epochs, and keep on disk the
# model from the epoch with the highest validation F1.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
mc = ModelCheckpoint('best_model.h5', monitor='val_f1_m', mode='max', verbose=1, save_best_only=True)

# The last 23% of the training rows are held out by Keras as the validation
# set that both callbacks monitor.
history = model.fit(mail_train_padded, label_train, epochs=100, batch_size=64, callbacks=[es, mc], validation_split=0.23)

# After fit() the model holds the weights of the LAST epoch, which can be many
# epochs past the best one. Reload the checkpoint so that everything downstream
# (evaluation, test predictions) uses the best weights that were saved.
# load_weights is used instead of load_model so the custom metric functions do
# not have to be passed as custom_objects.
if tf.io.gfile.exists('best_model.h5'):
    model.load_weights('best_model.h5')
# validation_split=0.2

# Score the model on the held-out cv split. evaluate() returns the loss followed
# by the metrics in the order they were passed to compile().
loss, accuracy, f1_score, precision, recall = model.evaluate(mail_cv_padded, label_cv, verbose=0)

# Each value is printed on its own line, under its label.
for metric_label, metric_value in (
    ('loss: ', loss),
    ('accuracy: ', accuracy),
    ('f1_score: ', f1_score),
    ('precision: ', precision),
    ('recall: ', recall),
):
    print(metric_label)
    print(metric_value)

# Second pass with an explicit batch size (shows the Keras progress bar);
# element 1 of the returned list is the accuracy.
cv_scores = model.evaluate(mail_cv_padded, label_cv, batch_size=64)
print("\n cv acc : %.4f" % (cv_scores[1]))
#batch_size=100

# Sigmoid output strictly above this value is labelled 1, otherwise 0.
DECISION_THRESHOLD = 0.6

# Probabilities for the test mails, one row per mail.
prediction = model.predict(mail_test_padded, batch_size=64)

# Threshold the whole array at once instead of looping over it element by
# element in Python; the result keeps the original dtype and shape, holding
# only 0s and 1s.
prediction = (prediction > DECISION_THRESHOLD).astype(prediction.dtype)

# result.csv is used as the submission template: its 'label' column is
# overwritten with the integer predictions and written to a new file.
submission = pd.read_csv('./result.csv', encoding = 'utf-8')
submission['label'] = prediction.ravel().astype(int)
submission.to_csv('./submission_FC1.csv', index=False)
submission.info()

# One figure per tracked quantity, each showing the training curve and the
# validation curve against the 1-based epoch number.
epochs = range(1, len(history.history['acc']) + 1)

for train_key, val_key, figure_title, y_label in (
    ('loss', 'val_loss', 'model loss', 'loss'),
    ('acc', 'val_acc', 'model acc', 'acc'),
):
    plt.plot(epochs, history.history[train_key])
    plt.plot(epochs, history.history[val_key])
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

# from sklearn.metrics import classification_report
# print(classification_report(label_train, pred))