In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so that
# files stored on Drive can be read from this notebook.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Change the working directory to a folder on the mounted Drive; the data
# cells below read ./train.csv and ./test.csv relative to this location.
%cd /content/gdrive/MyDrive/2021_2학기/기계학습

In [None]:
# Print the current working directory to confirm where relative paths resolve.
!pwd

In [None]:
# List the working directory's contents (train.csv and test.csv are read from
# here further down).
!ls

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load both CSV splits from the working directory (decoded as latin1) and
# report how many rows each one contains.
train_data = pd.read_csv('./train.csv', encoding='latin1')
print('train sample 수 :', train_data.shape[0])

test_data = pd.read_csv('./test.csv', encoding='latin1')
print('test sample 수 :', test_data.shape[0])

In [None]:
# Preview the first five rows of the training frame.
train_data.head(5)

In [None]:
# Preview the first five rows of the test frame.
test_data.head(5)

In [None]:
def _inspect_and_dedupe(frame, split_name):
    """Print a quality summary for one split and return it without duplicate mails.

    Args:
        frame: DataFrame that has a 'mail' column.
        split_name: Prefix used in the printed messages (e.g. 'train').

    Returns:
        A new DataFrame keeping the first row for each distinct 'mail' value.
        The input frame itself is left unmodified.
    """
    frame.info()
    print(f'{split_name} Null sample : ', frame.isnull().values.any())
    print(f'{split_name} is unique subject :', frame['mail'].nunique())
    # Return a new frame instead of mutating with inplace=True, so the
    # rebinding of the notebook variable is explicit at the call site.
    deduped = frame.drop_duplicates(subset=['mail'])
    print(f'{split_name} sample number by dropping dup : ', len(deduped))
    return deduped


# Same inspection + de-duplication for both splits (previously copy-pasted).
train_data = _inspect_and_dedupe(train_data, 'train')

test_data = _inspect_and_dedupe(test_data, 'test')

In [None]:
# Class balance of the training labels: bar chart, per-label count table and
# percentage of each class.
# NOTE(review): the [0] / [1] lookups assume the label column holds the
# integers 0 (ham) and 1 (spam) — confirm against the CSV.
label_counts = train_data['label'].value_counts()
label_counts.plot(kind='bar')

count_table = train_data.groupby('label').size().reset_index(name='count')
print(count_table)

n_train_rows = len(train_data)
print(f'ham prop = {round(label_counts[0]/n_train_rows * 100,3)}%')
print(f'spam prop = {round(label_counts[1]/n_train_rows * 100,3)}%')

In [None]:
def _class_share(labels, cls):
    """Return the percentage (rounded to 3 decimals) of `labels` equal to `cls`."""
    return round(labels.value_counts()[cls]/len(labels)*100,3)


# Hold out 20% of the training mails as a cross-validation set, stratified on
# the label so both parts keep the same class proportions.
mail_data = train_data['mail']
label_data = train_data['label']
mail_train, mail_cv, label_train, label_cv = train_test_split(
    mail_data,
    label_data,
    test_size=0.2,
    random_state=0,
    stratify=label_data,
)

# Report the class proportions of each part to verify the stratification.
print('************* train vs cv **************')
print(f'train ham = {_class_share(label_train, 0)}%')
print(f'train spam = {_class_share(label_train, 1)}%')

print(f'cv ham = {_class_share(label_cv, 0)}%')
print(f'cv spam = {_class_share(label_cv, 1)}%')

In [None]:
# Build the vocabulary from the training mails only (the cv split is not
# used for fitting), then convert each training mail into a list of word
# indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(mail_train)
mail_train_encoded = tokenizer.texts_to_sequences(mail_train)

In [None]:
# Show the first five encoded training mails as a sanity check.
print(mail_train_encoded[:5])

In [None]:
# Keep a handle on the tokenizer's word -> index mapping; its size is used
# below to derive the vocabulary size.
word_to_index = tokenizer.word_index
# NOTE(review): this prints the entire mapping, which can be very long for a
# real corpus — consider previewing only a few entries.
print(word_to_index)

In [None]:
# Size of the embedding input dimension: number of known words plus one.
# The +1 presumably accounts for index 0, which the tokenizer does not assign
# to any word and which padding uses — TODO confirm.
n_known_words = len(word_to_index)
vocab_size = n_known_words + 1
print('word size: {}'.format(vocab_size))

In [None]:
# Token-count statistics of the encoded training mails. These are the lengths
# that matter when choosing the padding length for the model input.
seq_lengths = [len(seq) for seq in mail_train_encoded]
print('메일의 최대 길이 : %d' % max(seq_lengths))
print('메일의 평균 길이 : %f' % (sum(seq_lengths)/len(seq_lengths)))

# Plot the same token counts that the max/mean above describe. The previous
# version iterated over the raw `mail_data` strings, which histogrammed
# character counts (of train + cv together) instead of token counts.
plt.hist(seq_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Pad every training sequence to the length of the longest encoded training
# mail. Derived from the data instead of a hard-coded number (previously
# 2917) so it stays correct if the dataset or the split changes.
max_len = max(len(seq) for seq in mail_train_encoded)
mail_train_padded = pad_sequences(mail_train_encoded, maxlen = max_len)
print("train data shape: ", mail_train_padded.shape)

In [None]:
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
from tensorflow.keras.models import Sequential

In [None]:
# Binary spam classifier: 32-dim word embedding -> SimpleRNN with 32 units ->
# single sigmoid output.
model = Sequential([
    Embedding(vocab_size, 32),
    SimpleRNN(32),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Train for 6 epochs, holding out 20% of the padded training data for
# validation; the per-epoch metrics are kept in `history` for plotting.
history = model.fit(
    mail_train_padded,
    label_train,
    epochs=6,
    batch_size=64,
    validation_split=0.2,
)

In [None]:
# Encode and pad the held-out cv mails with the tokenizer and length used for
# training, then report the model's accuracy on them.
mail_cv_encoded = tokenizer.texts_to_sequences(mail_cv)
mail_cv_padded = pad_sequences(mail_cv_encoded, maxlen = max_len)

# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the accuracy.
cv_scores = model.evaluate(mail_cv_padded, label_cv)
print("\n cv acc : %.4f" % cv_scores[1])

In [None]:
# Training vs. validation loss per epoch, taken from the fit history.
train_loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(history.history['acc']) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, train_loss)
ax.plot(epochs, val_loss)
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()