In [None]:
import numpy as np
import pandas as pd
import codecs
import io
import matplotlib.pyplot as plt
import urllib.request
from sklearn.model_selection import train_test_split
import matplotlib.font_manager as fm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from nltk.corpus import stopwords
# Scan for installed system fonts (the returned list is not used here).
fm.findSystemFonts()
# Select the NanumGothicCoding font for plots and render the minus sign with
# an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams.update({
    'font.family': ["NanumGothicCoding"],
    "axes.unicode_minus": False,
})
# GPU environment check: report the physical, then the logical, GPU devices
# TensorFlow can see. (A hard "GPU must be available" assertion is left disabled.)
for list_gpu_devices in (tf.config.list_physical_devices, tf.config.list_logical_devices):
    print(list_gpu_devices('GPU'))

In [None]:
# The labelled data only needs to be in (text, label) form.
import pandas as pd
spam = pd.read_csv('spam.csv')
# Row count of the labelled set.
print('데이터수:', spam.shape[0])

In [None]:
# The unlabeled submission file is in (id, text) form.
test_spam = pd.read_csv(filepath_or_buffer='spam_test_text.csv')

In [None]:
# Remove submission rows that contain a missing value, then display the
# per-column count of nulls that remain.
test_spam = test_spam.dropna(axis='index')
test_spam.isna().sum()

In [None]:
# Encode the string labels as integers: 'spam' -> 1, anything else (ham) -> 0.
# Later cells read class 0 as normal mail and class 1 as spam, and the
# submission helper maps a predicted 0 to 'ham', so spam has to be the 1 class.
# The dtype guard keeps this cell idempotent: re-running it on already-encoded
# integers would otherwise collapse every row into a single class.
if not pd.api.types.is_numeric_dtype(spam['label']):
    spam['label'] = (spam['label'] == 'spam').astype(int)

In [None]:
# Drop rows with any missing value from the labelled frame, then display the
# per-column count of nulls that remain.
# The labelled frame in this notebook is `spam`; no `data_train` is defined.
spam = spam.dropna(axis=0)
spam.isnull().sum()

In [None]:
# Bar chart of how many rows fall into each encoded class.
spam['label'].value_counts().plot.bar()

In [None]:
# Tabulate the number of rows per encoded label.
print('정상 메일과 스팸 메일의 개수')
label_counts = spam.groupby('label').size().reset_index(name='count')
print(label_counts)

In [None]:
# Separate the mail bodies (inputs) from the encoded labels (targets).
X_spam, y_spam = spam['text'], spam['label']
print('메일 본문의 개수: {}'.format(X_spam.shape[0]))
print('레이블의 개수: {}'.format(y_spam.shape[0]))

In [None]:
# Submission set: mail bodies plus their row ids. Despite its name,
# y_test_spam carries ids, not labels (the submission file has none).
X_test_spam, y_test_spam = test_spam['text'], test_spam['id']
print('메일 본문의 개수: {}'.format(X_test_spam.shape[0]))
print('레이블의 개수: {}'.format(y_test_spam.shape[0]))

In [None]:
# Percentage of each encoded class (0 and 1) in the full labelled set.
overall_counts = y_spam.value_counts()
overall_total = len(y_spam)
print(f'정상 메일 = {round(overall_counts[0]/overall_total * 100,3)}%')
print(f'스팸 메일 = {round(overall_counts[1]/overall_total * 100,3)}%')

In [None]:
# Hold out 20% of the labelled data; stratify on the label so both parts keep
# the same class ratio, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_spam,
    y_spam,
    test_size=0.2,
    random_state=2022,
    stratify=y_spam,
)

In [None]:
# Percentage of each encoded class (0 and 1) in the training split.
print('--------훈련 데이터의 비율-----------')
train_counts = y_train.value_counts()
train_total = len(y_train)
print(f'정상 메일 = {round(train_counts[0]/train_total * 100,3)}%')
print(f'스팸 메일 = {round(train_counts[1]/train_total * 100,3)}%')

In [None]:
# Percentage of each encoded class (0 and 1) in the hold-out split.
print('--------테스트 데이터의 비율-----------')
holdout_counts = y_test.value_counts()
holdout_total = len(y_test)
print(f'정상 메일 = {round(holdout_counts[0]/holdout_total * 100,3)}%')
print(f'스팸 메일 = {round(holdout_counts[1]/holdout_total * 100,3)}%')

In [None]:
# Cap on the vocabulary size the tokenizer uses when encoding texts.
TOP_K = 20000
tokenizer = Tokenizer(num_words=TOP_K)
# Learn the vocabulary from the training split only.
tokenizer.fit_on_texts(X_train)
# Encode the training texts and the unlabeled submission texts.
# Note: X_test_encoded comes from X_test_spam (submission set), not from the
# hold-out X_test produced by train_test_split.
X_train_encoded, X_test_encoded = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test_spam)
)
print(X_train_encoded[:5])

In [None]:
# Display the first five encoded training sequences.
# (Disabled experiment: test_spam['text'] = X_test_encoded)
X_train_encoded[:5]

In [None]:
# Display the first five raw training texts, to compare with their encodings.
X_train.head(5)

In [None]:
# Display the submission-set frame read from spam_test_text.csv.
test_spam

In [None]:
# Word -> integer index mapping learned by the tokenizer.
word_to_index = tokenizer.word_index
# Printing the whole mapping floods the notebook output, so report its size
# and show only the first 20 entries.
print(len(word_to_index))
print(list(word_to_index.items())[:20])

In [None]:
# Words seen fewer than `threshold` times in the training texts count as rare.
threshold = 2

# Per-word occurrence counts collected by the tokenizer, and the rare subset.
word_freqs = list(tokenizer.word_counts.values())
rare_word_freqs = [freq for freq in word_freqs if freq < threshold]

total_cnt = len(word_to_index)    # number of distinct words
rare_cnt = len(rare_word_freqs)   # number of distinct rare words
total_freq = sum(word_freqs)      # total occurrences of all words
rare_freq = sum(rare_word_freqs)  # total occurrences of the rare words

print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print("단어 집합(vocabulary)에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

In [None]:
# One slot more than the number of known words: the Embedding layer is sized
# with this value, and padded positions use index 0.
vocab_size = 1 + len(word_to_index)
print('단어 집합의 크기: {}'.format(vocab_size))

In [None]:
# Token counts of the encoded training mails. The printed statistics and the
# histogram use the same values, so the plot can guide the choice of the
# padding length. Previously the histogram measured character lengths of the
# raw texts, which is a different unit.
train_lengths = [len(sample) for sample in X_train_encoded]
print('메일의 최대 길이 : %d' % max(train_lengths))
print('메일의 평균 길이 : %f' % (sum(train_lengths)/len(train_lengths)))
plt.hist(train_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Bring every encoded sequence to the same fixed length for the model input.
max_len = 50
X_train_padded, X_test_padded = (
    pad_sequences(encoded, maxlen=max_len)
    for encoded in (X_train_encoded, X_test_encoded)
)
print("훈련 데이터의 크기(shape):", X_train_padded.shape)
print("테스트 데이터의 크기(shape):", X_test_padded.shape)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense, LSTM,GRU,Bidirectional

embedding_dim = 64
hidden_units = 256

# Embedding -> two stacked GRUs that emit full sequences -> SimpleRNN that
# reduces to one vector -> single sigmoid unit for the binary decision.
# (LSTM / Bidirectional(LSTM) variants were tried here and are disabled.)
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    GRU(64, return_sequences=True, unroll=False),
    GRU(128, return_sequences=True, unroll=False),
    SimpleRNN(hidden_units, dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.summary()

In [None]:
# Train for 20 epochs in batches of 128, holding out 20% of the training rows
# for validation; the returned history feeds the loss plot.
history = model.fit(
    X_train_padded,
    y_train,
    batch_size=128,
    epochs=20,
    validation_split=0.2,
)

In [None]:
# Encode and pad the labelled hold-out split the same way as the training
# data, then report accuracy (second entry of evaluate's [loss, acc] result).
X_test_encoded2 = tokenizer.texts_to_sequences(X_test)
X_test_padded2 = pad_sequences(X_test_encoded2, maxlen=max_len)
holdout_metrics = model.evaluate(X_test_padded2, y_test)
print("\n 테스트 정확도: %.4f" % (holdout_metrics[1]))

In [None]:
# Training vs. validation loss per epoch (epochs numbered from 1).
epochs = range(1, len(history.history['acc']) + 1)
for curve_key in ('loss', 'val_loss'):
    plt.plot(epochs, history.history[curve_key])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
def maek_submission(pred, ids=None):
    """Convert predicted probabilities into an ``id``/``label`` submission frame.

    Args:
        pred: Array-like of predicted probabilities (any shape); it is
            flattened to one dimension, one value per row.
        ids: Optional identifiers, one per prediction. When omitted, the
            notebook-level ``test_spam['id']`` column is used, which is what
            the original single-argument call relied on.

    Returns:
        pandas.DataFrame with columns ``id`` and ``label``, where ``label`` is
        ``'spam'`` for probabilities >= 0.5 and ``'ham'`` otherwise. If ``ids``
        is a Series its index is preserved, so the result can be aligned back
        onto the frame the ids came from.

    Raises:
        ValueError: If the number of predictions differs from the number of ids.
    """
    if ids is None:
        ids = test_spam['id']

    probabilities = np.asarray(pred).reshape(-1)
    if len(probabilities) != len(ids):
        raise ValueError(
            'number of predictions (%d) does not match number of ids (%d)'
            % (len(probabilities), len(ids))
        )

    # Single thresholding pass: below 0.5 is class 0 ('ham'), otherwise 'spam'.
    labels = np.where(probabilities >= 0.5, 'spam', 'ham')

    return pd.DataFrame({'id': ids, 'label': labels})

In [None]:
# Predicted probabilities for the padded submission inputs.
pred = model.predict(x=X_test_padded)

In [None]:
# Turn the raw probabilities into the id/label submission frame.
# NOTE: this rebinds `pred` from the probability array to a DataFrame, so the
# predict cell must be re-run before this cell can be run again.
pred = maek_submission(pred=pred)

In [None]:
# Display the shape of the padded submission inputs that were passed to model.predict.
X_test_padded.shape

In [None]:
# Attach the predicted labels to the submission frame (pandas aligns on index).
submission_labels = pred['label']
test_spam['label'] = submission_labels

In [None]:
# Write only the id and label columns, without the index, as the submission file.
submission_columns = ['id', 'label']
test_spam.loc[:, submission_columns].to_csv(
    '/aihub/data/spam_submission.csv',
    index=False,
)