# RNN을 이용한 SMS Spam 분류

In [1]:

import numpy as np
import tensorflow as tf

# Fix the NumPy and TensorFlow random seeds to one shared value; the same
# `seed` is reused further down as the random_state of the train/test split.
seed = 2021
tf.random.set_seed(seed)
np.random.seed(seed)

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### 데이터 전처리

In [6]:
import pandas as pd

# Read the SMS dataset (decoded as latin1) and preview its first three rows.
df = pd.read_csv(
    'spam.csv',
    encoding='latin1',
)
df.iloc[:3]

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [7]:
# Keep only the label (v1) and the message text (v2): the three trailing
# "Unnamed" columns are dropped in one call instead of three `del` statements.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

# Encode the label explicitly: ham -> 0, spam -> 1.
# Series.map is used instead of Series.replace because replace() on an object
# column depends on implicit downcasting to int64, which pandas >= 2.2
# deprecates (FutureWarning) and which would leave the labels non-numeric
# once that downcasting is removed.
df['v1'] = df['v1'].map({'ham': 0, 'spam': 1})

# map() produces NaN for any label outside the mapping; fail loudly here
# rather than letting an unexpected label reach training.
assert df['v1'].notna().all(), 'unexpected label in column v1'
df['v1'] = df['v1'].astype('int64')
df.head(3)

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [8]:
# Summarise the cleaned frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   int64 
 1   v2      5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [9]:
# Count missing values per column
df.isna().sum()

v1    0
v2    0
dtype: int64

In [10]:
# Check for duplicated messages: number of distinct texts in v2
# (compare against the total row count reported by df.info()).
df['v2'].nunique()

5169

In [11]:
# Remove duplicated messages, keeping the first occurrence of each text
df = df.drop_duplicates(subset='v2', keep='first')

In [12]:
# Class distribution after de-duplication (0 = ham, 1 = spam)
df['v1'].value_counts()

0    4516
1     653
Name: v1, dtype: int64

In [13]:
# Split the frame into message texts (X) and their 0/1 labels (y).
X, y = df['v2'].values, df['v1'].values

In [14]:
# Show the third raw message (index 2) as a sample before any text cleaning.
X[2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [15]:
# Strip punctuation and convert to lowercase
from string import punctuation
def preprocessing(s):
    """Normalise one message before tokenisation.

    Non-ASCII characters are discarded, every character listed in
    ``string.punctuation`` is deleted (not replaced by a space, so words
    joined only by punctuation are merged), and the rest is lower-cased.

    Args:
        s: The raw message text.

    Returns:
        The cleaned, lower-cased ASCII string.
    """
    # Round-trip through bytes so that anything outside ASCII is dropped.
    ascii_only = s.encode('utf8').decode('ascii', 'ignore')
    # A deletion-only translation table removes all punctuation in one pass.
    without_punct = ascii_only.translate(str.maketrans('', '', punctuation))
    return without_punct.lower()

In [16]:
# Clean every message with preprocessing() and preview the first three results.
X_punct = list(map(preprocessing, X))
X_punct[:3]

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s']

In [18]:
# Build the vocabulary from the cleaned messages and report its size.
t = Tokenizer()
t.fit_on_texts(X_punct)
# One slot more than the number of learned words -- presumably to leave room
# for index 0, which pad_sequences uses as filler; confirm against Keras docs.
vocab_size = 1 + len(t.word_index)
print(f'단어 집합의 크기: {vocab_size}')

단어 집합의 크기: 9480


In [20]:
# Convert every cleaned message into a list of integer word indices and
# print the third message (index 2) as a sample.
sequences = t.texts_to_sequences(X_punct)
print(sequences[2])

[54, 508, 8, 22, 4, 959, 960, 2, 217, 2566, 1291, 664, 2567, 2568, 268, 2569, 71, 2566, 2, 2570, 2, 336, 508, 3838, 84, 3839, 424, 3840]


In [22]:
# Length of the longest tokenised message; used below as the padding target
# and as the model's input length.
max_len = max(map(len, sequences))
max_len

171

In [25]:
# Pad the whole dataset so that every sequence has length max_len (171)
data = pad_sequences(sequences, maxlen=max_len)
data.shape

(5169, 171)

In [26]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the padded sequences for testing. The split is stratified
# on the label and seeded with the notebook-wide `seed`.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    stratify=y,
    random_state=seed,
)
X_train.shape, X_test.shape

((4135, 171), (1034, 171))

### 모델 정의/설정/학습
- Embedding: 32차원
- SimpleRNN: 32 노드

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [29]:
# Binary spam classifier:
#   token ids -> 32-dimensional embeddings -> SimpleRNN(32) -> sigmoid unit.
# The input shape is declared with an explicit Input object rather than the
# Embedding `input_length` argument: Keras 3 deprecates `input_length` (it
# only triggers a warning there and no longer fixes the input shape), while an
# Input as the first Sequential entry builds the model up front in both
# Keras 2 and Keras 3, so model.summary() can report concrete output shapes.
model = Sequential([
    tf.keras.Input(shape=(max_len,)),
    Embedding(vocab_size, 32),
    SimpleRNN(32),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 171, 32)           303360    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 32)                2080      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 305,473
Trainable params: 305,473
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Configure training: binary cross-entropy loss, RMSprop optimizer,
# accuracy reported as the metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)