# TF-IDF 생성 후 심층 신경망을 이용한 이메일 분류

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [3]:
x_train = newsgroups_train.data
x_test = newsgroups_test.data

y_train = newsgroups_train.target
y_test = newsgroups_test.target

In [4]:
print('List of all 20 categories')
print(newsgroups_train.target_names)
print('\n')
print('Sample Email:')
print(x_train[0])
print('Sample Target Category:')
print(y_train[0])
print(newsgroups_train.target_names[y_train[0]])

List of all 20 categories
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


Sample Email:
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is 

In [5]:
# 데이터 전처리에 사용
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd
from nltk import pos_tag
from nltk.stem import PorterStemmer

In [6]:
def preprocessing(text):
    # 단어를 분할해 각 문자에 표준 문장부호가 포함되어 있는지 확인
    # 표준 문장부호가 있다면 빈칸으로 변경
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
    
    # 문장을 공백에 따라 단어로 토큰화하고 추가 단계를 적용하기 위한 리스트로 묶는다.
    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
    # 모든 문자를 소문자로 변환해 말뭉치에서 중복을 제거한다.
    tokens = [word.lower() for word in tokens]
    # 불용어 제거
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    # 3글자 이상의 단어들만을 추출
    tokens = [word for word in tokens if len(word)>=3]
    # 단어에서 추가로 접미사가 나오는 단어에 PorterStemmer를 사용해 스테밍을 적용한다.
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    # 품사 태깅
    tagged_corpus = pos_tag(tokens)
    
    #pos_tag 함수는 명사에 대한 네가지 형태와 동사의 여섯가지 형태로 품사를 반환한다.
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG',' VBN','VBP','VBZ']
    lemmatizer = WordNetLemmatizer()
    
    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
        
    pre_proc_text = " ".join([prat_lemmatize(token,tag) for token, tag in tagged_corpus])
    
    return pre_proc_text

In [7]:
x_train_preprocessed = []
for i in x_train:
    x_train_preprocessed.append(preprocessing(i))
    
x_test_preproceesed = []
for i in x_test:
    x_test_preproceesed.append(preprocessing(i))
    
# TFIDF 벡터라이저(vectorizer) 구축
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2),stop_words='english', max_features= 10000,strip_accents='unicode',norm='l2')
x_train_2 = vectorizer.fit_transform(x_train_preprocessed).todense()
x_test_2 = vectorizer.transform(x_test_preproceesed).todense()

In [8]:
len(x_train_preprocessed[0]), len(x_train_preprocessed[1])

(391, 566)

In [10]:
x_train_preprocessed[0]

'lerxst wam umd edu thing subject car nntp post host rac3 wam umd edu organ univers maryland colleg park line wonder anyon could enlighten car saw day door sport car look late 60 earli 70 call bricklin door realli small addit front bumper separ rest bodi know anyon tellm model name engin spec year product car make histori whatev info funki look car plea mail thank bring neighborhood lerxst'

In [11]:
x_train_2[1]

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# 딥러닝 모듈
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import Adadelta,Adam,RMSprop
from keras.utils import np_utils

In [17]:
# 하이퍼 파라미터 정의
np.random.seed(1337)
nb_classes = 20
batch_size = 64
nb_epochs = 20

In [18]:
Y_train = np_utils.to_categorical(y_train, nb_classes)

In [20]:
Y_train[0]

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.], dtype=float32)

In [22]:
# 케라스에서의 딥 레이어(심층) 모델 구축
model = Sequential()

model.add(Dense(1000,input_shape = (10000,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(500))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer = 'adam')

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 1000)              10001000  
_________________________________________________________________
activation_4 (Activation)    (None, 1000)              0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 500)               500500    
_________________________________________________________________
activation_5 (Activation)    (None, 500)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 50)               

In [23]:
# 모델 학습
model.fit(x_train_2, Y_train, batch_size=batch_size, epochs = nb_epochs, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fecd73ccaf0>

In [24]:
# 모델 예측
y_train_predclass = model.predict_classes(x_train_2,batch_size=batch_size)
y_test_predclass = model.predict_classes(x_test_2,batch_size=batch_size)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [25]:
from sklearn.metrics import accuracy_score,classification_report
print ("\n\nDeep Neural Network - Train accuracy:"),(round(accuracy_score(y_train,y_train_predclass),3))
print ("\nDeep Neural Network - Test accuracy:"),(round(accuracy_score(y_test,y_test_predclass),3))
print ("\nDeep Neural Network - Train Classification Report")
print (classification_report(y_train,y_train_predclass))
print ("\nDeep Neural Network - Test Classification Report")
print (classification_report(y_test,y_test_predclass))



Deep Neural Network - Train accuracy:

Deep Neural Network - Test accuracy:

Deep Neural Network - Train Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       480
           1       1.00      1.00      1.00       584
           2       1.00      1.00      1.00       591
           3       1.00      1.00      1.00       590
           4       1.00      1.00      1.00       578
           5       1.00      1.00      1.00       593
           6       1.00      1.00      1.00       585
           7       1.00      1.00      1.00       594
           8       1.00      1.00      1.00       598
           9       1.00      1.00      1.00       597
          10       1.00      1.00      1.00       600
          11       1.00      1.00      1.00       595
          12       1.00      1.00      1.00       591
          13       1.00      1.00      1.00       594
          14       1.00      1.00      1.00       593
      