In [9]:
from sklearn.datasets import fetch_20newsgroups

# Load the training and test subsets of the 20 newsgroups dataset.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# Documents (.data) and their class labels (.target) for each subset.
x_train, y_train = newsgroups_train.data, newsgroups_train.target
x_test, y_test = newsgroups_test.data, newsgroups_test.target

category_names = newsgroups_train.target_names
first_label = y_train[0]

# Show the category names, then one sample document with its label
# (first as the stored label value, then as the category name).
print("20개 카테고리 전체 목록:")
print(category_names)
print("\n")
print("샘플 이메일")
print(x_train[0])
print("샘플 타깃 카테고리:")
print(first_label)
print(category_names[first_label])

20개 카테고리 전체 목록:
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


샘플 이메일
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd
from nltk import pos_tag
from nltk.stem import PorterStemmer

In [11]:
def preprocessing(text):
    """Normalize one raw document into a single space-separated token string.

    Steps, in order:
      1. Replace every punctuation character with a space and collapse
         runs of whitespace.
      2. Tokenize with NLTK and lowercase each token.
      3. Drop English stop words and tokens shorter than three characters.
      4. Stem each token with the Porter stemmer.
      5. POS-tag the stemmed tokens.
      6. Lemmatize with WordNet: as a verb when the tag is a verb tag,
         as a noun for every other tag.

    Args:
        text: Raw document text.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Set membership is constant-time; the original scanned the punctuation
    # string for every character of the document.
    punctuation = set(string.punctuation)
    text2 = " ".join("".join(" " if ch in punctuation else ch for ch in text).split())

    tokens = [word.lower()
              for sent in nltk.sent_tokenize(text2)
              for word in nltk.word_tokenize(sent)]

    # stopwords.words() returns a list; convert once to a set so that the
    # per-token lookup does not scan the whole list.
    stopwds = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stopwds and len(token) >= 3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # NOTE(review): tagging runs on already-stemmed tokens, which may reduce
    # tagger accuracy; the order is kept so the output stays unchanged.
    tagged_corpus = pos_tag(tokens)

    Verb_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}

    lemmatizer = WordNetLemmatizer()

    def part_lemmatize(token, tag):
        # Noun tags and all remaining tags were both lemmatized as nouns,
        # so only the verb case needs to be distinguished.
        pos = 'v' if tag in Verb_tags else 'n'
        return lemmatizer.lemmatize(token, pos)

    pre_proc_text = " ".join(part_lemmatize(token, tag) for token, tag in tagged_corpus)

    return pre_proc_text
    

In [19]:
# Run the text-cleaning pipeline over every document of each split,
# keeping the original document order.
x_train_preprocessed = [preprocessing(document) for document in x_train]

x_test_preprocessed = [preprocessing(document) for document in x_test]

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, capped at 10000 features, L2-normalized.
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english',
                             max_features=10000, strip_accents='unicode', norm='l2')

# Fit the vocabulary on the training documents only, then reuse it for the
# test documents. .toarray() yields a plain ndarray; .todense() would yield
# the discouraged numpy.matrix type.
x_train_2 = vectorizer.fit_transform(x_train_preprocessed).toarray()
x_test_2 = vectorizer.transform(x_test_preprocessed).toarray()


In [21]:
# deep learning module
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from tensorflow.keras.optimizers import Adadelta,Adam,RMSprop
from keras.utils import np_utils

# Fix the random seeds. Seeding NumPy alone does not cover TensorFlow's own
# generator, so seed both to make runs repeatable as far as seeding allows.
import tensorflow as tf

np.random.seed(1337)
tf.random.set_seed(1337)

nb_classes = 20   # width of the one-hot label vectors and of the output layer
batch_size = 64
nb_epochs = 20

# One-hot encode the integer training labels for categorical cross-entropy.
Y_train = np_utils.to_categorical(y_train, nb_classes)

In [22]:
# Take the input width from the feature matrix instead of hard-coding it, so
# the model still builds if the vectorizer yields a different number of columns.
n_features = x_train_2.shape[1]

# Sequential model: three Dense -> ReLU -> Dropout stages, then a softmax output.
model = Sequential([
    Dense(1000, input_shape=(n_features,)),  # number of output units, input dimension
    Activation('relu'),
    Dropout(0.5),  # fraction of units dropped during training

    Dense(500),
    Activation('relu'),
    Dropout(0.5),

    Dense(50),
    Activation('relu'),
    Dropout(0.5),

    Dense(nb_classes),
    Activation('softmax'),  # output layer
])

model.compile(loss='categorical_crossentropy', optimizer='adam')  # loss function and optimizer

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()  # inspect the model structure


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 1000)              10001000  
                                                                 
 activation_4 (Activation)   (None, 1000)              0         
                                                                 
 dropout_3 (Dropout)         (None, 1000)              0         
                                                                 
 dense_5 (Dense)             (None, 500)               500500    
                                                                 
 activation_5 (Activation)   (None, 500)               0         
                                                                 
 dropout_4 (Dropout)         (None, 500)               0         
                                                                 
 dense_6 (Dense)             (None, 50)               

In [23]:
# Model training: fit on the TF-IDF training features and one-hot labels.
model.fit(
    x_train_2,
    Y_train,
    batch_size=batch_size,
    epochs=nb_epochs,
    verbose=1,
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1dfc7533670>

In [24]:
# Predicted class for each document = column index of the largest model output.
y_train_predclass = model.predict(x_train_2).argmax(axis=1)
y_test_predclass = model.predict(x_test_2).argmax(axis=1)

In [25]:
# Display the predicted class indices for the training documents.
y_train_predclass

array([7, 4, 4, ..., 3, 1, 8], dtype=int64)

In [26]:
# Display the predicted class indices for the test documents.
y_test_predclass

array([ 4,  9,  0, ...,  9, 12, 15], dtype=int64)

In [27]:
# Display the true training labels (newsgroups_train.target) for comparison.
y_train

array([7, 4, 4, ..., 3, 1, 8])

In [29]:
# Display the true test labels (newsgroups_test.target) for comparison.
y_test

array([ 7,  5,  0, ...,  9,  6, 15])

In [31]:
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy on each split, rounded to three decimals.
train_accuracy = round(accuracy_score(y_train, y_train_predclass), 3)
test_accuracy = round(accuracy_score(y_test, y_test_predclass), 3)

print('\n\nDeep Neural Network - Train accuracy: ', train_accuracy)
print('\nDeep Neural Network - Test accuracy:', test_accuracy)

# Per-class precision / recall / F1 for each split.
train_report = classification_report(y_train, y_train_predclass)
print('\nDeep Neural Network - Train Classification Report')
print(train_report)

test_report = classification_report(y_test, y_test_predclass)
print('\nDeep Neural Network - Test Classification Report')
print(test_report)



Deep Neural Network - Train accuracy:  0.999

Deep Neural Network - Test accuracy: 0.807

Deep Neural Network - Train Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       480
           1       1.00      1.00      1.00       584
           2       1.00      1.00      1.00       591
           3       1.00      1.00      1.00       590
           4       1.00      1.00      1.00       578
           5       1.00      1.00      1.00       593
           6       0.99      1.00      1.00       585
           7       1.00      1.00      1.00       594
           8       1.00      1.00      1.00       598
           9       1.00      1.00      1.00       597
          10       1.00      1.00      1.00       600
          11       1.00      1.00      1.00       595
          12       1.00      0.99      1.00       591
          13       1.00      1.00      1.00       594
          14       1.00      1.00      1.00    