# TF-IDF 생성 후 심층 신경망을 이용한 이메일 분류

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Load the 'train' and 'test' subsets of the 20 newsgroups dataset, in that order.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=subset_name) for subset_name in ('train', 'test')
)

In [3]:
# Per split: the message texts become the inputs (x) and the category
# indices (positions in target_names) become the labels (y).
x_train, y_train = newsgroups_train.data, newsgroups_train.target
x_test, y_test = newsgroups_test.data, newsgroups_test.target

In [4]:
# Show every category name, then the first training message together with
# its numeric label and the category name that label indexes.
# sep='\n' puts each argument on its own line, exactly as separate print calls would.
print('List of all 20 categories', newsgroups_train.target_names, sep='\n')
print('\n')
print('Sample Email:', x_train[0], sep='\n')
print(
    'Sample Target Category:',
    y_train[0],
    newsgroups_train.target_names[y_train[0]],
    sep='\n',
)

List of all 20 categories
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


Sample Email:
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is 

In [5]:
# 데이터 전처리에 사용
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd
from nltk import pos_tag
from nltk.stem import PorterStemmer

In [None]:
def preprocessing(text):
    # 단어를 분할해 각 문자에 표준 문장부호가 포함되어 있는지 확인
    # 표준 문장부호가 있다면 빈칸으로 변경
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
    
    # 문장을 공백에 따라 단어로 토큰화하고 추가 단계를 적용하기 위한 리스트로 묶는다.
    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
    # 모든 문자를 소문자로 변환해 말뭉치에서 중복을 제거한다.
    tokens = [word.lower() for word in tokens]
    # 불용어 제거
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    # 3글자 이상의 단어들만을 추출
    tokens = [word for word in tokens if len(word)>=3]
    # 단어에서 추가로 접미사가 나오는 단어에 PorterStemmer를 사용해 스테밍을 적용한다.
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    # 품사 태깅
    tagged_corpus = pos_tag(tokens)
    
    #pos_tag 함수는 명사에 대한 네가지 형태와 동사의 여섯가지 형태로 품사를 반환한다.
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG',' VBN','VBP','VBZ']
    lemmatizer = WordNetLemmatizer()
    
    #pos_tag