# 20 뉴스그룹 분류

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Fetch the complete 20 Newsgroups corpus (train and test splits together)
# for exploration. A fixed random_state is passed so every run hands the
# loader the same seed.
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all', random_state=2021)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


## 데이터 탐색

In [4]:
# Show which fields the returned dataset object exposes.
news.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
# For comparison, load the iris toy dataset and list its fields as well.
from sklearn.datasets import load_iris
iris = load_iris()
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [7]:
# Names of the newsgroup categories (the class labels).
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [9]:
# Number of posts per class id, listed in ascending class-id order.
(
    pd.Series(news.target)
    .value_counts()
    .sort_index()
)

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

In [10]:
# Total number of posts in the full corpus.
len(news.data)

18846

In [26]:
# Print one raw post (index 1); headers and quoted replies are still present here.
print(news.data[1])

From: kehoe@netcom.com (Thomas David Kehoe)
Subject: Re: How starters work really
Keywords: fluorescent bulb starter neon
Organization: Netcom - Online Communication Services (408 241-9760 guest)
Lines: 35

>>So when you turn on the power, this causes the bulb to work like a neon, 
>
>Imprecise. This description
>
> 1. ignores the role of the ballast,
> 2. misrepresents the heating effects in the starter.
>
>The bimetalic strip cools down immediately after the contacts

I've been thinking of sending into Mad magazine an idea for a 
parody, of those books entitled "How Things Work" that
engineers buy their sons, which explain how engines, elevators,
flourescent lights, etc. work.

The parody would be "How Things Really Work."  Under "Canned
Food", on the left page you'd see the description from 
"How Things Work": gleaming stainless steel equipment
pasteurizing the food to precisely the right temperature,
then sealing the can in an oxygen-free environment, etc.

On the right page you'd 

## 훈련/테스트용 데이터 추출

In [27]:
# Training split, with headers, footers and quoted replies stripped from
# each post so only the body text remains.
train_news = fetch_20newsgroups(
    subset='train',
    random_state=2021,
    remove=('headers', 'footers', 'quotes'),
)
len(train_news.data)

11314

In [28]:
# Test split, loaded with the same header/footer/quote stripping as the
# training split.
test_news = fetch_20newsgroups(
    subset='test',
    random_state=2021,
    remove=('headers', 'footers', 'quotes'),
)
len(test_news.data)

7532

In [29]:
# Inspect one training post (index 1) as loaded with headers, footers and quotes removed.
print(train_news.data[1])

]Is it possible to do a "wheelie" on a motorcycle with shaft-drive?

yes.



## 텍스트 데이터에 대해서 전처리

In [30]:
# Wrap the raw post texts of each split in a one-column DataFrame
# ('article') so the cleaning steps can use pandas string helpers.
train_df = pd.DataFrame(dict(article=train_news.data))
test_df = pd.DataFrame(dict(article=test_news.data))

- Train dataset


In [32]:
# 특수문자 제거
# Replace every character that is not an ASCII letter (digits, punctuation,
# newlines, ...) with a single space.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
train_df['article'] = train_df.article.str.replace('[^A-Za-z]', ' ', regex=True)
train_df.article[1]

' Is it possible to do a  wheelie  on a motorcycle with shaft drive   yes  '

In [37]:
# 길이가 3 이하인 단어 제거
# Drop short words: keep only whitespace-separated tokens longer than three
# characters and re-join them with single spaces.
train_df['article'] = train_df.article.apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)
train_df.article[1]

'possible wheelie motorcycle with shaft drive'

In [34]:
# Scratch demo of the short-word filter on a tiny string.
s = 'a ab abc abcd abcde'
# Explicit-loop form: collect the words longer than three characters in x.
x = []
for w in s.split():
    if len(w) <= 3:
        continue
    x.append(w)
# Equivalent one-liner, with the surviving words re-joined into one string.
s = ' '.join(w for w in s.split() if len(w) > 3)
s

'abcd abcde'

In [38]:
# 소문자로 변환
# Lower-case every article using the vectorised string accessor rather than
# a per-row Python lambda.
train_df['article'] = train_df.article.str.lower()

- Test dataset


In [39]:
# 특수문자 제거
# Replace every character that is not an ASCII letter with a single space.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
test_df['article'] = test_df.article.str.replace('[^A-Za-z]', ' ', regex=True)
# Lower-case the text and drop words of three characters or fewer.
test_df['article'] = test_df.article.apply(lambda x: ' '.join([w.lower() for w in x.split() if len(w) > 3]))

## 텍스트 변환

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
# Build a TF-IDF vectorizer that ignores English stop words and fit it on the
# training articles only; the test articles are not seen during fitting.
tvect = TfidfVectorizer(stop_words='english')
tvect.fit(train_df.article)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [42]:
# Vectorize both splits with the already-fitted vectorizer, then show the
# resulting matrix shapes (both splits share the same number of columns).
X_train = tvect.transform(train_df.article)
X_test = tvect.transform(test_df.article)
X_train.shape, X_test.shape

((11314, 64529), (7532, 64529))

In [43]:
# Class-id labels for each split, taken straight from the loaded datasets.
y_train, y_test = train_news.target, test_news.target

## 훈련/예측/평가

In [44]:
# Support Vector Machine의 Classifier 사용
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [45]:
# Predict a class id for every test article.
pred = svc.predict(X_test)

In [46]:
# Score the predictions against the true test labels (fraction classified correctly).
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.6501593202336696