In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all', random_state=2021)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


# 1. 데이터 탐색

In [3]:
news.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
from sklearn.datasets import load_iris
iris = load_iris()
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [5]:
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
print(news.data[0])

From: dagibbs@quantum.qnx.com (David Gibbs)
Subject: Re: Countersteering sans Hands
Organization: QNX Software Systems, Ltd.
Lines: 22

In article <1993Apr20.203344.8417@cs.cornell.edu> karr@cs.cornell.edu (David Karr) writes:
>In article <Clarke.6.735328328@bdrc.bd.com> Clarke@bdrc.bd.com (Richard Clarke) writes:
>>So how do I steer when my hands aren't on the bars? (Open Budweiser in left 
>>hand, Camel cigarette in the right, no feet allowed.) 
>
>>If I lean, and the 
>>bike turns, am I countersteering?
>
>No, the bars would turn only *toward* the direction of turn in
>no-hands steering.

Just in case the original poster was looking for a serious answer,
I'll supply one.

Yes, even when steering no hands you do something quite similar
to countersteering.  Basically to turn left, you to a quick wiggle
of the bike to the right first, causing a counteracting lean to
occur to the left.  It is a lot more difficult to do on a motorcycle
than a bicycle though, because of the extra weight. 

# 훈련/데이터용 데이터 추출

In [9]:
train_news = fetch_20newsgroups(
    subset='train', random_state=2021, 
    remove=('headers', 'footers', 'quotes')
)
len(train_news.data)

11314

In [11]:
test_news = fetch_20newsgroups(
    subset='test', random_state=2021, 
    remove=('headers', 'footers', 'quotes')
)
len(test_news.data)

7532

In [26]:
print(train_news.data[1])

]Is it possible to do a "wheelie" on a motorcycle with shaft-drive?

yes.



# 텍스트 데이터에 대해서 전처리

In [27]:
train_df = pd.DataFrame({'article': train_news.data})
test_df = pd.DataFrame({'article': test_news.data})

# Train dataset

In [28]:
# 특수문자 제거
train_df['article'] = train_df.article.str.replace('[^A-Za-z]', ' ')
train_df.article[1]

' Is it possible to do a  wheelie  on a motorcycle with shaft drive   yes  '

In [31]:
# 길이가 3 이하인 단어 제거
train_df['article'] = train_df['article'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
train_df.article[1]

'possible wheelie motorcycle with shaft drive'

In [32]:
# 소문자로 변환
train_df['article'] = train_df['article'].apply(lambda x: x.lower())
train_df.article[1]

'possible wheelie motorcycle with shaft drive'

In [33]:
# 소문자로 변환하고 길이가 3 이하인 단어 제거
train_df['article'] = train_df['article'].apply(lambda x: ' '.join([w.lower() for w in x.split() if len(w) > 3]))

- Test dataset

In [35]:
# 특수문자 제거
test_df['article'] = test_df.article.str.replace('[^A-Za-z]', ' ')
# 소문자로 변환하고 길이가 3 이하인 단어 제거
test_df['article'] = test_df['article'].apply(lambda x: ' '.join([w.lower() for w in x.split() if len(w) > 3]))

# 텍스트 변환

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
tvect = TfidfVectorizer(stop_words='english')
tvect.fit(train_df.article)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [40]:
X_train = tvect.transform(train_df.article)
X_test = tvect.transform(test_df.article)
X_train.shape, X_test.shape

((11314, 64133), (7532, 64133))

In [41]:
y_train = train_news.target
y_test = test_news.target

# 훈련/예측/평가

In [42]:
from sklearn.svm import SVC 
svc = SVC()
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [43]:
from sklearn.svm import SVC 
svc = SVC()
svc.fit(X_test, y_test)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [44]:
pred = svc.predict(X_test)

In [45]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.9657461497610197