In [66]:
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset = 'all', random_state = 2021)

In [67]:
import numpy as np
import pandas as pd
news.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [68]:
from sklearn.datasets import load_iris
iris = load_iris()
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [69]:
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [70]:
pd.Series(news.target).value_counts().sort_index()

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

In [71]:
len(news.data)

18846

In [72]:
print(news.data[0])

From: dagibbs@quantum.qnx.com (David Gibbs)
Subject: Re: Countersteering sans Hands
Organization: QNX Software Systems, Ltd.
Lines: 22

In article <1993Apr20.203344.8417@cs.cornell.edu> karr@cs.cornell.edu (David Karr) writes:
>In article <Clarke.6.735328328@bdrc.bd.com> Clarke@bdrc.bd.com (Richard Clarke) writes:
>>So how do I steer when my hands aren't on the bars? (Open Budweiser in left 
>>hand, Camel cigarette in the right, no feet allowed.) 
>
>>If I lean, and the 
>>bike turns, am I countersteering?
>
>No, the bars would turn only *toward* the direction of turn in
>no-hands steering.

Just in case the original poster was looking for a serious answer,
I'll supply one.

Yes, even when steering no hands you do something quite similar
to countersteering.  Basically to turn left, you to a quick wiggle
of the bike to the right first, causing a counteracting lean to
occur to the left.  It is a lot more difficult to do on a motorcycle
than a bicycle though, because of the extra weight. 

In [73]:
# 훈련 / 테스트용 데이터 추출
train_news = fetch_20newsgroups(
    subset = 'train', random_state = 2021, 
    remove = ('headrs', 'footers', 'qoutes')
)
test_news = fetch_20newsgroups(
    subset = 'test', random_state = 2021, 
    remove = ('headrs', 'footers', 'qoutes')
)
len(train_news.data), len(test_news.data)

(11314, 7532)

In [74]:
# 텍스트 데이터에 대해서 전처리
train_df = pd.DataFrame({'article' : train_news.data})
test_df = pd.DataFrame({'article' : test_news.data})

In [75]:
import re
train_df.article = train_df.article.str.replace('[^A-Za-z]',' ')
test_df.article = test_df.article.str.replace('[^A-Za-z]',' ')

  train_df.article = train_df.article.str.replace('[^A-Za-z]',' ')
  test_df.article = test_df.article.str.replace('[^A-Za-z]',' ')


In [76]:
# 길이가 3이하인 단어 제거
train_df['article'] = train_df.article.apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

In [77]:
# 소문자로 변환
train_df['article'] = train_df.article.apply(lambda x: x.lower())

In [78]:
# 소문자로 변환하고 길이가 3이하인 단어 제거
train_df['article'] = train_df.article.apply(lambda x: ' '.join([w.lower() for w in x.split() if len(w) > 3]))
test_df['article'] = test_df.article.apply(lambda x: ' '.join([w.lower() for w in x.split() if len(w) > 3]))

In [83]:
# 텍스트 변환
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tvect = TfidfVectorizer(stop_words = 'english')
tvect.fit(train_df.article) # 학습

TfidfVectorizer(stop_words='english')

In [84]:
X_train = tvect.transform(train_df.article)
X_test = tvect.transform(test_df.article)
print(X_train.shape, X_test.shape)
y_train = train_news.target
y_test = test_news.target

(11314, 71487) (7532, 71487)


In [80]:
# 훈련 / 예측 / 평가
# Support Vector Machine의 Classifier 사용
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC()

In [81]:
pred = svc.predict(X_test)

In [82]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.8041688794476899