In [29]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# The corpus ships with labels already attached.
# The label distribution is fairly even across classes.

news_data = fetch_20newsgroups(random_state=156, subset='all')

# Keys exposed by the returned dataset object.
print(news_data.keys())

# Number of documents per target value, ordered by target value.
target_counts = pd.Series(news_data.target).value_counts()
print(target_counts.sort_index())

# Names of the target classes.
print(news_data.target_names)

In [18]:
# Load the predefined train and test subsets, removing headers, footers and
# quotes from every document.

train_news = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    random_state=156,
)
test_news = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    random_state=156,
)

# Raw documents (X) and their integer targets (y) for each subset.
X_train, y_train = train_news.data, train_news.target
X_test, y_test = test_news.data, test_news.target

In [21]:
# Vectorize each document based on the words that appear in it.
# A count vectorizer uses raw term counts; TF-IDF additionally down-weights
# terms that occur in many documents (it re-weights them, it does not remove them).
# The train and test data must be transformed with the same fitted table,
# so the vectorizer is fitted on the training documents only.

tfidf_vect = TfidfVectorizer()
# fit_transform learns the vocabulary and builds the training matrix in a
# single pass instead of tokenizing the training corpus twice.
X_train_tfidf_vect = tfidf_vect.fit_transform(X_train)
# Reuse the already-fitted vectorizer for the test documents.
X_test_tfidf_vect = tfidf_vect.transform(X_test)

In [25]:
# Baseline: logistic regression with default settings on the TF-IDF features,
# scored by accuracy on the test documents.
lr_clf = LogisticRegression()
pred = lr_clf.fit(X_train_tfidf_vect, y_train).predict(X_test_tfidf_vect)
score = accuracy_score(y_true=y_test, y_pred=pred)
print(score)

0.6736590546999469


In [27]:
# Tuning
# TfidfVectorizer parameters include stop_words, ngram_range and max_df.
# LogisticRegression has C, the inverse of regularization strength
# (smaller values mean stronger regularization).
# max_df=300 is an integer, i.e. an absolute document count: terms that appear
# in more than 300 documents are ignored.
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300)
# Learn the vocabulary and build the training matrix in a single pass.
X_train_tfidf_vect = tfidf_vect.fit_transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
score = accuracy_score(y_test, pred)
# Report the accuracy; without this the cell computes the score and discards it.
print(score)



In [None]:
# 3-fold cross-validated grid search over the regularization parameter C of the
# logistic regression, using the TF-IDF features built in the previous cell.
# NOTE(review): the features were vectorized on the full training set before
# the CV split, so the vectorizer has already seen each validation fold.
params = {'C':[0.01, 0.1, 1, 5, 10]}
grid_cv_lr = GridSearchCV(lr_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv_lr.fit(X_train_tfidf_vect, y_train)
# best_params_ is a dict keyed by parameter name, so best_C holds the whole
# dict (e.g. {'C': ...}), not the bare value.
best_C = grid_cv_lr.best_params_
# With the default refit=True, predict() delegates to the best estimator
# refitted on the full training set.
pred = grid_cv_lr.predict(X_test_tfidf_vect)
score = accuracy_score(y_test, pred)
# Report the selected parameters and the resulting test accuracy.
print(best_C, score)

In [None]:
# scikit-learn's Pipeline chains vectorization and classification so both
# steps are fitted and applied with a single call on the raw documents.

pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression(random_state=156))
])
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)
# Fixed: the original line had an unbalanced closing parenthesis (SyntaxError).
score = accuracy_score(y_test, pred)
print(score)

In [None]:
# Grid search over the whole pipeline. Step parameters are addressed as
# '<step name>__<parameter>', so the vectorizer and the classifier are tuned
# together and the vectorizer is refitted inside every CV fold.
params = {'tfidf_vect__ngram_range':[(1,1),(1,2),(1,3)],
          'tfidf_vect__max_df':[0.7, 0.8, 0.9],
          'lr_clf__C':[1,5,10]}
grid_cv_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv_pipe.fit(X_train, y_train)
best_param = grid_cv_pipe.best_params_
# Predict with the grid-searched model. The original scored a stale `pred`
# left over from an earlier cell, so it never evaluated this search.
pred = grid_cv_pipe.predict(X_test)
score = accuracy_score(y_test, pred)
# Report the selected parameters and the resulting test accuracy.
print(best_param, score)