# 뉴스 기사 텍스트 분석

## 1. 데이터 내려받기

In [3]:
from sklearn.datasets import fetch_20newsgroups # 뉴스 데이터셋

# 데이터를 내려받은 후에 메모리에 데이터를 로딩
news_data = fetch_20newsgroups(subset = "all", random_state = 156)
print(news_data.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [5]:
import numpy as np
import pandas as pd

In [10]:
print("target 클래스 값과 분포도 \n", pd.Series(news_data.target).value_counts().sort_index())
print("target 클래스의 이름들 \n", news_data.target_names)

target 클래스 값과 분포도 
 0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64
target 클래스의 이름들 
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [14]:
print(news_data.data[1])
# 뉴스 헤더, 이메일, 내용 등 다양한 데이터들 포함
# 제목, 소속, 이메일 등의 헤더와 푸터 정보들은 target값과 연관성이 높기 때문에 헤더를 포함하면 정확도가 너무 높게 나옴

From: jlevine@rd.hydro.on.ca (Jody Levine)
Subject: Re: insect impacts
Organization: Ontario Hydro - Research Division
Lines: 64

I feel childish.

In article <1ppvds$92a@seven-up.East.Sun.COM> egreen@East.Sun.COM writes:
>In article 7290@rd.hydro.on.ca, jlevine@rd.hydro.on.ca (Jody Levine) writes:
>>>>
>>>>how _do_ the helmetless do it?
>>>
>>>Um, the same way people do it on 
>>>horseback
>>
>>not as fast, and they would probably enjoy eating bugs, anyway
>
>Every bit as fast as a dirtbike, in the right terrain.  And we eat
>flies, thank you.

Who mentioned dirtbikes? We're talking highway speeds here. If you go 70mph
on your dirtbike then feel free to contribute.

>>>jeeps
>>
>>you're *supposed* to keep the windscreen up
>
>then why does it go down?

Because it wouldn't be a Jeep if it didn't. A friend of mine just bought one
becuase it's a big 4 wheeler). Anyway, it's written in about ten places that
the windshield should remain up at all times, and it looks like they've made
it a 

In [4]:
# subset = "train"으로 학습용 데이터만 추출, remove = ("headers","footers", "quotes")
train_news = fetch_20newsgroups(subset="train", remove = ("headers","footers","quotes"),
                  random_state = 156)

X_train = train_news.data
y_train = train_news.target

In [5]:
# subset = "test"으로 학습용 데이터만 추출, remove = ("headers","footers", "quotes")
test_news = fetch_20newsgroups(subset="test", remove = ("headers","footers","quotes"),
                  random_state = 156)

X_test = test_news.data
y_test = test_news.target

In [20]:
print(f"학습 데이터 크기 {len(X_train)}, 테스트 데이터 크기 {len(X_test)}")

학습 데이터 크기 11314, 테스트 데이터 크기 7532


## 2. 피터 벡터화 변환과 머신러닝 모델 학습/예측/평가

#### count 기반

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Count Vectorization으로 피처 벡터화 변환 수행
cnt_vect = CountVectorizer() # 문장에서 단어가 나타난 횟수로
cnt_vect.fit(X_train) # test 시에는 fit_transform 을 사용하면 안됨

# count vector 형식으로 변환
X_train_cnt_vect = cnt_vect.transform(X_train) 
X_test_cnt_vect = cnt_vect.transform(X_test)
# 문서개수 : 11314, 단어개수 : 101631, COO 형식의 행렬
print("학습 데이터 텍스트의 CountVectorizer Shape:", X_train_cnt_vect.shape)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic Regression을 이용하여 학습/예측/평가 수행
lr_clf = LogisticRegression()
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)
print(accuracy_score(y_test, pred))

학습 데이터 텍스트의 CountVectorizer Shape: (11314, 101631)
0.6074083908656399


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


#### tf-idf 기반

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# tf-idf으로 피처 벡터화 변환 수행
tfidf_vect = TfidfVectorizer() # 문장에서 단어가 나타난 횟수로
tfidf_vect.fit(X_train) # test 시에는 fit_transform 을 사용하면 안됨

# tf-idf vector 형식으로 변환
X_train_tfidf_vect = tfidf_vect.transform(X_train) 
X_test_tfidf_vect = tfidf_vect.transform(X_test)
# 문서개수 : 11314, 단어개수 : 101631, COO 형식의 행렬
print("학습 데이터 텍스트의 tfidf Shape:", X_train_tfidf_vect.shape)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic Regression을 이용하여 학습/예측/평가 수행
lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print(accuracy_score(y_test, pred))

학습 데이터 텍스트의 tfidf Shape: (11314, 101631)
0.6736590546999469


#### 다양한 파라미터를 적용

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# tf-idf으로 피처 벡터화 변환 수행
# stopwords제거 및 총 등장횟수가 300번을 초과하는 단어 삭제
tfidf_vect = TfidfVectorizer(stop_words="english", max_df=300) 
tfidf_vect.fit(X_train) 

# tf-idf vector 형식으로 변환
X_train_tfidf_vect = tfidf_vect.transform(X_train) 
X_test_tfidf_vect = tfidf_vect.transform(X_test)

# Logistic Regression을 이용하여 학습/예측/평가 수행
lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print(accuracy_score(y_test, pred))

0.6903876792352629


#### GridSearchCV 적용 

In [11]:
from sklearn.model_selection import GridSearchCV

# 최적 C값
params = {"C":[0.01, 1, 5, 10]}
grid_cv_lr = GridSearchCV(lr_clf, param_grid=params, cv=3, scoring="accuracy", verbose=1)
grid_cv_lr.fit(X_train_tfidf_vect, y_train)
print(grid_cv_lr.best_params_)

# 최적 C값으로 예측, 정확도 평가
pred = grid_cv_lr.predict(X_test_tfidf_vect)
print(accuracy_score(y_test, pred))

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative sol

{'C': 5}
0.6872012745618693


#### 파이프라인 사용

In [12]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("tfidf_vect", TfidfVectorizer(stop_words="english", max_df=300)),
    ("lr_clf", LogisticRegression(C=10))
])

# fit -> 전처리 함수는 fit_transform으로, 예측함수는 fit으로 수행
pipeline.fit(X_train, y_train)
# predict -> 전처리 함수는 transform으로, 예측함수는 predict로 수해
pred = pipeline.predict(X_test)
print(accuracy_score(y_test, pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.6865374402549124


#### 파이프라인 & Grid search

In [13]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("tfidf_vect", TfidfVectorizer(stop_words="english", max_df=300)),
    ("lr_clf", LogisticRegression(C=10))
])

# 각 모델명__하이퍼파라미터 로 파라미터를 설정해 주어야 함.
params = {"tfidf_vect__max_df":[100,300],
         "lr_clf__C":[5,10]}

grid_cv_lr = GridSearchCV(pipeline, param_grid=params, cv=3, scoring="accuracy", verbose=1)
grid_cv_lr.fit(X_train, y_train)
print(grid_cv_lr.best_params_, grid_cv_lr.best_score_)

# 최적 C값으로 예측, 정확도 평가
pred = grid_cv_lr.predict(X_test)
print(accuracy_score(y_test, pred))

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative sol

{'lr_clf__C': 10, 'tfidf_vect__max_df': 300} 0.7482768582657983
0.6865374402549124
