In [14]:
import nltk
import numpy as np
import pandas as pd

In [15]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [16]:
train_news = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), random_state=156)
test_news = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), random_state=156)

In [17]:
x_train = train_news.data
y_train = train_news.target

In [18]:
x_test = test_news.data
y_test = test_news.target

In [6]:
# TF-IDF 벡터화

In [7]:
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), max_df=700)
tfidf_vect.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=700, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [8]:
x_train_tfidf_vect = tfidf_vect.transform(x_train)
x_test_tfidf_vect = tfidf_vect.transform(x_test)

In [9]:
# CNT_VECTOR

In [10]:
cnt_vect = CountVectorizer()
cnt_vect.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [11]:
x_train_cnt_vect = cnt_vect.transform(x_train)
x_test_cnt_vect = cnt_vect.transform(x_test)

In [24]:
line_svd = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('linesvd', LinearSVC())
])

params = { 'tfidf_vect__ngram_range': [(1,1), (1,2), (1,3), (1, 4)],
           'tfidf_vect__max_df': [100, 300, 700],
           'linesvd__C': [1, 5, 10],
           'linesvd__verbose' : [0, 1]
}

grid_line_pipe = GridSearchCV(line_svd, param_grid=params, cv=3 , scoring='accuracy', verbose=1)

In [25]:
grid_line_pipe.fit(x_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed: 78.8min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf_vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                    

In [21]:
print(grid_line_pipe.best_params_)
print('LinearSVC 의 예측 정확도 : {0:.4f}'.format(grid_line_pipe.best_score_))

{'linesvd__C': 5, 'tfidf_vect__max_df': 700, 'tfidf_vect__ngram_range': (1, 2)}
LinearSVC 의 예측 정확도 : 0.7709


In [37]:
linear_svc = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('linear', SVC(kernel='linear'))
])

params = { 'tfidf_vect__ngram_range': [(1,1), (1,2), (1,3)],
           'tfidf_vect__max_df': [100, 300, 700],
           'linear__C': [1, 5, 10]
}

grid_cv_pipe = GridSearchCV(linear_svc, param_grid=params,  cv=3 , scoring='accuracy', verbose=1)

In [38]:
grid_cv_pipe.fit(x_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed: 66.8min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf_vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                    

In [109]:
print(grid_cv_pipe.best_params_)

print('SVC 의 예측 정확도 : {0:.4f}'.format(grid_cv_pipe.best_score_))

{'linesvd__C': 5, 'tfidf_vect__max_df': 700, 'tfidf_vect__ngram_range': (1, 2)}
SVC 의 예측 정확도 : 0.7626


In [48]:
svc = LinearSVC(C=10, verbose=1)

In [49]:
svc.fit(x_train_tfidf_vect, y_train)

[LibLinear]

LinearSVC(C=10, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=1)

In [50]:
svc_pred = svc.predict(x_test_tfidf_vect)

In [51]:
print('{0}의 정확도: {1:.4f}'.format(svc.__class__.__name__, accuracy_score(y_test, svc_pred)))

LinearSVC의 정확도: 0.7053


In [52]:
dt_clf = DecisionTreeClassifier(max_depth=40, min_samples_split=4, min_samples_leaf = 8)

In [53]:
dt_clf.fit(x_train_tfidf_vect, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=40, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [54]:
dt_clf_pred = dt_clf.predict(x_test_tfidf_vect)

In [55]:
print('{0}의 정확도: {1:.4f}'.format(dt_clf.__class__.__name__, accuracy_score(y_test, dt_clf_pred)))

DecisionTreeClassifier의 정확도: 0.3339


In [56]:
rf_clf = RandomForestClassifier(max_depth=40, min_samples_split=4, min_samples_leaf = 8, n_jobs=-1)

In [57]:
rf_clf.fit(x_train_tfidf_vect, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=40, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [58]:
rf_clf_pred = rf_clf.predict(x_test_tfidf_vect)

In [59]:
print('{0}의 정확도: {1:.4f}'.format(rf_clf.__class__.__name__, accuracy_score(y_test, rf_clf_pred)))

RandomForestClassifier의 정확도: 0.5807


In [60]:
sgd_clf = SGDClassifier(penalty='l2', alpha=0.05, loss='log')

In [61]:
sgd_clf.fit(x_train_tfidf_vect, y_train)

SGDClassifier(alpha=0.05, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [62]:
sgd_pred = sgd_clf.predict(x_test_tfidf_vect)

In [63]:
print('{0}의 정확도: {1:.4f}'.format(sgd_clf.__class__.__name__, accuracy_score(y_test, sgd_pred)))

SGDClassifier의 정확도: 0.0940


In [121]:
my_dict = {'Model': ['LinearSVC', 'SVC' , 'RandomForest', 'SGDClassifier'],
           'Value':[grid_line_pipe.best_score_, grid_cv_pipe.best_score_, accuracy_score(y_test, rf_clf_pred), accuracy_score(y_test, sgd_pred)]}

In [122]:
df = pd.DataFrame(my_dict)

In [123]:
df

Unnamed: 0,Model,Value
0,LinearSVC,0.770903
1,SVC,0.762596
2,RandomForest,0.580722
3,SGDClassifier,0.093999
