In [1]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# 카테고리 설정

# train set, test set 설정
newsgroups_train = fetch_20newsgroups(subset='train',remove=('headers', 'footers', 'quotes'),categories=categories)

newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),categories=categories)

In [2]:
print('train set size:', len(newsgroups_train.data))
print('test set size:', len(newsgroups_test.data))
print('selected categories:', newsgroups_train.target_names)
print('train labels:', set(newsgroups_train.target))

train set size: 2034
test set size: 1353
selected categories: ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
train labels: {0, 1, 2, 3}


array([1, 3, 2, ..., 1, 0, 1], dtype=int64)

In [6]:
print('##Train set text samples:', newsgroups_train.data[0]) #train data 첫번째 요소
print('##Train set label samples:', newsgroups_train.target[0]) # 첫번째 라벨
print('##Test set text samples:', newsgroups_test.data[0]) # test data 첫번째 요소
print('##Test set label samples:', newsgroups_test.target[0]) # 첫번째 라벨

##Train set text samples: Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
##Train set label samples: 1
##Test set text samples: TRry the SKywatch project in  Arizona.
##Test set label samples: 2


In [7]:
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

In [9]:
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english") # 불용어 리스트

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(token_pattern= "[a-zA-Z']{3,}", decode_error ='ignore', lowercase=True, stop_words = stopwords.words('english'), max_df=0.5,min_df=2).fit(X_train)
# tfidf 설정
X_train_tfidf = tfidf.transform(X_train) # x_train을 이용하여 tfidf 구함
X_test_tfidf = tfidf.transform(X_test) # x_test를 이용하여 tfidf 구함

print(X_train_tfidf.shape)  # tfidf의 차원

(2034, 11483)


In [12]:
from sklearn.linear_model import LogisticRegression 
clf = LogisticRegression() #분류기 선언
clf.fit(X_train_tfidf, y_train) # train data를 이용하여 분류기를 학습
print('Train set score: {:.3f}'.format(clf.score(X_train_tfidf, y_train))) # train data에 대한 예측정확도 
print('Test set score: {:.3f}'.format(clf.score(X_test_tfidf, y_test))) # test data에 대한 예측정확도

Train set score: 0.966
Test set score: 0.761


### Bigram

In [14]:
tfidf = TfidfVectorizer(token_pattern= "[a-zA-Z']{3,}", decode_error ='ignore', lowercase=True, stop_words = stopwords.words('english'),ngram_range=(1, 2),max_df=0.5,min_df=2).fit(X_train)
# Bigram 이기에 ngram_range 를 설정(1,2)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.shape)

(2034, 26550)


In [15]:
bigram_features = [f for f in tfidf.get_feature_names() if len(f.split()) > 1]
# tfidf의 feature 명에서 두개 이상의 단어로 이루어진 것들
print(bigram_features[:10])

["'cause can't", "'em better", "'expected errors'", "'karla' next", "'nodis' password", "'official doctrine", "'ok see", "'sci astro'", "'what's moonbase", 'aas american']


In [16]:
clf = LogisticRegression() #분류기 선언
clf.fit(X_train_tfidf, y_train) # train data를 이용하여 분류기를 학습
print('Train set score: {:.3f}'.format(clf.score(X_train_tfidf, y_train))) # train data에 대한 예측정확도 
print('Test set score: {:.3f}'.format(clf.score(X_test_tfidf, y_test))) # test data에 대한 예측정확도

Train set score: 0.969
Test set score: 0.756


### Trigram

In [17]:
tfidf = TfidfVectorizer(token_pattern= "[a-zA-Z']{3,}", decode_error ='ignore', lowercase=True, stop_words = stopwords.words('english'),ngram_range=(1, 3),max_df=0.5,min_df=2).fit(X_train)
# Trigram 이기에 ngram_range 설정(1,3)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.shape)

(2034, 32943)


In [18]:
trigram_features = [f for f in tfidf.get_feature_names() if len(f.split()) > 2]
# 단어 수가 2개 이상으로 이루어진 것들 추출
print(trigram_features[:10])



In [19]:
clf = LogisticRegression() #분류기 선언
clf.fit(X_train_tfidf, y_train) # train data를 이용하여 분류기를 학습
print('Train set score: {:.3f}'.format(clf.score(X_train_tfidf, y_train))) # train data에 대한 예측정확도 
print('Test set score: {:.3f}'.format(clf.score(X_test_tfidf, y_test))) # test data에 대한 예측정확도

Train set score: 0.969
Test set score: 0.758


## Ridge

In [20]:
from sklearn.linear_model import RidgeClassifier
ridge_clf = RidgeClassifier() #릿지 분류기 선언
ridge_clf.fit(X_train_tfidf, y_train) #학습
print('Train set score: {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train))) # train data에 대한 예측정확도
print('Test set score: {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test))) # test data에 대한 예측정확도

Train set score: 0.976
Test set score: 0.775


## Lasso

In [21]:
import numpy as np
lasso_clf = LogisticRegression(penalty='l1', solver='liblinear') # Lasso는 동일한 LogisticRegression을 사용하면서 매개변수로 지정
lasso_clf.fit(X_train_tfidf, y_train) # train data로 학습
print('Train set score: {:.3f}'.format(lasso_clf.score(X_train_tfidf, y_train)))
print('Test set score: {:.3f}'.format(lasso_clf.score(X_test_tfidf, y_test)))
print('Used features count: {}'.format(np.sum(lasso_clf.coef_ != 0)), 'out of', X_train_tfidf.shape[1]) # 유용한 feature 개수

Train set score: 0.761
Test set score: 0.695
Used features count: 246 out of 32943


## SVM

In [None]:
from sklearn.svm import SVC
clf = SVC(gamma='auto', kernel='linear')
clf.fit(X_train_tfidf, y_train) 
print('Train set score: {:.3f}'.format(clf.score(X_train_tfidf, y_train))) # train data에 대한 예측정확도 
print('Test set score: {:.3f}'.format(clf.score(X_test_tfidf, y_test))) # test data에 대한 예측정확도