# Header Import

In [None]:
from collections import Counter

from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Dataset Load

In [2]:
# Words from the NLTK `names` corpus, held in a set because clean_text tests
# every token for membership in it.
all_names = set(names.words())
# Single WordNet lemmatizer instance, reused by clean_text for every token.
lemmatizer = WordNetLemmatizer()

# Clean Data function

In [3]:
def letters_only(astr):
    """Return True when every character of ``astr`` is alphabetic.

    An empty string has no offending character, so it yields True as well.
    """
    return all(ch.isalpha() for ch in astr)

def clean_text(docs):
    """Normalise each document into a space-joined string of lemmas.

    Every document is split on whitespace. A token is kept only when it
    consists solely of letters and is not present in ``all_names``; the
    name check is done on the token as written, before lower-casing.
    Kept tokens are lower-cased and lemmatized with ``lemmatizer``.

    Args:
        docs: iterable of raw document strings.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    cleaned_docs = []
    for doc in docs:
        kept_tokens = []
        for token in doc.split():
            # Skip anything containing non-letters, and skip known names.
            if not letters_only(token) or token in all_names:
                continue
            kept_tokens.append(lemmatizer.lemmatize(token.lower()))
        cleaned_docs.append(' '.join(kept_tokens))
    return cleaned_docs


# Binary classification

In [4]:
# Restrict the corpus to two newsgroups so the task is binary classification.
categories = ['comp.graphics', 'sci.space']

# Fetch the 'train' and 'test' subsets with the same categories and random_state.
data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42)

# Clean the raw posts with clean_text and keep the class labels next to them.
cleaned_train = clean_text(data_train.data)
label_train = data_train.target
cleaned_test = clean_text(data_test.data)
label_test = data_test.target

# Cell output: number of training documents per class label.
Counter(label_train)

Counter({0: 584, 1: 593})

In [12]:
# Display the training label array (one class label per training document).
label_train

array([0, 1, 0, ..., 0, 0, 1])

# TF-IDF

In [15]:
# Build TF-IDF features. The vectorizer is fitted on the training documents
# only and then reused, unchanged, to transform the test documents.
tfidf_vectorizer = TfidfVectorizer(
    sublinear_tf=True,     # scale term frequency as 1 + log(tf)
    max_df=0.5,            # ignore terms found in more than half of the documents
    stop_words='english',  # drop scikit-learn's built-in English stop words
    max_features=8000,     # cap the vocabulary at 8000 terms
)
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)
# With the outputs recorded in this notebook: 1177 training documents
# (584 + 593, row indices 0-1176) and at most 8000 feature columns.
print(term_docs_train)

  (0, 3)	0.1943234373521608
  (0, 989)	0.17745612097336405
  (0, 3337)	0.07640939630093298
  (0, 2173)	0.16112147463788695
  (0, 3573)	0.11838348731190705
  (0, 490)	0.11577270054822175
  (0, 7710)	0.19134329826197177
  (0, 2636)	0.23882387102172048
  (0, 5186)	0.1392001994260587
  (0, 3521)	0.13565393185264604
  (0, 1363)	0.14763025544055602
  (0, 2967)	0.12012243912234868
  (0, 2640)	0.11577270054822175
  (0, 4464)	0.10921540092393
  (0, 6972)	0.2527110483103282
  (0, 2999)	0.17673656075986588
  (0, 2672)	0.09783981925668135
  (0, 7357)	0.09234808454513324
  (0, 7369)	0.17058315097228646
  (0, 3388)	0.08019545477281471
  (0, 2671)	0.18860183605007147
  (0, 3542)	0.07640939630093298
  (0, 2755)	0.23882387102172048
  (0, 4721)	0.23882387102172048
  (0, 4802)	0.09765418198146887
  :	:
  (1176, 315)	0.10209046490055426
  (1176, 5474)	0.08433490840610683
  (1176, 1881)	0.11142002160842965
  (1176, 1888)	0.10321852722445886
  (1176, 7643)	0.0726562305544147
  (1176, 85)	0.087386293291498
 

In [10]:
# Print the TF-IDF training matrix; each output line is a
# (document index, feature index) pair followed by its weight.
print(term_docs_train)

  (0, 3)	0.1943234373521608
  (0, 198)	0.14246457245812405
  (0, 381)	0.06608659390491707
  (0, 490)	0.11577270054822175
  (0, 989)	0.17745612097336405
  (0, 1363)	0.14763025544055602
  (0, 2173)	0.16112147463788695
  (0, 2636)	0.23882387102172048
  (0, 2640)	0.11577270054822175
  (0, 2671)	0.18860183605007147
  (0, 2672)	0.09783981925668135
  (0, 2755)	0.23882387102172048
  (0, 2967)	0.12012243912234868
  (0, 2999)	0.17673656075986588
  (0, 3337)	0.07640939630093298
  (0, 3388)	0.08019545477281471
  (0, 3521)	0.13565393185264604
  (0, 3542)	0.07640939630093298
  (0, 3573)	0.11838348731190705
  (0, 4198)	0.30728960209367046
  (0, 4464)	0.10921540092393
  (0, 4721)	0.23882387102172048
  (0, 4802)	0.09765418198146887
  (0, 4896)	0.1943234373521608
  (0, 5186)	0.1392001994260587
  :	:
  (1176, 4464)	0.07552551488413548
  (1176, 4560)	0.14873633513020143
  (1176, 4563)	0.10154596203083036
  (1176, 4702)	0.16515340943628473
  (1176, 4838)	0.1997287823841358
  (1176, 5078)	0.0962608445758486

# SVM Set up
- kernel : SVC 에서 사용할 커널 알고리즘 지금은 linear 를 이용해서 표현한다.
- C : 오분류에 대한 패널티의 세기를 정하는 규제 파라미터 (값이 클수록 오분류를 더 강하게 벌한다)
- random_state : 데이터를 섞을 때 사용

SVC를 확률 모델 형태로 쓰는 법도 있음

In [8]:
# Linear-kernel support vector classifier (SVC is imported in the notebook's
# import cell together with the other dependencies).
# C is the penalty (regularization) parameter.
# NOTE(review): per the scikit-learn docs, random_state is only used by SVC when
# probability estimates are enabled, so here it is set for explicitness only.
svm = SVC(kernel='linear', C=1.0, random_state=42)

# Fitting

In [9]:
# Train the classifier on the TF-IDF training matrix and its labels.
svm.fit(term_docs_train, label_train)

# Evaluate on the held-out test matrix; the score is reported as a percentage
# with one decimal place.
accuracy = svm.score(term_docs_test, label_test)
print(
    'The accuracy on testing set is: {0:.1f}%'.format(100 * accuracy)
)

The accuracy on testing set is: 96.4%
