# Imports

In [None]:
from collections import Counter

from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Load Name List and Lemmatizer

In [2]:
# Shared WordNet lemmatizer instance used by clean_text.
lemmatizer = WordNetLemmatizer()
# Entries of the NLTK names corpus, kept in a set so that the per-word
# membership test in clean_text is a constant-time lookup.
all_names = set(names.words())

# Data Cleaning Functions

In [3]:
def letters_only(astr):
    """Return True when every character of ``astr`` is alphabetic.

    An empty string has no non-alphabetic character, so it yields True
    (unlike ``"".isalpha()``, which is False).
    """
    return all(ch.isalpha() for ch in astr)

def clean_text(docs):
    """Normalize each document into a space-joined string of lemmas.

    Every document is split on whitespace. A token is kept only if it is
    made purely of letters and is not in ``all_names``; kept tokens are
    lower-cased and lemmatized.

    Args:
        docs: iterable of document strings.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    cleaned_docs = []
    for doc in docs:
        kept = []
        for word in doc.split():
            # Remove names and any token containing non-letter characters.
            # The name lookup uses the original casing, before lower-casing.
            if not letters_only(word) or word in all_names:
                continue
            # Apply the lemmatizer to the lower-cased token.
            kept.append(lemmatizer.lemmatize(word.lower()))
        cleaned_docs.append(' '.join(kept))
    return cleaned_docs


# Binary classification

In [4]:
# Restrict the 20 Newsgroups corpus to two topics to get a binary problem.
categories = ['comp.graphics', 'sci.space']

# random_state is fixed so the order of the returned documents is reproducible.
data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42)

# Clean the raw posts; labels are taken unchanged from the dataset targets.
cleaned_train = clean_text(data_train.data)
label_train = data_train.target
cleaned_test = clean_text(data_test.data)
label_test = data_test.target

# Show the number of training documents per class to check class balance.
# (Counter is imported in the notebook's import cell.)
Counter(label_train)

Counter({0: 584, 1: 593})

# TF-IDF

In [6]:
tfidf_vectorizer = TfidfVectorizer(
    sublinear_tf=True,     # use 1 + log(tf) instead of the raw term count
    max_df=0.5,            # drop terms that occur in more than half of the documents
    stop_words='english',  # remove the built-in English stop-word list
    max_features=8000,     # cap the vocabulary at the 8000 most frequent terms
)
# The vocabulary and IDF weights are fitted on the training documents only;
# the test documents are then mapped onto that same fitted vocabulary.
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)
print(term_docs_test)

  (0, 7926)	0.10331909800580447
  (0, 7807)	0.09868520442742641
  (0, 7572)	0.0694259285030913
  (0, 7421)	0.13699310481566476
  (0, 7367)	0.1839946034462866
  (0, 7358)	0.10452562833087464
  (0, 7357)	0.08880138133525056
  (0, 7030)	0.18999929000748855
  (0, 6941)	0.1706407744832919
  (0, 6154)	0.13509167609333483
  (0, 5763)	0.1338541026818493
  (0, 5731)	0.14781225308827636
  (0, 5572)	0.149694742276498
  (0, 5108)	0.1706407744832919
  (0, 4760)	0.19346929587830744
  (0, 4235)	0.1973484324092813
  (0, 4066)	0.19346929587830744
  (0, 3592)	0.19346929587830744
  (0, 3417)	0.13634091331350331
  (0, 3388)	0.07711548318204789
  (0, 3337)	0.07347483136154573
  (0, 3238)	0.149694742276498
  (0, 3096)	0.1263854153407918
  (0, 3087)	0.24300547519931237
  (0, 2838)	0.1451760789597685
  :	:
  (782, 1363)	0.07071081558566485
  (782, 1307)	0.0891193447390158
  (782, 1304)	0.08097539638602044
  (782, 1300)	0.06996501438375423
  (782, 1264)	0.09033507127828737
  (782, 1189)	0.07960753768029177
  (

# SVM Setup

In [8]:
# Support vector classifier with a linear kernel and penalty parameter C=1.0.
# (SVC is imported in the notebook's import cell.)
svm = SVC(kernel='linear', C=1.0, random_state=42)

# Fitting and Evaluation

In [9]:
# Train the classifier on the TF-IDF features of the training documents.
svm.fit(X=term_docs_train, y=label_train)
# score() on the held-out test features gives the accuracy as a fraction,
# which is scaled to a percentage for display.
accuracy = svm.score(X=term_docs_test, y=label_test)
print('The accuracy on testing set is: {0:.1f}%'.format(accuracy * 100))

The accuracy on testing set is: 96.4%
