<a href="https://colab.research.google.com/github/JayYongjaeKim/MoLab/blob/main/News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

In [2]:
def preprocessing(num_words=None):
    """Load the Reuters dataset and decode every sample into a text string.

    The integer sequences returned by ``reuters.load_data`` are mapped back
    to words using the Reuters word index shifted by 3; ids 0, 1 and 2 are
    mapped to the placeholder tokens ``<pad>``, ``<sos>`` and ``<unk>``.

    Args:
        num_words: Forwarded unchanged to ``reuters.load_data`` as its
            ``num_words`` argument.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)`` where the ``x_*``
        entries are lists of space-joined strings and the ``y_*`` entries
        are the labels exactly as returned by ``reuters.load_data``.
    """
    (train_sequences, y_train), (test_sequences, y_test) = reuters.load_data(
        num_words=num_words, test_split=0.2
    )
    vocabulary = reuters.get_word_index(path="reuters_word_index.json")

    # Vocabulary ids are shifted by 3 to leave room for the reserved ids.
    index_to_word = {}
    for word, index in vocabulary.items():
        index_to_word[index + 3] = word
    # Reserved ids are written last so they win over any colliding entry.
    index_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

    def decode(sequence):
        """Turn one sequence of ids into a space-separated string."""
        return ' '.join(index_to_word[index] for index in sequence)

    x_train = [decode(sequence) for sequence in train_sequences]
    x_test = [decode(sequence) for sequence in test_sequences]

    return x_train, x_test, y_train, y_test

In [3]:
def vectorizer(x_train, x_test):
    """Convert raw train/test documents into TF-IDF feature matrices.

    Both the ``CountVectorizer`` and the ``TfidfTransformer`` are fitted on
    the training documents only; the test documents are only transformed
    with the already-fitted objects.

    Args:
        x_train: Training documents, passed to ``CountVectorizer.fit_transform``.
        x_test: Test documents, passed to ``CountVectorizer.transform``.

    Returns:
        A tuple ``(x_train_tfidfv, x_test_tfidfv)`` holding the TF-IDF
        transformed training and test matrices.
    """
    count_vectorizer = CountVectorizer()
    tfidf = TfidfTransformer()

    # Training side: learn the vocabulary and the IDF weights.
    train_counts = count_vectorizer.fit_transform(x_train)
    train_tfidf = tfidf.fit_transform(train_counts)

    # Test side: reuse what was learned from the training documents.
    test_counts = count_vectorizer.transform(x_test)
    test_tfidf = tfidf.transform(test_counts)

    return train_tfidf, test_tfidf

In [18]:
def modeling(x_train, y_train, x_test, y_test):
    """Fit each candidate classifier and measure its accuracy on the test data.

    Args:
        x_train: Training feature matrix passed to each estimator's ``fit``.
        y_train: Training labels passed to each estimator's ``fit``.
        x_test: Test feature matrix passed to each estimator's ``predict``.
        y_test: True test labels compared against the predictions.

    Returns:
        A dict mapping each classifier's display name to the value returned
        by ``accuracy_score(y_test, predictions)`` for that classifier.
    """
    # Fresh estimator instances are created on every call, so repeated calls
    # never reuse a previously fitted model.
    classifiers = {
        'Naive Bayes': MultinomialNB(),
        'Logistic Regression': LogisticRegression(C=10000, penalty='l2', max_iter=3000),
        # Disabled candidates, kept for reference:
        # 'SVM': LinearSVC(C=1000, penalty='l1', max_iter=3000, dual=False),
        # 'Decision Tree': DecisionTreeClassifier(max_depth=10, random_state=0),
        'Random Forest': RandomForestClassifier(n_estimators=5, random_state=0),
        # 'Gradient Boosting': GradientBoostingClassifier(random_state=0),
    }

    model_results = {}
    # The loop variable has its own name so it does not rebind the dict that
    # is being iterated.
    for name, classifier in classifiers.items():
        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)
        model_results[name] = accuracy_score(y_test, y_pred)
    return model_results

In [19]:
# Settings to sweep; each value is passed to preprocessing() as num_words.
num_list = [None, 5000, 10000, 20000]
# Maps each num_words setting to the {model name: accuracy} dict from modeling().
results = {}

for num_words in num_list:
    # NOTE(review): {num_words} is a one-element set literal, not an f-string,
    # so this prints e.g. "{5000}" — presumably a progress marker; confirm intent.
    print({num_words})
    x_train, x_test, y_train, y_test = preprocessing(num_words = num_words)
    x_train_vec, x_test_vec = vectorizer(x_train, x_test)
    model_result = modeling(x_train_vec, y_train, x_test_vec, y_test)
    results[num_words] = model_result

{None}
{5000}
{10000}
{20000}


In [21]:
# Report: a separator line and a heading for every num_words setting,
# followed by one "name: accuracy" line (four decimal places) per model.
for num_words, model_results in results.items():
    print("=" * 10, f"num_words={num_words}:", sep="\n")
    for model_name, accuracy in model_results.items():
        print(f"{model_name}: {accuracy:.4f}")

num_words=None:
Naive Bayes: 0.5997
Logistic Regression: 0.8112
Random Forest: 0.6545
num_words=5000:
Naive Bayes: 0.6732
Logistic Regression: 0.8059
Random Forest: 0.7012
num_words=10000:
Naive Bayes: 0.6567
Logistic Regression: 0.8085
Random Forest: 0.6741
num_words=20000:
Naive Bayes: 0.6193
Logistic Regression: 0.8121
Random Forest: 0.6714
