In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.model_selection import train_test_split

import pandas as pd
# import tensorflow
# from keras.preprocessing import text, sequence
# from keras import layers, models, optimizers

In [2]:
df_train = pd.read_csv('training.csv')
train_set = pd.DataFrame(columns=['words', 'topic'])
train_set['words'] = df_train['article_words'].str.replace(',',' ')
train_set['topic'] = df_train['topic']

X_train, X_test, y_train, y_test = train_test_split(train_set['words'], train_set['topic'], test_size = 0.0526, random_state = 1)

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [32]:
#词语级tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word',token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train_set['words'])
xtrain_tfidf = tfidf_vect.transform(X_train)
xtest_tfidf = tfidf_vect.transform(X_test)

# ngram 级tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word',token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(train_set['words'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(X_train)
xtest_tfidf_ngram = tfidf_vect_ngram.transform(X_test)

#词性级tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(train_set['words'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_train)
xtest_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_test)

In [35]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    return metrics.accuracy_score(predictions, y_test)

In [36]:
#特征为词语级别TF-IDF向量的朴素贝叶斯
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, y_train, xtest_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

#特征为多个词语级别TF-IDF向量的朴素贝叶斯
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, y_train, xtest_tfidf_ngram)
print( "NB, N-Gram Vectors: ", accuracy)

#特征为词性级别TF-IDF向量的朴素贝叶斯
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, y_train, xtest_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy)

NB, WordLevel TF-IDF:  0.742
NB, N-Gram Vectors:  0.74
NB, CharLevel Vectors:  0.728


In [53]:
#特征为词语级别TF-IDF向量的线性分类器
# accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, y_train, xtest_tfidf)
lg = linear_model.LogisticRegressionCV(max_iter = 2000).fit(xtrain_tfidf, y_train)
predictions = lg.predict(xtest_tfidf)
accuracy = metrics.accuracy_score(predictions, y_test)
print("LR, WordLevel TF-IDF: ", accuracy)

#特征为多个词语级别TF-IDF向量的线性分类器
# accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, y_train, xtest_tfidf_ngram)
# print("LR, N-Gram Vectors: ", accuracy)
lg = linear_model.LogisticRegressionCV(max_iter = 2000).fit(xtrain_tfidf_ngram, y_train)
predictions = lg.predict(xtest_tfidf_ngram)
accuracy = metrics.accuracy_score(predictions, y_test)
print("LR, WordLevel TF-IDF: ", accuracy)

#特征为词性级别TF-IDF向量的线性分类器
# accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, y_train, xtest_tfidf_ngram_chars)
# print("LR, CharLevel Vectors: ", accuracy)
# lg = linear_model.LogisticRegressionCV(max_iter = 2000).fit(xtrain_tfidf_ngram_chars, y_train)
# predictions = lg.predict(xtest_tfidf_ngram_chars)
# accuracy = metrics.accuracy_score(predictions, y_test)
# print("LR, WordLevel TF-IDF: ", accuracy)

LR, WordLevel TF-IDF:  0.776
LR, WordLevel TF-IDF:  0.752


In [41]:
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, y_train, xtest_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy)

SVM, N-Gram Vectors:  0.726
