# 库

In [1]:
import json
import pkuseg
import pandas as pd
import jieba
import numpy
from sklearn.metrics import classification_report

# 训练数据获取

In [2]:
def get_data(filename):
    f = open(filename, encoding = 'utf-8')
    lines = f.readlines()
    seg = pkuseg.pkuseg()
    data = []
    for line in lines:
        dic = eval(line)
        data.append(dic)
    f.close()
    data = pd.DataFrame(data)
    seg = pkuseg.pkuseg()
    stopwords = [line.strip() for line in open('stopwords.txt', encoding="utf-8").readlines()]        #加载停用词
    sentence = [i for i in seg.cut(dic['sentence']) if i not in stopwords]
    data = pd.DataFrame(data)
    for i in range(len(data['sentence'])):
        sentence = [i for i in seg.cut(data['sentence'][i]) if i not in stopwords]
        keywords = [i for i in seg.cut(data['keywords'][i]) if i not in stopwords]
        data['sentence'][i] = sentence
        data['keywords'][i] = keywords
    return data

In [3]:
train_file = "./tnews_public/train.json"
test_file = './tnews_public/test.json'
train_data = get_data(train_file)
test_data = get_data(test_file)

In [4]:
x_train = train_data['sentence']
y_train = train_data['label']
x_test = test_data['sentence']
y_test = test_data['label']

In [5]:
words = []
for line_index in range(len(x_train)):
    try:
        #x_train[line_index][word_index] = str(x_train[line_index][word_index])
        words.append(' '.join(x_train[line_index]))
    except:
        print(line_index,word_index)
words[0]

'上课 时 学生 手机 响 不停 老师 一怒之下 手机 摔 家长 发票 老师 赔 看待 事'

In [6]:
test_words = []
for line_index in range(len(x_test)):
    try:
        #x_train[line_index][word_index] = str(x_train[line_index][word_index])
        test_words.append(' '.join(x_test[line_index]))
    except:
         print (line_index,word_index)
test_words[0]

'哈登 保罗 一见钟情'

# 词频向量

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
 
vec = CountVectorizer(analyzer='word', max_features=4000,  lowercase = False)
vec.fit(words)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=False, max_df=1.0, max_features=4000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

# 分类

In [8]:
# 贝叶斯分类器
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vec.transform(words), y_train)
classifier.score(vec.transform(test_words), y_test)

0.5084

In [9]:
y_pred=classifier.predict(vec.transform(test_words))
target_names=["news_story","news_culture", "news_entertainment", "news_sports","news_finance","news_house","news_car","news_edu","news_tech","news_military","news_travel","news_world","news_stock","news_agriculture", "news_game"]
print(classification_report(y_test,y_pred,target_names=target_names))

                    precision    recall  f1-score   support

        news_story       0.49      0.43      0.46       116
      news_culture       0.51      0.44      0.47       367
news_entertainment       0.46      0.54      0.50       443
       news_sports       0.71      0.60      0.65       393
      news_finance       0.46      0.53      0.49       496
        news_house       0.61      0.52      0.56       183
          news_car       0.60      0.51      0.55       395
          news_edu       0.49      0.56      0.52       309
         news_tech       0.39      0.53      0.45       569
     news_military       0.53      0.45      0.49       344
       news_travel       0.51      0.48      0.49       346
        news_world       0.48      0.49      0.49       442
        news_stock       0.00      0.00      0.00        21
  news_agriculture       0.57      0.47      0.52       246
         news_game       0.63      0.50      0.56       330

          accuracy                    

In [10]:
# SVM分类器
from sklearn import svm
from sklearn.svm import SVC
svm_model = svm.SVC(kernel = 'linear',probability=True)
svm_model.fit(vec.transform(words), y_train)
svm_model.score(vec.transform(test_words), y_test)

0.4856

In [11]:
y_pred=svm_model.predict(vec.transform(test_words))
target_names=["news_story","news_culture", "news_entertainment", "news_sports","news_finance","news_house","news_car","news_edu","news_tech","news_military","news_travel","news_world","news_stock","news_agriculture", "news_game"]
print(classification_report(y_test,y_pred,target_names=target_names))

                    precision    recall  f1-score   support

        news_story       0.44      0.37      0.40       116
      news_culture       0.29      0.62      0.40       367
news_entertainment       0.42      0.53      0.47       443
       news_sports       0.70      0.56      0.63       393
      news_finance       0.42      0.44      0.43       496
        news_house       0.60      0.55      0.57       183
          news_car       0.56      0.51      0.54       395
          news_edu       0.57      0.49      0.53       309
         news_tech       0.54      0.48      0.51       569
     news_military       0.47      0.36      0.41       344
       news_travel       0.53      0.41      0.47       346
        news_world       0.51      0.50      0.50       442
        news_stock       0.12      0.05      0.07        21
  news_agriculture       0.53      0.41      0.46       246
         news_game       0.67      0.49      0.57       330

          accuracy                    

In [12]:
# 决策树
from sklearn import tree
tree_model = tree.DecisionTreeClassifier()
tree_model.fit(vec.transform(words), y_train)
tree_model.score(vec.transform(test_words), y_test)

0.4286

In [13]:
y_pred=tree_model.predict(vec.transform(test_words))
target_names=["news_story","news_culture", "news_entertainment", "news_sports","news_finance","news_house","news_car","news_edu","news_tech","news_military","news_travel","news_world","news_stock","news_agriculture", "news_game"]
print(classification_report(y_test,y_pred,target_names=target_names))

                    precision    recall  f1-score   support

        news_story       0.21      0.21      0.21       116
      news_culture       0.29      0.57      0.38       367
news_entertainment       0.41      0.44      0.42       443
       news_sports       0.58      0.53      0.56       393
      news_finance       0.40      0.41      0.41       496
        news_house       0.44      0.43      0.43       183
          news_car       0.48      0.47      0.48       395
          news_edu       0.47      0.46      0.47       309
         news_tech       0.46      0.41      0.43       569
     news_military       0.40      0.41      0.40       344
       news_travel       0.43      0.32      0.37       346
        news_world       0.49      0.41      0.45       442
        news_stock       0.29      0.10      0.14        21
  news_agriculture       0.39      0.34      0.37       246
         news_game       0.63      0.44      0.51       330

          accuracy                    

In [14]:
# 随机森林
from sklearn.ensemble import RandomForestClassifier
RF_model = RandomForestClassifier(class_weight='balanced', random_state=1)
RF_model.fit(vec.transform(words), y_train)
RF_model.score(vec.transform(test_words), y_test)

0.4654

In [15]:
y_pred=RF_model.predict(vec.transform(test_words))
target_names=["news_story","news_culture", "news_entertainment", "news_sports","news_finance","news_house","news_car","news_edu","news_tech","news_military","news_travel","news_world","news_stock","news_agriculture", "news_game"]
print(classification_report(y_test,y_pred,target_names=target_names))

                    precision    recall  f1-score   support

        news_story       0.24      0.38      0.30       116
      news_culture       0.34      0.57      0.42       367
news_entertainment       0.49      0.42      0.45       443
       news_sports       0.67      0.55      0.61       393
      news_finance       0.48      0.42      0.44       496
        news_house       0.41      0.54      0.46       183
          news_car       0.52      0.50      0.51       395
          news_edu       0.51      0.53      0.52       309
         news_tech       0.54      0.43      0.48       569
     news_military       0.43      0.45      0.44       344
       news_travel       0.46      0.39      0.42       346
        news_world       0.50      0.42      0.46       442
        news_stock       0.14      0.33      0.20        21
  news_agriculture       0.37      0.43      0.39       246
         news_game       0.58      0.53      0.55       330

          accuracy                    

# 可以使用另外一种方式来构造TF-IDF向量

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
 
vectorizer = TfidfVectorizer(analyzer='word', max_features=4000,  lowercase = False)
vectorizer.fit(words)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=False, max_df=1.0, max_features=4000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [17]:
feature = vectorizer.fit_transform(words)
# print('特征：\n', feature.toarray(), '\n\n或者\n', feature.todense())

In [18]:
# 贝叶斯分类器
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vectorizer.transform(words), y_train)
classifier.score(vectorizer.transform(test_words), y_test)

0.501

In [19]:
y_pred=classifier.predict(vec.transform(test_words))
target_names=["news_story","news_culture", "news_entertainment", "news_sports","news_finance","news_house","news_car","news_edu","news_tech","news_military","news_travel","news_world","news_stock","news_agriculture", "news_game"]
print(classification_report(y_test,y_pred,target_names=target_names))

                    precision    recall  f1-score   support

        news_story       0.62      0.40      0.48       116
      news_culture       0.52      0.42      0.46       367
news_entertainment       0.46      0.56      0.50       443
       news_sports       0.73      0.61      0.66       393
      news_finance       0.43      0.52      0.47       496
        news_house       0.66      0.49      0.56       183
          news_car       0.59      0.49      0.54       395
          news_edu       0.50      0.56      0.53       309
         news_tech       0.37      0.54      0.44       569
     news_military       0.53      0.43      0.48       344
       news_travel       0.54      0.48      0.51       346
        news_world       0.47      0.53      0.50       442
        news_stock       0.00      0.00      0.00        21
  news_agriculture       0.61      0.45      0.52       246
         news_game       0.64      0.51      0.57       330

          accuracy                    

In [20]:
# SVM分类器
from sklearn import svm
from sklearn.svm import SVC
svm_model = svm.SVC(kernel = 'linear',probability=True)
svm_model.fit(vectorizer.transform(words), y_train)
svm_model.score(vectorizer.transform(test_words), y_test)

0.5028

In [21]:
y_pred=svm_model.predict(vec.transform(test_words))
target_names=["news_story","news_culture", "news_entertainment", "news_sports","news_finance","news_house","news_car","news_edu","news_tech","news_military","news_travel","news_world","news_stock","news_agriculture", "news_game"]
print(classification_report(y_test,y_pred,target_names=target_names))

                    precision    recall  f1-score   support

        news_story       0.39      0.51      0.44       116
      news_culture       0.35      0.52      0.42       367
news_entertainment       0.46      0.50      0.48       443
       news_sports       0.70      0.57      0.63       393
      news_finance       0.42      0.46      0.44       496
        news_house       0.51      0.57      0.54       183
          news_car       0.58      0.48      0.53       395
          news_edu       0.53      0.54      0.53       309
         news_tech       0.52      0.50      0.51       569
     news_military       0.47      0.40      0.43       344
       news_travel       0.54      0.43      0.48       346
        news_world       0.47      0.56      0.51       442
        news_stock       0.12      0.05      0.07        21
  news_agriculture       0.57      0.44      0.50       246
         news_game       0.66      0.55      0.60       330

          accuracy                    

In [22]:
# 决策树
from sklearn import tree
tree_model = tree.DecisionTreeClassifier()
tree_model.fit(vectorizer.transform(words), y_train)
tree_model.score(vectorizer.transform(test_words), y_test)

0.444

In [23]:
y_pred=tree_model.predict(vec.transform(test_words))
target_names=["news_story","news_culture", "news_entertainment", "news_sports","news_finance","news_house","news_car","news_edu","news_tech","news_military","news_travel","news_world","news_stock","news_agriculture", "news_game"]
print(classification_report(y_test,y_pred,target_names=target_names))

                    precision    recall  f1-score   support

        news_story       0.24      0.26      0.25       116
      news_culture       0.26      0.54      0.35       367
news_entertainment       0.37      0.42      0.39       443
       news_sports       0.53      0.42      0.47       393
      news_finance       0.34      0.38      0.36       496
        news_house       0.40      0.38      0.39       183
          news_car       0.43      0.44      0.44       395
          news_edu       0.42      0.41      0.42       309
         news_tech       0.42      0.38      0.40       569
     news_military       0.41      0.35      0.38       344
       news_travel       0.42      0.29      0.35       346
        news_world       0.40      0.33      0.36       442
        news_stock       0.11      0.05      0.07        21
  news_agriculture       0.45      0.35      0.39       246
         news_game       0.51      0.34      0.41       330

          accuracy                    

In [24]:
# 随机森林
from sklearn.ensemble import RandomForestClassifier
RF_model = RandomForestClassifier(class_weight='balanced', random_state=1)
RF_model.fit(vectorizer.transform(words), y_train)
RF_model.score(vectorizer.transform(test_words), y_test)

0.4906

In [25]:
y_pred=RF_model.predict(vec.transform(test_words))
target_names=["news_story","news_culture", "news_entertainment", "news_sports","news_finance","news_house","news_car","news_edu","news_tech","news_military","news_travel","news_world","news_stock","news_agriculture", "news_game"]
print(classification_report(y_test,y_pred,target_names=target_names))

                    precision    recall  f1-score   support

        news_story       0.20      0.40      0.27       116
      news_culture       0.33      0.54      0.41       367
news_entertainment       0.47      0.41      0.44       443
       news_sports       0.63      0.52      0.57       393
      news_finance       0.45      0.35      0.39       496
        news_house       0.37      0.55      0.44       183
          news_car       0.48      0.47      0.47       395
          news_edu       0.51      0.51      0.51       309
         news_tech       0.51      0.39      0.44       569
     news_military       0.40      0.47      0.43       344
       news_travel       0.46      0.39      0.42       346
        news_world       0.51      0.36      0.43       442
        news_stock       0.10      0.33      0.15        21
  news_agriculture       0.37      0.42      0.39       246
         news_game       0.57      0.53      0.55       330

          accuracy                    