导入需要的库

In [1]:
import seaborn as sns  # 用于画图
from bs4 import BeautifulSoup  # 用于爬取arxiv的数据
import re  # 用于正则表达式，匹配字符串的模式
import requests  # 用于网络连接，发送网络请求，使用域名获取对应信息
import json  # 读取数据，我们的数据为json格式的
import pandas as pd  # 数据处理，数据分析
import matplotlib.pyplot as plt  # 画图工具

读取字段

In [7]:
data = []  # Accumulates one trimmed-down dict per line of the input file.
# A `with` block closes the file handle automatically, including when reading
# or JSON parsing raises.  The encoding is passed explicitly because open()
# otherwise uses the platform's locale encoding, which is not guaranteed to be
# UTF-8.  NOTE(review): presumably the snapshot is UTF-8 encoded -- confirm.
with open("H:\\arxiv-metadata-oai-snapshot.json", 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f):
        # Each line is parsed on its own as one JSON document.
        d = json.loads(line)
        # Keep only the three fields used further down.
        d = {
            'title': d['title'],
            'categories': d['categories'],
            'abstract': d['abstract'],
        }
        data.append(d)
        # The check runs after the append, so the record at index 200001 is
        # still kept: at most 200002 records are loaded.
        if idx > 200000:
            break
data = pd.DataFrame(data)  # List of dicts -> DataFrame, for analysis with pandas.

In [8]:
# Preview the first rows of the freshly loaded DataFrame.
data.head()

Unnamed: 0,title,categories,abstract
0,Calculation of prompt diphoton production cros...,hep-ph,A fully differential calculation in perturba...
1,Sparsity-certifying Graph Decompositions,math.CO cs.CG,"We describe a new algorithm, the $(k,\ell)$-..."
2,The evolution of the Earth-Moon system based o...,physics.gen-ph,The evolution of Earth-Moon system is descri...
3,A determinant of Stirling cycle numbers counts...,math.CO,We show that a determinant of Stirling cycle...
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,math.CA math.FA,In this paper we show how to compute the $\L...


为了方便数据的处理，我们可以将标题和摘要拼接在一起完成分类。

In [9]:
# Build a single lower-cased text field from title + abstract, with newlines
# turned into spaces.
# NOTE(review): title and abstract are concatenated with no separator; this
# presumably relies on the abstract starting with whitespace -- confirm.
data['text'] = (data['title'] + data['abstract']).map(
    lambda s: s.replace('\n', ' ').lower()
)
# The two source columns are no longer needed once 'text' exists.
data = data.drop(columns=['abstract', 'title'])

In [10]:
# Preview the first rows of the DataFrame after this point in the notebook.
data.head()

Unnamed: 0,categories,text
0,hep-ph,calculation of prompt diphoton production cros...
1,math.CO cs.CG,sparsity-certifying graph decompositions we d...
2,physics.gen-ph,the evolution of the earth-moon system based o...
3,math.CO,a determinant of stirling cycle numbers counts...
4,math.CA math.FA,from dyadic $\lambda_{\alpha}$ to $\lambda_{\a...


由于原始论文有可能有多个类别，所以也需要处理：

In [11]:
# Multiple categories, sub-category kept: split the raw field on single
# spaces into a list of category codes.
data['categories'] = data['categories'].map(lambda raw: raw.split(' '))
# Top-level category only: keep the part of each code before its first '.'.
data['categories_big'] = data['categories'].map(
    lambda codes: [code.partition('.')[0] for code in codes]
)

In [12]:
# Preview the first rows, including the list-valued category columns.
data.head()

Unnamed: 0,categories,text,categories_big
0,[hep-ph],calculation of prompt diphoton production cros...,[hep-ph]
1,"[math.CO, cs.CG]",sparsity-certifying graph decompositions we d...,"[math, cs]"
2,[physics.gen-ph],the evolution of the earth-moon system based o...,[physics]
3,[math.CO],a determinant of stirling cycle numbers counts...,[math]
4,"[math.CA, math.FA]",from dyadic $\lambda_{\alpha}$ to $\lambda_{\a...,"[math, math]"


然后将类别进行编码，这里每篇论文的类别可能有多个，所以需要使用多标签编码：

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encode each row's list of top-level categories as one row of a binary
# indicator matrix (one column per distinct category).
mlb = MultiLabelBinarizer()
data_label = mlb.fit_transform(data['categories_big'])

In [14]:
# Show the encoded label matrix produced by the MultiLabelBinarizer.
print(data_label)

[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


使用TF-IDF提取特征，限制最多4000个单词：

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the combined text column, with the vocabulary capped
# at 4000 terms via max_features.
vectorizer = TfidfVectorizer(max_features=4000)
data_tfidf = vectorizer.fit_transform(data['text'])
# Printing the result lists its (row, column) -> weight entries.
print(data_tfidf)

  (0, 1339)	0.07713966986425407
  (0, 3234)	0.08325700493814182
  (0, 2529)	0.050550785991702214
  (0, 412)	0.02966354825388307
  (0, 541)	0.035583150240498684
  (0, 3309)	0.0717364472740219
  (0, 3247)	0.08000737351942044
  (0, 1278)	0.07936703492862042
  (0, 3639)	0.02245218063552933
  (0, 3300)	0.07940510009735878
  (0, 2846)	0.06520385816094423
  (0, 2924)	0.07382373036178137
  (0, 3669)	0.06075918251245269
  (0, 478)	0.0827760270517401
  (0, 1752)	0.0817066965469597
  (0, 978)	0.0669526895342047
  (0, 686)	0.08582798690462216
  (0, 2076)	0.046788346600009675
  (0, 1274)	0.04492028467968419
  (0, 2849)	0.14685314000178262
  (0, 1153)	0.13723340756219451
  (0, 3301)	0.054627201472470396
  (0, 1160)	0.06716552478756714
  (0, 580)	0.1066405965713316
  (0, 3631)	0.0857274178984978
  :	:
  (200001, 2344)	0.07452962963174609
  (200001, 526)	0.04842670603988993
  (200001, 2548)	0.015567166944746604
  (200001, 211)	0.02388166583327765
  (200001, 2607)	0.025589729270363445
  (200001, 2011)	

由于这里是多标签分类，可以使用sklearn的多标签分类器进行封装：

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the samples for evaluation; random_state is fixed so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    data_tfidf,
    data_label,
    test_size=0.2,
    random_state=1,
)

# Build the multi-label model by wrapping a MultinomialNB base estimator in
# MultiOutputClassifier, then fit it on the training split.
clf = MultiOutputClassifier(MultinomialNB())
clf.fit(x_train, y_train)

验证模型的精度：

In [19]:
from sklearn.metrics import classification_report

# Print per-label precision / recall / F1 / support for the predictions made
# on the held-out split.
print(
    classification_report(
        y_test,
        clf.predict(x_test),
    )
)

              precision    recall  f1-score   support

           0       0.95      0.85      0.90      7872
           1       0.85      0.78      0.81      7329
           2       0.77      0.72      0.74      2970
           3       0.00      0.00      0.00         2
           4       0.72      0.47      0.57      2149
           5       0.51      0.67      0.58       993
           6       0.89      0.35      0.50       538
           7       0.71      0.68      0.70      3657
           8       0.75      0.62      0.68      3382
           9       0.85      0.88      0.86     10809
          10       0.41      0.11      0.18      1796
          11       0.80      0.04      0.07       737
          12       0.44      0.33      0.38       540
          13       0.52      0.34      0.41      1070
          14       0.70      0.15      0.25      3435
          15       0.83      0.19      0.31       687
          16       0.88      0.18      0.30       249
          17       0.89    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
