In [1]:
# 导入所需的package
import seaborn as sns #用于画图
from bs4 import BeautifulSoup #用于爬取arxiv的数据
import re #用于正则表达式，匹配字符串的模式
import requests #用于网络连接，发送网络请求，使用域名获取对应信息
import json #读取数据，我们的数据为json格式的
import pandas as pd #数据处理，数据分析
import matplotlib.pyplot as plt #画图工具

In [2]:
def readArxivFile(path, columns=['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
       'report-no', 'categories', 'license', 'abstract', 'versions',
       'update_date', 'authors_parsed'], count=None):
    '''
    定义读取文件的函数
        path: 文件路径
        columns: 需要选择的列
        count: 读取行数
    '''
    
    data  = []
    with open(path, 'r') as f: 
        for idx, line in enumerate(f): 
            if idx == count:
                break
                
            d = json.loads(line)
            d = {col : d[col] for col in columns}
            data.append(d)

    data = pd.DataFrame(data)
    return data

data = readArxivFile('arxiv-metadata-oai-snapshot.json', 
                     ['id', 'title', 'categories', 'abstract'],200000)

In [3]:
data

Unnamed: 0,abstract,categories,id,title
0,A fully differential calculation in perturba...,hep-ph,0704.0001,Calculation of prompt diphoton production cros...
1,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG,0704.0002,Sparsity-certifying Graph Decompositions
2,The evolution of Earth-Moon system is descri...,physics.gen-ph,0704.0003,The evolution of the Earth-Moon system based o...
3,We show that a determinant of Stirling cycle...,math.CO,0704.0004,A determinant of Stirling cycle numbers counts...
4,In this paper we show how to compute the $\L...,math.CA math.FA,0704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...
5,We study the two-particle wave function of p...,cond-mat.mes-hall,0704.0006,Bosonic characters of atomic Cooper pairs acro...
6,A rather non-standard quantum representation...,gr-qc,0704.0007,Polymer Quantum Mechanics and its Continuum Limit
7,A general formulation was developed to repre...,cond-mat.mtrl-sci,0704.0008,Numerical solution of shock and ramp compressi...
8,We discuss the results from the combined IRA...,astro-ph,0704.0009,"The Spitzer c2d Survey of Large, Nearby, Inste..."
9,Partial cubes are isometric subgraphs of hyp...,math.CO,0704.0010,"Partial cubes: structures, characterizations, ..."


In [4]:
data['text'] = data['title'] + data['abstract']

In [5]:
data.head()

Unnamed: 0,abstract,categories,id,title,text
0,A fully differential calculation in perturba...,hep-ph,704.0001,Calculation of prompt diphoton production cros...,Calculation of prompt diphoton production cros...
1,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG,704.0002,Sparsity-certifying Graph Decompositions,Sparsity-certifying Graph Decompositions We d...
2,The evolution of Earth-Moon system is descri...,physics.gen-ph,704.0003,The evolution of the Earth-Moon system based o...,The evolution of the Earth-Moon system based o...
3,We show that a determinant of Stirling cycle...,math.CO,704.0004,A determinant of Stirling cycle numbers counts...,A determinant of Stirling cycle numbers counts...
4,In this paper we show how to compute the $\L...,math.CA math.FA,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...


In [6]:

data['text'] = data['text'].apply(lambda x: x.replace('\n',' '))
data['text'] = data['text'].apply(lambda x: x.lower())
data = data.drop(['abstract', 'title'], axis=1)

In [7]:
data

Unnamed: 0,categories,id,text
0,hep-ph,0704.0001,calculation of prompt diphoton production cros...
1,math.CO cs.CG,0704.0002,sparsity-certifying graph decompositions we d...
2,physics.gen-ph,0704.0003,the evolution of the earth-moon system based o...
3,math.CO,0704.0004,a determinant of stirling cycle numbers counts...
4,math.CA math.FA,0704.0005,from dyadic $\lambda_{\alpha}$ to $\lambda_{\a...
5,cond-mat.mes-hall,0704.0006,bosonic characters of atomic cooper pairs acro...
6,gr-qc,0704.0007,polymer quantum mechanics and its continuum li...
7,cond-mat.mtrl-sci,0704.0008,numerical solution of shock and ramp compressi...
8,astro-ph,0704.0009,"the spitzer c2d survey of large, nearby, inste..."
9,math.CO,0704.0010,"partial cubes: structures, characterizations, ..."


In [8]:
# 多个类别，包含子分类
data['categories'] = data['categories'].apply(lambda x : x.split(' '))

In [9]:
data

Unnamed: 0,categories,id,text
0,[hep-ph],0704.0001,calculation of prompt diphoton production cros...
1,"[math.CO, cs.CG]",0704.0002,sparsity-certifying graph decompositions we d...
2,[physics.gen-ph],0704.0003,the evolution of the earth-moon system based o...
3,[math.CO],0704.0004,a determinant of stirling cycle numbers counts...
4,"[math.CA, math.FA]",0704.0005,from dyadic $\lambda_{\alpha}$ to $\lambda_{\a...
5,[cond-mat.mes-hall],0704.0006,bosonic characters of atomic cooper pairs acro...
6,[gr-qc],0704.0007,polymer quantum mechanics and its continuum li...
7,[cond-mat.mtrl-sci],0704.0008,numerical solution of shock and ramp compressi...
8,[astro-ph],0704.0009,"the spitzer c2d survey of large, nearby, inste..."
9,[math.CO],0704.0010,"partial cubes: structures, characterizations, ..."


In [10]:
# 单个类别，不包含子分类
data['categories_big'] = data['categories'].apply(lambda x : [xx.split('.')[0] for xx in x])

In [11]:
data

Unnamed: 0,categories,id,text,categories_big
0,[hep-ph],0704.0001,calculation of prompt diphoton production cros...,[hep-ph]
1,"[math.CO, cs.CG]",0704.0002,sparsity-certifying graph decompositions we d...,"[math, cs]"
2,[physics.gen-ph],0704.0003,the evolution of the earth-moon system based o...,[physics]
3,[math.CO],0704.0004,a determinant of stirling cycle numbers counts...,[math]
4,"[math.CA, math.FA]",0704.0005,from dyadic $\lambda_{\alpha}$ to $\lambda_{\a...,"[math, math]"
5,[cond-mat.mes-hall],0704.0006,bosonic characters of atomic cooper pairs acro...,[cond-mat]
6,[gr-qc],0704.0007,polymer quantum mechanics and its continuum li...,[gr-qc]
7,[cond-mat.mtrl-sci],0704.0008,numerical solution of shock and ramp compressi...,[cond-mat]
8,[astro-ph],0704.0009,"the spitzer c2d survey of large, nearby, inste...",[astro-ph]
9,[math.CO],0704.0010,"partial cubes: structures, characterizations, ...",[math]


In [12]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
data_label = mlb.fit_transform(data['categories_big'].iloc[:])

In [13]:
data_label

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [14]:

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=4000)
data_tfidf = vectorizer.fit_transform(data['text'].iloc[:])

In [15]:
# 划分训练集和验证集
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_tfidf, data_label,test_size = 0.2,random_state = 16)

In [16]:
# 用sklearn构建多标签分类模型
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB  # 朴素贝叶斯多分类器
clf = MultiOutputClassifier(MultinomialNB()).fit(x_train, y_train)

In [17]:
from sklearn.metrics import accuracy_score
##计算模型的准确率
print(clf.score(x_train,y_train))
print('训练数据集准确率',accuracy_score(y_train,clf.predict(x_train)))
print(clf.score(x_test,y_test))
print('测试集准确率',accuracy_score(y_test,clf.predict(x_test)))

0.57288125
训练数据集准确率 0.57288125
0.56925
测试集准确率 0.56925


In [18]:
from sklearn.metrics import classification_report
print(classification_report(y_test, clf.predict(x_test)))

              precision    recall  f1-score   support

           0       0.95      0.84      0.90      7756
           1       0.85      0.79      0.82      7392
           2       0.75      0.73      0.74      2891
           3       0.00      0.00      0.00         1
           4       0.72      0.48      0.57      2128
           5       0.47      0.67      0.55       905
           6       0.93      0.36      0.52       563
           7       0.71      0.68      0.69      3601
           8       0.75      0.60      0.67      3425
           9       0.85      0.88      0.87     10945
          10       0.46      0.12      0.19      1835
          11       0.79      0.05      0.10       724
          12       0.47      0.37      0.41       523
          13       0.55      0.37      0.44      1103
          14       0.67      0.14      0.23      3392
          15       0.77      0.17      0.27       653
          16       0.91      0.16      0.27       260
          17       0.89    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:

# parameter
max_features= 500
max_len= 150
embed_size=100
batch_size = 32
epochs = 5

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

ModuleNotFoundError: No module named 'keras'