In [2]:
import pickle
import pandas as pd
import numpy as np
import os

# Let DataFrame cells show up to 100 characters before pandas truncates them with "...".
pd.options.display.max_colwidth = 100

In [27]:
from lxml import etree
from tqdm.notebook import tqdm
import os

def _element_text(element):
    """Return the whitespace-stripped text of an XML element.

    Yields '' both when the element is missing (``None``) and when it exists
    but carries no text (``element.text is None``), so callers never have to
    deal with ``None`` leaking into formatted strings.
    """
    if element is None or element.text is None:
        return ''
    return element.text.strip()


def _source_pdf_name(filename):
    """Derive the name of the source PDF from an XML output filename.

    ``foo.tei.xml`` and ``foo.xml`` both map to ``foo.pdf``. Any other name
    falls back to the historical rule of replacing the last 7 characters
    with ``pdf``.
    """
    for suffix in ('.tei.xml', '.xml'):
        if filename.endswith(suffix):
            return filename[:-len(suffix)] + '.pdf'
    return filename[:-7] + 'pdf'


def extract_info(xml_string, filename):
    """Extract bibliographic records from the ``biblStruct`` elements of an XML document.

    Args:
        xml_string: The XML document (bytes or str) handed to ``etree.fromstring``.
        filename: Name of the XML file; used only to derive the
            'Original Filename' value (the matching ``.pdf`` name).

    Returns:
        A list of dicts, one per ``biblStruct`` that has at least one non-blank
        author name and a non-blank publication venue. Each dict has the keys
        'Original Filename', 'Title', 'Authors' (list of "First Last" strings)
        and 'Publication Venue'. 'Title' is '' when no level-``a`` title exists.

    Raises:
        Whatever ``etree.fromstring`` raises when ``xml_string`` is not
        well-formed XML; the error is not caught here.
    """
    root = etree.fromstring(xml_string)
    source_pdf = _source_pdf_name(filename)  # identical for every record of this file

    information_list = []
    for bibl_struct in root.findall(".//{*}biblStruct"):
        author_names = []
        for person in bibl_struct.findall(".//{*}author/{*}persName"):
            first_name = _element_text(person.find("{*}forename[@type='first']"))
            last_name = _element_text(person.find("{*}surname"))
            # strip() removes the stray space left when one of the parts is missing.
            full_name = f"{first_name} {last_name}".strip()
            if full_name:
                author_names.append(full_name)

        # Prefer the journal title (level 'j'); fall back to the monograph
        # title (level 'm') when the journal title is absent or empty.
        venue = (
            _element_text(bibl_struct.find(".//{*}title[@level='j']"))
            or _element_text(bibl_struct.find(".//{*}title[@level='m']"))
        )

        # Records without any usable author or venue are discarded.
        if not author_names or not venue:
            continue

        information_list.append({
            'Original Filename': source_pdf,
            'Title': _element_text(bibl_struct.find(".//{*}title[@level='a']")),
            'Authors': author_names,
            'Publication Venue': venue,
        })

    return information_list

# Directory containing the per-paper XML files to parse.
base_dir = './dataset/GROBID-output/references'
citations = []
failed_files = []  # (filename, error message) for every file that could not be parsed

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# citations_df reproducible across runs and machines. Filtering up front also
# lets the progress bar count only the files that are actually processed.
xml_filenames = sorted(name for name in os.listdir(base_dir) if name.endswith('.xml'))

for filename in tqdm(xml_filenames):
    # Read first and close the handle before the (slower) parsing step.
    with open(os.path.join(base_dir, filename), 'rb') as file:
        xml_bytes = file.read()
    try:
        citations.extend(extract_info(xml_bytes, filename))
    except etree.XMLSyntaxError as error:
        # One malformed file should not abort the whole batch; record it instead.
        failed_files.append((filename, str(error)))

if failed_files:
    print(f"Skipped {len(failed_files)} unparsable XML file(s); see `failed_files` for details.")

# One row per extracted citation record.
citations_df = pd.DataFrame(citations)

  0%|          | 0/10027 [00:00<?, ?it/s]

In [28]:
# Preview the extracted citation records: as the cell's last expression, the
# DataFrame is rendered as the cell output.
citations_df

Unnamed: 0,Original Filename,Title,Authors,Publication Venue
0,0704.0213.pdf,Tensor product multiplicities and convex polytopes in partition space,"[A Berenstein, A Zelevinsky]",J. Geom. Phys
1,0704.0213.pdf,On the computation of Clebsch-Gordan coefficients and the dilation effect,"[J Loera, T Mcallister]",Experiment. Math
2,0704.0213.pdf,Polynomial algorithms for computing the Smith and Hermite normal forms of an integer matrix,"[R Kannan, A Bachem]",SIAM J. Comput
3,0704.0213.pdf,A new polynomial-time algorithm for linear programming,[N Karmarkar],Combinatorica
4,0704.0213.pdf,A polynomial algorithm for linear programming,[L Khachian],Doklady Akedamii Nauk SSSR
...,...,...,...,...
338458,2112.04902.pdf,,"[C Spielberger, R Gorsuch]","State-trait anxiety inventory for adults: Manual and sample: Manual, instrument and scoring guide"
338459,2112.04902.pdf,Real-time fmri neurofeedback: progress and challenges,"[J Sulzer, S Haller, F Scharnowski, N Weiskopf, N Birbaumer, M Blefari, A Bruehl, L Cohen, R Dec...",Neuroimage
338460,2112.04902.pdf,,"[F Weathers, D Blake, P Schnurr, D Kaloupek, B Marx, T Keane]",The clinician-administered ptsd scale for dsm-5 (caps-5)
338461,2112.04902.pdf,Conn: a functional connectivity toolbox for correlated and anticorrelated brain networks,"[S Whitfield-Gabrieli, A Nieto-Castanon]",Brain connectivity


In [31]:
import pickle

# Persist the citations table to disk, serialised with the highest pickle
# protocol this interpreter supports.
with open("citations_df.pkl", 'wb') as pickle_sink:
    pickle.Pickler(pickle_sink, protocol=pickle.HIGHEST_PROTOCOL).dump(citations_df)