In [1]:
import os
import re
import numpy as np
from bs4 import BeautifulSoup

# Lower-case spellings that are looked up (as substrings) in reference notes.
# Every spelling is listed exactly once; the language a misspelling or
# alternative name belongs to is given in the comment above its group.
lang_spellings = [
    # language names
    'armenian',
    'belarusian',
    'bulgarian',
    'catalan',
    'chinese',
    'croatian',
    'czech',
    'danish',
    'dutch',
    'english',
    'esperanto',
    'estonian',
    'farsi',
    'finnish',
    'french',
    'georgian',
    'german',
    'greek',
    'hebrew',
    'hindi',
    'hungarian',
    'icelandic',
    'indonesian',
    'italian',
    'japanese',
    'korean',
    'latin',
    'latvian',
    'macedonian',
    'marathi',
    'norwegian',
    'polish',
    'portuguese',
    'romanian',
    'russian',
    'sanskrit',
    'serbian',
    'slovak',
    'slovenian',
    'spanish',
    'swedish',
    'turkish',
    'ukrainian',
    'vietnamese',
    # chinese
    'chines',
    'chineses',
    'chinses',
    # german
    'deutsch',
    'germain',
    'geman',
    # finnish
    'finish',
    # french
    'franch',
    # hungarian
    'hugarian',
    # italian
    'itallian',
    # japanese
    'janpanese',
    'japanease',
    'japaneese',
    'japanes',
    'japanase',
    'japaneses',
    'japaniese',
    'japanse',
    'japansese',
    'japenese',
    'japense',
    'japnese',
    # macedonian
    'makedonian',
    # farsi
    'persian',
    # portuguese
    'portuguise',
    'protuguese',
    # russian
    'rassian',
    'rissian',
    'rrussian',
    'ruassian',
    'rus',
    'rusian',
    'russ',
    'russain',
    'russan',
    'russe',
    'russiam',
    'russiand',
    'russin',
    'russina',
    'russinan',
    'russion',
    'russisan',
    'russsian',
    'rudssian',
    # romanian
    'roumanian',
    # slovenian
    'slovene',
    # ukrainian
    'ukainian',
    'ukraininian',
    'ukranian',
    'ukrainain'
]

In [2]:
# process GROBID output

# bracketed "in <word>" remark, e.g. "[in Russian]" or "(in German)";
# matched case-insensitively
in_lang_patt = re.compile(
    r'[\[\(]\s*in\s*\w+\s*[\]\)]',
    flags=re.IGNORECASE
)
# file names of the form <digits>_<digits>_<pre|pub><anything>;
# groups 1-3 hold the two numbers and the 'pre'/'pub' marker
fn_patt = re.compile(
    r'^(\d+)_(\d+)_((pre)|(pub)).*$'
)

def count_cross_ling_cits(path):
    """Count the reference entries of one parsed paper that mention a language.

    The file at ``path`` is parsed with BeautifulSoup/lxml and every
    ``biblstruct`` element is treated as one reference entry. An entry is
    counted (at most once) when

    * it has ``note`` children and the text of any of them contains one of
      ``lang_spellings``, or
    * it has no ``note`` children and any bracketed "in <word>" remark found
      by ``in_lang_patt`` in the entry text contains one of ``lang_spellings``.

    Args:
        path: path of the XML file to analyse.

    Returns:
        The number of matching reference entries, or ``-1`` when the file
        contains no ``biblstruct`` element at all.
    """
    # Hand raw bytes to BeautifulSoup so that it determines the encoding from
    # the document itself instead of relying on the platform default.
    with open(path, 'rb') as f:
        soup = BeautifulSoup(f, 'lxml')

    refsecitems = soup.find_all('biblstruct')
    if not refsecitems:
        return -1

    def mentions_language(text):
        # NOTE(review): this is a plain substring test, so short spellings
        # such as 'rus' or 'chines' also hit unrelated words ('virus',
        # 'machines') - confirm whether whole-word matching is wanted.
        lowered = text.lower()
        return any(sp in lowered for sp in lang_spellings)

    cross_ling_cit_count = 0
    for refsecitem in refsecitems:
        notes = refsecitem.find_all('note')
        if notes:
            # detected by GROBID as a note, just look for languages mentioned;
            # several matching notes still make up a single citation
            is_cross_ling = any(mentions_language(note.text) for note in notes)
        else:
            # looking at complete reference section entry string, heuristically
            # match notes; inspect every bracketed remark so that an earlier
            # one such as "(in press)" does not hide a later "[in Russian]"
            is_cross_ling = any(
                mentions_language(m.group(0))
                for m in in_lang_patt.finditer(refsecitem.text)
            )
        if is_cross_ling:
            cross_ling_cit_count += 1
    return cross_ling_cit_count

directory = 'grobid_out'

# paper index -> {'mid': <second file name field>,
#                 'pre': [counts ...], 'pub': [counts ...]}
cit_counts = {}

for fn in os.listdir(directory):
    if not fn.endswith('.xml'):
        continue
    m = fn_patt.match(fn)
    if m is None:
        # file name does not follow <idx>_<mid>_<pre|pub>...; skip it instead
        # of failing with AttributeError on m.group()
        continue
    ppr_idx, ppr_mid, ppr_typ = m.group(1, 2, 3)
    count = count_cross_ling_cits(os.path.join(directory, fn))
    if count == -1:
        # no reference entries were found in this file
        continue
    paper_counts = cit_counts.setdefault(ppr_idx, {})
    paper_counts['mid'] = ppr_mid
    paper_counts.setdefault(ppr_typ, []).append(count)

In [3]:
# analyze deltas

deltas = []
total_citations = 0

for counts in cit_counts.values():
    # a delta needs both a 'pub' and a 'pre' entry for the paper
    if 'pub' not in counts or 'pre' not in counts:
        continue
    pre_max = max(counts['pre'])
    pub_max = max(counts['pub'])
    total_citations += pre_max
    deltas.append(pub_max - pre_max)

num_increased = sum(1 for d in deltas if d > 0)
num_decreased = sum(1 for d in deltas if d < 0)

print('total cross-lingual citations: {}'.format(total_citations))
print('#paper pairs: {}'.format(len(deltas)))
print('#inc: {}'.format(num_increased))
print('#dec: {}'.format(num_decreased))
print('mean: {}'.format(np.mean(deltas)))
print('median: {}'.format(np.median(deltas)))
print('std: {}'.format(np.std(deltas)))

total cross-lingual citations: 689
#paper pairs: 498
#inc: 33
#dec: 70
mean: -0.11646586345381527
median: 0.0
std: 0.8204771115648289


In [4]:
# analyze deltas from manual analysis on 100 sample papers

manual_deltas = [
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, -1, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


def _print_delta_summary(values):
    """Print the pair count, #increases, #decreases, mean, median and std of ``values``."""
    increased = sum(1 for v in values if v > 0)
    decreased = sum(1 for v in values if v < 0)
    print('#paper pairs: {}'.format(len(values)))
    print('#inc: {}'.format(increased))
    print('#dec: {}'.format(decreased))
    print('mean: {}'.format(np.mean(values)))
    print('median: {}'.format(np.median(values)))
    print('std: {}'.format(np.std(values)))


_print_delta_summary(manual_deltas)

#paper pairs: 100
#inc: 4
#dec: 7
mean: -0.02
median: 0.0
std: 0.5287721626560914
