In [60]:
import pandas as pd
from pathlib import Path
import json

### Load data from disk

In [68]:
# Read a single corpus shard (JSON lines; compression is inferred from the
# ".gz" extension) to prototype the filtering steps below.
test_data = pd.read_json(
    "Data Dump/s2-corpus-000.gz",
    lines=True,
    compression="infer",
)

In [62]:
# Columns kept from each corpus record; everything else is discarded.
subset = [
    "title",
    "paperAbstract",
    "authors",
    "doi",
    "year",
    "journalName",
    "journalVolume",
    "journalPages",
    "fieldsOfStudy",
]

In [63]:
# Restrict the frame to the columns listed in `subset`.
test_data = test_data[subset]

In [64]:
# Keep only records that actually carry an abstract (neither missing nor "").
abstract = test_data["paperAbstract"]
has_abstract = abstract.notna() & abstract.ne("")
test_data = test_data.loc[has_abstract]

In [65]:
# Preserve the original spelling in `journal_name` and normalise `journalName`
# (stripped, lower-cased) so it can be matched against the journal name set.
# `assign` builds a new frame instead of writing into a filtered slice, which
# avoids SettingWithCopyWarning; the `.str` accessor leaves missing names as
# NaN instead of raising AttributeError on them.
test_data = test_data.assign(
    journal_name=test_data["journalName"],
    journalName=test_data["journalName"].str.strip().str.lower(),
)

In [8]:
# Index by the normalised journal name; the column is moved into the index
# (set_index drops it from the columns by default).
test_data = test_data.set_index("journalName")

### Prepare Journal Name Set

In [9]:
# Locations of the two journal-name lookup files.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run elsewhere once they are made relative or configurable.
journalpath = Path(
    r"C:\Users\Markus\Documents\Cambridge_Projects\GroupProject\conservation_synthesis\data\issn_journal_map.json"
)
standardNamePath = Path(
    r"C:\Users\Markus\Documents\Cambridge_Projects\GroupProject\conservation_synthesis\data\journal_name_to_standard.json"
)

In [10]:
# Parse both lookup files from their raw bytes (json detects the encoding).
journals = json.loads(journalpath.read_bytes())
standardName = json.loads(standardNamePath.read_bytes())

In [11]:
# Every journal name known to either lookup: the values of the first mapping
# together with the keys of the second.
all_names = set(journals.values()).union(standardName)

In [12]:
import unidecode

In [13]:
# Names added by hand on top of those found in the lookup files.
extra_names = ["australian ecology", 'acta agriculturae scandinavica']

# Normalise every known name: strip, transliterate to ASCII, lower-case.
all_names_standardised = [
    unidecode.unidecode(raw_name.strip()).lower() for raw_name in all_names
] + extra_names

In [14]:
import re

# Matches a leading article "the" followed by whitespace and captures the rest
# of the name. `\s+` (rather than `\s*`) is required so that names that merely
# begin with the letters "the" (e.g. "theoretical ...") are not truncated, and
# the raw string avoids invalid-escape-sequence warnings for `\s`.
the_pattern = re.compile(r"^\s*the\s+(.*)", flags=re.IGNORECASE)

In [15]:
# For every name the pattern matches, also collect the captured remainder
# (the name without its leading "the").
article_matches = (the_pattern.search(candidate) for candidate in all_names_standardised)
more_names = [found.group(1) for found in article_matches if found is not None]

In [16]:
# Add the article-free variants to the list of normalised names (in place).
all_names_standardised += more_names

In [17]:
# De-duplicate into a set for fast membership tests and set operations.
all_names_set = {*all_names_standardised}

### Find the journal names that occur in the data

In [18]:
# Known journal names that actually occur in this frame's index.
indices_to_query = all_names_set.intersection(test_data.index)

In [19]:
# Show all rows whose journal is in the known set. `.loc` needs a list here:
# passing a set as an indexer raises TypeError in pandas >= 2.0.
test_data.loc[list(indices_to_query)]

Unnamed: 0_level_0,title,paperAbstract,authors,doi,year,journalVolume,journalPages,fieldsOfStudy,journal_name
journalName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
international journal of tropical insect science,"Influence of Maize, Cowpea and Sorghum Intercr...",The paper which covers a two season study from...,"[{'name': 'B. Amoako-Atta', 'ids': ['14363179...",10.1017/S1742758400004021,1983.0,4,47-57,[Biology],International Journal of Tropical Insect Science
weed research,Spatial distribution of weeds in arable crops:...,This paper reviews the literature concerning t...,"[{'name': 'Lisa J. Rew', 'ids': ['2549851']}, ...",10.1046/J.1365-3180.2001.00215.X,2001.0,41,1-18,[Computer Science],Weed Research
the journal of animal ecology,Early life learning ability predicts adult soc...,Social environments influence important ecolog...,"[{'name': 'Ellis J G Langley', 'ids': ['836577...",10.1111/1365-2656.13194,2020.0,,,"[Psychology, Medicine]",The Journal of animal ecology
hydrobiologia,Oligochaetes from six tropical crater lakes in...,The assemblage of littoral oligochaetes in six...,"[{'name': 'Laura Peralta', 'ids': ['145552933...",10.1007/978-94-010-0415-2_9,2002.0,467,109-116,[Biology],Hydrobiologia
hydrobiologia,Landscape responses to wetland eutrophication:...,Much of the historical Everglades has been eit...,"[{'name': 'Paul V. McCormick', 'ids': ['15255...",10.1007/s10750-008-9635-2,2008.0,621,105-114,[Environmental Science],Hydrobiologia
...,...,...,...,...,...,...,...,...,...
plant disease,Development of Phytophthora Fruit Rot Caused b...,Watermelon is an important crop grown in 44 st...,"[{'name': 'Chandrasekar S Kousik', 'ids': ['35...",10.1094/PDIS-06-17-0898-RE,2018.0,102 2,\n 370-374\n,"[Biology, Medicine]",Plant disease
west african journal of applied ecology,Use of Corn Cob and Rice Husk Biochar as Limin...,Most soils in Ghana are acid with those of the...,"[{'name': 'E. Frimpong Manso', 'ids': ['418548...",,2019.0,27,32-50,[Chemistry],West African Journal of Applied Ecology
aquacultural engineering,Energy use in Recirculating Aquaculture System...,Abstract Recirculating aquaculture systems (RA...,"[{'name': 'Maddi Badiola', 'ids': ['89678734'...",10.1016/J.AQUAENG.2018.03.003,2018.0,81,57-70,[Environmental Science],Aquacultural Engineering
invasive plant science and management,Role of nitrogen and herbicides in integrated ...,Abstract Mugwort (Artemisia vulgaris L.) is be...,"[{'name': 'Jatinder S. Aulakh', 'ids': ['14670...",10.1017/inp.2020.19,2020.0,13,189 - 198,[Biology],Invasive Plant Science and Management


### Make it fast: process the corpus files in parallel

In [20]:
# Requires the third-party `multiprocess` package (needed for this to work from IPython): %pip install multiprocess

In [23]:
from multiprocess import Pool
import functools

def get_relevant_data(relevant_journal_names, filepath):
    """Load one corpus file and keep only the papers from relevant journals.

    Parameters
    ----------
    relevant_journal_names : set of str
        Journal names to keep, already stripped and lower-cased.
    filepath : str or path-like
        JSON-lines file (optionally compressed; the compression is inferred)
        readable by ``pandas.read_json``.

    Returns
    -------
    pandas.DataFrame
        Records with a non-empty ``paperAbstract`` whose normalised journal
        name is in ``relevant_journal_names``. The frame is indexed by the
        normalised ``journalName``; the original spelling is kept in the
        ``journal_name`` column. Rows keep the order they have in the file.

    Raises
    ------
    KeyError
        If the file lacks one of the expected columns.
    """
    # Imported locally so the function is self-contained when it is shipped
    # to a worker process.
    import pandas as pd

    columns = ["title", "paperAbstract", "authors", "doi", "year", "journalName", "journalVolume", "journalPages", "fieldsOfStudy"]
    papers = pd.read_json(filepath, compression="infer", lines=True).loc[:, columns]

    # Drop records without a usable abstract (missing or empty string).
    abstracts = papers["paperAbstract"]
    papers = papers.loc[abstracts.notna() & (abstracts != "")]

    # `assign` creates a new frame (no SettingWithCopyWarning) and `.str`
    # tolerates missing journal names instead of raising on them.
    papers = papers.assign(
        journal_name=papers["journalName"],
        journalName=papers["journalName"].str.strip().str.lower(),
    ).set_index(keys="journalName", drop=True)

    # A boolean mask replaces `.loc[<set>]`, which raises TypeError in
    # pandas >= 2.0 and otherwise returned rows in arbitrary set order.
    return papers.loc[papers.index.isin(relevant_journal_names)]

In [None]:
# Slice of the corpus files handled by this batch.
# NOTE(review): the slice end is exclusive, so the entry at position 5999 is
# not processed -- confirm that this is intended.
start = 4500
end = 5999
files = Path(".") / "Data Dump"

# iterdir() yields entries in arbitrary order, so sort (lexicographically by
# path) to make [start:end] select the same files on every run. The directory
# is listed once and reused.
to_process = sorted(files.iterdir())
relevant = [f for f in to_process if f.name.startswith("s2-corpus")][start:end]

# Bind the journal name set so that each worker only receives a file path.
func = functools.partial(get_relevant_data, all_names_set)

# One filtered DataFrame per corpus file, in the order of `relevant`.
with Pool() as pool:
    acc = pool.map(func, relevant)

In [51]:
# Total number of matching rows across all files; summing the per-file
# lengths avoids building a throw-away concatenated frame just to count it.
sum(len(frame) for frame in acc)

468580

In [None]:
# Stack the per-file results into a single frame.
to_save = pd.concat(objs=acc)

In [None]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# index instead of turning it into a column.
to_save = to_save.reset_index(drop=True)

In [52]:
# Free the per-file frames now that they have been concatenated. `to_save`
# must stay alive: the following cell still writes it to disk, and deleting
# it here makes that cell fail with NameError on a top-to-bottom run.
del acc

In [None]:
# Write the combined batch to disk as JSON.
output_name = "data_from_4500_to_5999"
to_save.to_json(output_name)