In [1]:
import pandas as pd

In [52]:
# Load the raw positive-sample records from the JSON export into a DataFrame.
data = pd.read_json("positive_samples_v1_160221")

In [53]:
# Keep only the journal-related columns; every row of `data` is retained.
journals = data.loc[:, ["issn", "journal_name", "journal_volume", "journal_issue"]]

Split the journals into those with a defined ISSN and those without one.

In [54]:
# Treat an empty-string ISSN as missing so that `pd.isna` catches it below.
# Only the "issn" cell is cleared: assigning None through a row-only mask
# blanks every column of the row, including the journal name that the rows
# without an ISSN are matched on later.
journals.loc[journals["issn"] == "", "issn"] = None

In [78]:
# Partition the rows by whether an ISSN is present.
has_issn = pd.notna(journals["issn"])
journals_w_issn = journals.loc[has_issn]
journals_wo_issn = journals.loc[~has_issn]

In [56]:
from collections import defaultdict
# ISSN -> every non-empty journal name seen with that ISSN. Duplicates are
# kept at this point.
issn_name = defaultdict(list)

for issn, name in zip(journals_w_issn["issn"], journals_w_issn["journal_name"]):
    # `pd.notna` rejects both None and NaN, and `!=` compares the string's
    # value. The former `is not ""` was an identity test, which does not
    # reliably detect empty strings and raises a SyntaxWarning on Python 3.8+.
    if pd.notna(name) and name != "":
        issn_name[issn].append(name)

In [57]:
# Collapse each ISSN's name list to its distinct entries.
issn_name.update(
    {issn: pd.unique(names).tolist() for issn, names in issn_name.items()}
)

In [73]:
# Show every ISSN that is associated with more than one journal name.
for key, val in issn_name.items():
    if len(val) <= 1:
        continue
    print(f"{key} : {val}")

### Standardise Names & Remove wrong entries!

In [71]:
# Maps a spelling variant of a journal name (key) to the spelling used as the
# standard name (value).
name_to_standard = {'The Journal of Applied Ecology' : 'Journal of Applied Ecology',
                   'AGRICULTURE ECOSYSTEMS and ENVIRONMENT' : 'Agriculture, Ecosystems & Environment',
                    'Agriculture, Ecosystems and Environment' : 'Agriculture, Ecosystems & Environment',
                    'RESTORATION ECOLOGY' : 'Restoration Ecology',
                    'The Journal of Wildlife Management':'Journal of Wildlife Management',
                    'Waterbirds: The International Journal of Waterbird Biology' : 'Waterbirds',
                    'Animal conservation' : 'Animal Conservation',
                    'VEGETATIO' : 'Vegetatio',
                    'ICES Journal of Marine Science: Journal du Conseil': 'ICES Journal of Marine Science',
                    'The Journal of Animal Ecology':'Journal of Animal Ecology',
                    'Journal Of Insect Conservation' : 'Journal of Insect Conservation',
                    'Acta oecologica':'Acta Oecologica',
                    'PLOS ONE':'PLoS ONE',
                    'The Journal of Ecology':'Journal of Ecology',
                    'Mammalian Biology - Zeitschrift für Säugetierkunde':'Mammalian Biology',
                    'Animal welfare':'Animal Welfare',
                    'mammalia':'Mammalia',
                    'Knowledge and Management of Aquatic Ecosystems':'Knowledge & Management of Aquatic Ecosystems',
                    'AMBIO':'Ambio',
                    'Limnologica':'Limnologica - Ecology and Management of Inland Waters',
                    'Flora':'Flora - Morphology, Distribution, Functional Ecology of Plants',
                    'Bulletin français de la pêche et de la pisciculture' : 'Knowledge and Management of Aquatic Ecosystems' # changed name; NOTE(review): this value is itself a key above that maps to the '&' spelling - confirm which spelling is intended
                   }

# ISSN -> a journal name recorded with that ISSN that is to be removed from the
# ISSN's list of names.
wrong_issn = { # wrong content
    '0767-2861' : 'Knowledge and Management of Aquatic Ecosystems', # fixed; correct ISSN, name switch
    '1864-1547' : "Main Group Metal Chemistry", # just a wrong name
    '0022-409X' : 'Rangeland Ecology & Management', # wrong name
    '1051-0761' : 'The Bulletin of the Ecological Society of America', # wrong name
}
# Journal name -> ISSN noted as the real one for two of the names above.
# NOTE(review): not referenced by any of the visible cells.
real_issn = {
    'Rangeland Ecology & Management' : "1550-7424",
    "Main Group Metal Chemistry" : '2191-0219'
}

# Weird entry: "Journal of Cultural Economy"

In [59]:
# Trade for standard name
# Every ISSN is processed, not only those with several names, so a journal
# that was recorded under a single non-standard spelling is normalised too.
# `dict.fromkeys` removes the duplicates created by the mapping while keeping
# the order of first appearance.
issn_name.update({
    issn: list(dict.fromkeys(name_to_standard.get(name, name) for name in names))
    for issn, names in issn_name.items()
})

In [72]:
# Delete wrong issn entries
for issn, wrong_name in wrong_issn.items():
    # Check membership first: indexing the defaultdict would insert an empty
    # list for an ISSN that never occurred in the data.
    if issn not in issn_name:
        continue
    # Filter instead of taking a set difference so the remaining names keep
    # their order; the next step picks the first name of each list, and set
    # iteration order for strings varies between interpreter runs.
    issn_name[issn] = list(
        dict.fromkeys(name for name in issn_name[issn] if name != wrong_name)
    )

In [89]:
# ISSN : [NAME] --> ISSN : NAME
# Keep the first name of each list. ISSNs whose list is empty are left out
# instead of raising IndexError, and values that are already plain strings are
# kept as they are, so re-running this cell does not cut names down to their
# first character.
issn_name = {
    issn: names if isinstance(names, str) else names[0]
    for issn, names in issn_name.items()
    if names
}

### Process entries without issn

In [80]:
# Drop the rows that have no journal name either.
has_name = journals_wo_issn["journal_name"].notna()
journals_wo_issn = journals_wo_issn.loc[has_name]

In [90]:
# Journal names of the ISSN-less rows that do not occur among the values of
# `issn_name` (one entry per row, so repeats are possible).
unknown_names = [
    name
    for name in journals_wo_issn["journal_name"]
    if name not in issn_name.values()
]

In [91]:
unknown_names

['Zoological Science', 'Journal of Cultural Economy']

In [94]:
# Hand-assigned ISSN -> journal name entries for the two names listed in
# `unknown_names`. The commented-out entries are alternative ISSNs tagged
# "PRINT" by the author (presumably the print editions - confirm).
unknown_issn_name = {
   "2212-3830" : 'Zoological Science',
   #"0289-0003" : 'Zoological Science', #PRINT
   "1753-0369" : "Journal of Cultural Economy",
#     "1753-0350": "Journal of Cultural Economy"#PRINT
}

In [95]:
# Merge the hand-assigned entries into the ISSN -> name map (in place).
issn_name.update(unknown_issn_name)

In [96]:
issn_name

{'0021-8901': 'Journal of Applied Ecology',
 '0929-1393': 'Applied Soil Ecology',
 '1528-7092': 'Southeastern Naturalist',
 '0006-3207': 'Biological Conservation',
 '0006-3657': 'Bird Study',
 '0305-1838': 'Mammal Review',
 '0030-6053': 'Oryx',
 '0167-8809': 'Agriculture, Ecosystems & Environment',
 '0273-8570': 'Journal of Field Ornithology',
 '1061-2971': 'Restoration Ecology',
 '0003-6862': 'Applied Entomology and Zoology',
 '0888-8892': 'Conservation Biology',
 '0261-2194': 'Crop Protection',
 '0022-541X': 'Journal of Wildlife Management',
 '1524-4695': 'Waterbirds',
 '1051-0761': 'Ecological Applications',
 '0722-4060': 'Polar Biology',
 '1367-9430': 'Animal Conservation',
 '0018-506X': 'Hormones and Behavior',
 '0043-5643': 'The Wilson Bulletin',
 '0378-1127': 'Forest Ecology and Management',
 '0010-5422': 'The Condor',
 '0960-3115': 'Biodiversity and Conservation',
 '0030-5693': 'Ornis Scandinavica',
 '0042-3106': 'Vegetatio',
 '0959-2709': 'Bird Conservation International',
 '1

CHECK FOR UNIQUENESS OF ISSN

In [97]:
# NOTE(review): dictionary keys are unique by construction, so this count
# always equals len(issn_name) and cannot reveal duplicate ISSNs.
len(set(issn_name.keys()))

369

In [98]:
len(issn_name)

369

In [143]:
import json
from pathlib import Path

# Write the ISSN -> journal name map to a JSON file in the working directory.
store_journal_issn_map = Path(".") / "issn_journal_map.json"
with open(store_journal_issn_map, "w") as f:
    f.write(json.dumps(issn_name))

In [144]:
# Write the name -> standard-name map to its own JSON file.
# NOTE: the path variable keeps the name `store_journal_issn_map` although this
# file holds a different map.
store_journal_issn_map = Path(".") / "journal_name_to_standard.json"
with open(store_journal_issn_map, "w") as f:
    f.write(json.dumps(name_to_standard))

Standardise use of ISSN / JOURNAL NAMES

In [110]:
# Build a frame indexed by ISSN; drop=False keeps "issn" as a regular column too.
otherIndex = data.set_index(keys="issn", drop=False)

Fix the wrong ISSNs.

In [114]:
# Knowledge and Management of Aquatic Ecosystems: replace ISSN '0767-2861' by
# '1961-9502'. The rows are selected with a boolean mask on the column rather
# than by index label, because a label-based `.loc` assignment appends a new
# row when the label does not exist.
otherIndex.loc[otherIndex["issn"] == '0767-2861', "issn"] = '1961-9502'

UPDATE ALL NAMES WITH ISSNs

In [121]:
# Discard the current index and go back to a default integer index.
otherIndex = otherIndex.reset_index(drop=True)

In [126]:
# Overwrite the journal name with the name registered for the row's ISSN.
# Rows whose ISSN is missing, empty, or absent from `issn_name` keep their
# current name; a plain `issn_name.get(...)` would replace it with None.
issn_col = otherIndex["issn"]
has_known_issn = (
    issn_col.notna() & (issn_col != "") & issn_col.isin(list(issn_name))
)
# `.to_numpy()` makes the assignment positional, so it does not depend on the
# index labels being unique.
otherIndex.loc[has_known_issn, "journal_name"] = (
    issn_col[has_known_issn].map(issn_name).to_numpy()
)

In [139]:
# Reverse lookup: journal name -> ISSN. When several ISSNs share a name, the
# one that comes last in `issn_name` is kept.
name_issn = dict(zip(issn_name.values(), issn_name.keys()))

In [132]:
# Fill in a missing ISSN from the journal name wherever the name is known.
# An empty string counts as missing as well, in line with the `!= ""` guard
# used when the names are updated from the ISSNs.
issn_col = otherIndex["issn"]
missing_issn = issn_col.isna() | (issn_col == "")
for ind, name in otherIndex.loc[missing_issn, "journal_name"].items():
    if name in name_issn:
        otherIndex.loc[ind, "issn"] = name_issn[name]

In [135]:
otherIndex.loc[pd.isna(otherIndex["issn"])]

Unnamed: 0,title,authors,doi,publication_date,abstract,repo_identifier,language,publisher,journal_name,journal_volume,journal_issue,issn,url,index,abstract_origin,topic_classification
118,Impact of habitat management on grey partridge...,"[Pierre Mayot, Eve Corda, François Reitz, Elis...",10.1111/j.0021-8901.2004.00939.x,,1 The grey partridge is a species of conservat...,openaire,EN,Wiley,,,,,,144,repo,Birds; Farmland
950,Dispersal of carabid species along a linear se...,[H. Gruttke],10.1007/978-94-017-0968-2_45,,,openaire,EN,Springer Netherlands,,,,,,1252,invalid_repo,Farmland
2024,Cover Crops and Related Methods for Enhancing ...,"[Hugh A. Smith, J. M. Holland, P. G. Tillman]",10.1002/9781118231838.ch19,,This chapter contains sections titled:\n\n ...,openaire,EN,"John Wiley & Sons, Ltd",,,,,,2752,scraped,Farmland
2311,Relations of native and exotic species 5 years...,"[Timothy B. Harrington, David H. Peter]",10.2737/pnw-rp-589,,,openaire,EN,"U.S. Department of Agriculture, Forest Service...",,,,,,3097,repo,Farmland
2716,Provisioning and tourism in free-ranging Japan...,[Hiroyuki Kurita],10.1017/cbo9781139087407.005,,IntroductionPrimate-focused tourism in Japan b...,openaire,EN,Cambridge University Press,,,,,,3548,scraped,Mammals
3301,The swift fox reintroduction program in Canada...,"[Ludwig N.Carbyn, Harry J.Armbruster, CharlesM...",10.1017/cbo9780511623325.014,1994-10-27,"IntroductionThe swift fox (Vulpes velox), the ...",crossref,EN,Cambridge University Press,,,,,http://dx.doi.org/10.1017/cbo9780511623325.014,4246,scraped,Mammals
3306,Use of Electric Fencing and Associated Measure...,"[Fernando R. Tortato, Sandra M. C. Cavalcanti,...",10.1007/978-1-4614-0902-1_16,,The Pantanal of Brazil is an important area fo...,openaire,EN,Springer New York,,,,,,4253,repo,Mammals
4576,Hepatic iron accumulation over time in Europea...,"[Michael T. Maslanka, Susan D. Crissey, Susan ...",10.1638/1042-7260(2000)031[0491:hiaoti]2.0.co;2,,European starlings (Sturnus vulgaris) were use...,openaire,EN,,,,,,,5866,repo,Animals ex-situ
5756,Ecological husbandry and reproduction of the M...,[Daniel Pearson],10.3854/crm.6.a24p146,2013-11-27,,openaire,EN,Chelonian Research Foundation,,,,,,7264,repo,Animals ex-situ
6083,Implementing new portable touchscreen-setups t...,[Vanessa Schmitt],10.1101/316042,,<jats:title>Abstracts</jats:title><jats:p>To u...,openaire,EN,Cold Spring Harbor Laboratory,,,,,,7652,repo,Animals ex-situ


In [138]:
# Additional ISSN -> journal name entries; they are merged into `issn_name` at
# the end of this cell. (The variable says "isbn" but the keys are ISSNs; the
# name is kept so other references to it still work.)
# The stray trailing "." in the second journal name has been removed.
new_isbn_fields = {"0034-7744": "Revista de Biología Tropical",
                  "1442-7001" : "Ecological Management & Restoration"}

# Row label in `otherIndex` -> ISSN to enter for that row. The journal name is
# not stored here; it is looked up in `issn_name` when the values are applied.
fill_empty_fields = {
    118:"0021-8901",
    # 950: book
    # 2024: book
    # 2311: "Grey" (presumably grey literature - confirm)
    # 2716: book
    # 3301: book
    # 3306: book
    4576: "1042-7260",
    10972: "0969-997X",
    17769:"0034-7744",
    14407:"0034-7744",
    12747:"1442-7001",

}

issn_name.update(new_isbn_fields)

In [142]:
# Enter the hand-picked ISSN for each listed row, then the journal name that
# `issn_name` holds for it (KeyError if the ISSN is not in the map).
for row_label, issn_code in fill_empty_fields.items():
    otherIndex.loc[row_label, "issn"] = issn_code
    otherIndex.loc[row_label, "journal_name"] = issn_name[issn_code]

Clear away any tags in titles / abstracts!

In [161]:
import re
# Matches any opening or closing JATS tag, e.g. <jats:p>, </jats:title>,
# <jats:inline-formula> or <jats:p xml:lang="en">. `[^>]*` replaces `\w*` so
# that hyphenated tag names and tags carrying attributes are matched as well.
pattern = re.compile(r"</?jats:[^>]*>")

In [157]:
# Abstract of row 6083, presumably kept as a sample for trying out the
# tag-stripping pattern; not used by any of the visible cells.
test = otherIndex.loc[6083, "abstract"]

In [171]:
# Replace every match of `pattern` in the abstracts by a single space.
# Only real strings are processed: a missing abstract may be None or NaN, and
# `pattern.sub` raises TypeError for anything that is not a string.
for ind, abstract in otherIndex["abstract"].items():
    if isinstance(abstract, str):
        otherIndex.loc[ind, "abstract"] = pattern.sub(" ", abstract)
        

In [172]:
# Second pass: attribute-free tags made of word characters, such as <i> or
# </sub>. This rebinds `pattern`, so cells that use `pattern` pick up this
# regex when they are run afterwards.
pattern = re.compile(r"<\w*>|</\w*>")

In [174]:
# Replace every match of the current `pattern` in the abstracts by a space.
# Only real strings are processed: a missing abstract may be None or NaN, and
# `pattern.sub` raises TypeError for anything that is not a string.
for ind, abstract in otherIndex["abstract"].items():
    if isinstance(abstract, str):
        otherIndex.loc[ind, "abstract"] = pattern.sub(" ", abstract)

In [176]:
# Replace every match of the current `pattern` in the titles by a space.
# Only real strings are processed: a missing title may be None or NaN, and
# `pattern.sub` raises TypeError for anything that is not a string.
for ind, title in otherIndex["title"].items():
    if isinstance(title, str):
        otherIndex.loc[ind, "title"] = pattern.sub(" ", title)

In [178]:
otherIndex

Unnamed: 0,title,authors,doi,publication_date,abstract,repo_identifier,language,publisher,journal_name,journal_volume,journal_issue,issn,url,index,abstract_origin,topic_classification
0,The Management of Grass Pastures for Brent Geese,"[J. A.Vickery, W. J.Sutherland, S. J.Lane]",10.2307/2404543,1994-5,An increasing number of brent geese now feed i...,crossref,EN,JSTOR,Journal of Applied Ecology,31,2,0021-8901,http://dx.doi.org/10.2307/2404543,0,repo,Farmland; Birds
1,Control of Molinia caerulea on upland moors,"[R. H.Marrs, J. D. P.Phillips, P. A.Todd, J.Gh...",10.1111/j.0021-8901.2004.00901.x,2004-4,1 Molinia encroachment has been viewed as a ma...,crossref,EN,Wiley,Journal of Applied Ecology,41,2,0021-8901,http://dx.doi.org/10.1111/j.0021-8901.2004.009...,2,repo,Shrubland
2,Long-distance relocation of nestboxes reduces ...,"[AlbertoSorace, FabrizioPetrassi, CarloConsiglio]",10.1080/00063650409461343,2004-7,Capsule Clutches of hole-nesting passerines su...,crossref,EN,Informa UK Limited,Bird Study,51,2,0006-3657,http://dx.doi.org/10.1080/00063650409461343,3,scraped,Birds
3,Reducing the density of breeding gulls influen...,"[S. K.Finney, M. P.Harris, L. F.Keller, D. A.E...",10.1046/j.1365-2664.2003.00810.x,2003-6,"1 By acting as both competitors and predators,...",crossref,EN,Wiley,Journal of Applied Ecology,40,3,0021-8901,http://dx.doi.org/10.1046/j.1365-2664.2003.008...,4,repo,Birds
4,Elements that promote highway crossing structu...,"[WayneMcDonald, Colleen CassadySt Clair]",10.1111/j.1365-2664.2004.00877.x,2004-2-12,1 \nCorridors provide important structural con...,crossref,EN,Wiley,Journal of Applied Ecology,41,1,0021-8901,http://dx.doi.org/10.1111/j.1365-2664.2004.008...,5,scraped,Mammals
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18188,Historical peat loss explains limited short-te...,"[Jennifer Williamson, David Norris, Chris D. E...",10.1016/j.jenvman.2016.12.018,2017-3,This study assessed the short-term impacts of ...,openaire,EN,Elsevier BV,Journal of Environmental Management,188,,0301-4797,http://dx.doi.org/10.1016/j.jenvman.2016.12.018,20893,repo,Wetland
18189,Wood chip soil amendments in restored wetlands...,"[Evan C.Wolf, EliškaRejmánková, David J.Cooper]",10.1111/rec.12942,2019-4-2,Adding chipped wood to soil ameliorates compac...,crossref,EN,Wiley,Restoration Ecology,27,5,1061-2971,http://dx.doi.org/10.1111/rec.12942,20894,scraped,Wetland
18190,Is the cutting of oil contaminated marshes an ...,"[André L.T.O.Wolinski, Paulo C.Lana, LeonardoS...",10.1016/j.marpolbul.2011.03.024,2011-6,Cutting and removal of oil-impacted marsh plan...,crossref,EN,Elsevier BV,Marine Pollution Bulletin,62,6,0025-326X,http://dx.doi.org/10.1016/j.marpolbul.2011.03.024,20895,scraped,Wetland
18191,Restoration of inland brackish vegetation by l...,"[MinekeWolters, Saskiade Vries, Wim A.Ozinga, ...",10.1111/avsc.12323,2017-8-20,Question Does large-scale transfer of coastal ...,crossref,EN,Wiley,Applied Vegetation Science,20,4,1402-2001,http://dx.doi.org/10.1111/avsc.12323,20896,repo,Wetland


In [177]:
# Save the current state of `otherIndex` as JSON.
otherIndex.to_json("positive_samples_v2_170221_corrected_journals_removed_tags")

In [183]:
# Remove every record with ISSN 0034-7744; rows without an ISSN are kept.
is_excluded = otherIndex["issn"] == "0034-7744"
otherIndex = otherIndex.loc[~is_excluded]

In [None]:
# Save the filtered frame under a separate file name.
# NOTE(review): this cell shows no execution count, and the file name ends in
# "spani" - confirm that it ran and that the name is as intended.
otherIndex.to_json("positive_samples_v2_1_170221_removed_spani")

Get Data about Volume + Issue of Journals

In [2]:
import json
from pathlib import Path

# Read the saved samples and the ISSN -> journal name map back from disk.
data = pd.read_json("positive_samples_v2_170221_corrected_journals_removed_tags")
with open("issn_journal_map.json") as f:
    issn_name = json.loads(f.read())

In [3]:
from collections import defaultdict
# ISSN -> list of (volume, issue) pairs, one pair per record.
issn_vol_iss = defaultdict(list)

for issn, vol, iss in zip(data["issn"], data["journal_volume"], data["journal_issue"]):
    # `pd.notna` rejects NaN as well as None; with `is not None` a NaN ISSN
    # became a dictionary key of its own. An empty string is skipped too, as it
    # does not identify a journal.
    if pd.notna(issn) and issn != "":
        issn_vol_iss[issn].append((vol, iss))

In [4]:
# Reduce each ISSN's list of (volume, issue) pairs to its distinct entries.
issn_vol_iss.update(
    {issn: pd.unique(pairs).tolist() for issn, pairs in issn_vol_iss.items()}
)

In [5]:
# Keep only the pairs that carry a volume. `pd.notna` drops NaN volumes along
# with None; the former `is not None` test let NaN through.
for key, val in issn_vol_iss.items():
    issn_vol_iss[key] = [v for v in val if pd.notna(v[0]) and v[0] != ""]

In [13]:
# Write the ISSN -> (volume, issue) map to a JSON file.
with open("issn_vol_iss_map.json", "w") as f:
    f.write(json.dumps(issn_vol_iss))

Get the number of entries per volume