## Showcase of the C1 to CN transformation

In [1]:
import pandas as pd
from collections import defaultdict
import re
df = pd.read_pickle('../data/all_sdg.pkl')


In [2]:
for i, row in df.iterrows():
    print(row.C1)
    print('\n')
    if i == 100:
        break

[Pauw, W. P.] Frankfurt Sch Finance & Management, Frankfurt, Germany; [Pauw, W. P.] Univ Utrecht, Utrecht, Netherlands; [Pauw, W. P.] Stockholm Environm Inst, Stockholm, Sweden; [Pauw, W. P.] German Dev Inst, Bonn, Germany; [Castro, P.] Univ Zurich UZH, Zurich, Switzerland; [Pickering, J.] Univ Canberra, Canberra, ACT, Australia; [Bhasin, S.] CEEW, New Delhi, India


[Morozova, Irina M.] Volgograd State Tech Univ, Volgograd, Russia; [Litvinova, Tatiana N.] Volgograd State Agr Univ, Volgograd, Russia; [Przhedetskaya, Natalia, V; Sheveleva, Veronika V.] Rostov State Univ Econ, Rostov Na Donu, Russia


[Hourcade, Jean-Charles] CNRS, CIRED, EHESS, Nogent Sur Marne, France; [Dasgupta, Dipak] TERI, New Delhi, India; [Ghersi, Frederic] CNRS, CIRED, Nogent Sur Marne, France


[Larionova, M.; Safonkina, E.] Natl Econ & Publ Adm RANEPA, CIIR, Russian Presidential Acad, 11 Prechistenskaya Naberezhnaya, Moscow 119034, Russia; [Larionova, M.] Natl Res Univ, Fac World Econ & Int Affairs, Higher Sch 

In [95]:
# Regex with two groups. Authors group is optional as some C1 do not have any authors but the university address.
reg = re.compile(r"(\[(?P<authors>[^\[\]]+)])? (?P<country>[^\[\];]+);?")

In [96]:
# Special rules for different spelling in WOS
dic_country = {'Peoples R China': "China", 'England': 'United Kingdom',
               'Scotland': 'United Kingdom',
               'Wales': 'United Kingdom',
               'Northen Ireland': 'United Kingdom',
               'North Ireland': 'United Kingdom',
               'U Arab Emirates': 'United Arab Emirates',
               'Bosnia & Herceg': 'Bosnia and Herzegovina',
               'Trinidad Tobago': 'Trinidad and Tobago',
               'North Macedonia': 'Macedonia',
               'Papua N Guinea': 'Papua New Guinea',
               'DEM REP CONGO': 'Congo [DRC]',
               'Rep Congo': 'Congo [DRC]',
               'BELARUS': 'Belarus',
               'Cote Ivoire': 'Cote d\'Ivoire',
               'Marshall Island': 'Marshall Islands',
               'Dominican Rep': 'Dominican Republic',
               'Turks & Caicos': 'Turks and Caicos Islands',
               'St Helena': 'Saint Helena',
               'St Kitts & Nevi': 'Saint Kitts and Nevis',
               'St Vincent': 'Saint Vincent and the Grenadines',
               'Antigua & Barbu': 'Antigua and Barbuda',
               'Cent Afr Republ': 'Central African Republic',
               'Neth Antilles': 'Netherlands Antilles',
               }

In [80]:
def create_CN_string(txt):
    # Initializing a dic / we use a dic of set {C1 : set(authors), C2: set(authors)} in order to remove a double affiliation in the same country
    dic_affiliations = defaultdict(set)

    # finditer finds all iteration of our regex in the given text
    finditer = re.finditer(reg, txt)

    for elem in finditer:
        # Check for matches
        match_authors = elem.group('authors')
        match_country = elem.group("country")
        if match_authors:
            lst_authors = elem.group('authors').split(";")
        else:
            lst_authors = ["no author"]
        country = match_country.split(", ")[-1]
        # USA case that needs simplifying
        if "USA" in country:
            dic_affiliations["USA"] |= set(lst_authors)
        elif country in dic_country:
            dic_affiliations[dic_country[country]] |=  set(lst_authors)
        else:
            dic_affiliations[country] |=  set(lst_authors)
    # Prepping to return a string C1, C1, C1  len(set(authors)) times, C2 etc...
    return_lst = []
    for contr, set_authors in dic_affiliations.items():
        [return_lst.append(contr) for times in range(len(set_authors))]
    return_string = ", ".join(return_lst)
    return return_string


### Test cases
Test for case 3 : same author, different address, same country

In [87]:
test3 = "[Cuesta, Jose] World Bank, Washington, DC 20433 USA; [Cuesta, Jose] Georgetown Univ, Washington, DC 20057 USA; [Pizzolitto, Georgina] Yale Univ, New Haven, CT 06520 USA"
test3bis = "[Thanh Viet Nguyen; Tuyen Quang Tran] Vietnam Natl Univ, Univ Econ & Business, Hanoi, Vietnam; [Thanh Viet Nguyen] Hanoi Univ Nat Resources & Environm, Hanoi, Vietnam"

In [88]:
print(create_CN_string(test3))
print(create_CN_string(test3bis))

USA, USA
Vietnam, Vietnam


Test for case 4: same author, different address and different country

In [90]:
test4 = "[Cuesta, Jose] World Bank, Paris, France; [Cuesta, Jose] Georgetown Univ, Washington, DC 20057 USA; [Pizzolitto, Georgina] Yale Univ, New Haven, CT 06520 USA"
test4bis = "[Duclos, Jean-Yves] CSIC, Inst Analisi Econ, Barcelona, Spain; [Verdier-Chouchane, Audrey] African Dev Bank, Tunis, Tunisia; [Duclos, Jean-Yves] Univ Laval, Dept Econ, Quebec City, PQ G1V0A6, Canada"
print(create_CN_string(test4))
print(create_CN_string(test4bis))

France, USA, USA
Spain, Tunisia, Canada


Test for case where there is no author.

In [93]:
test5 = "Renmin Univ China, Sch Agr Econ & Rural Dev, Beijing 100872, Peoples R China"
test5bis = "Univ Appl Sci Western Switzerland, Sch Social Work HETS, CH-1211 Geneva 4, Switzerland"
print(create_CN_string(test5))
print(create_CN_string(test5bis))

China
Switzerland


Last test for the road

In [94]:
test6 = "[von Fintel, Marisa; Zoch, Asmus; van der Berg, Servaas] Stellenbosch Univ, Dept Econ, Room 508,5th Floor,Schumann Bldg,Bosman St, ZA-7600 Stellenbosch, South Africa ; Univ Appl Sci Western Switzerland, Sch Social Work HETS, CH-1211 Geneva 4, Switzerland"
print(create_CN_string(test6))

South Africa , South Africa , South Africa , Switzerland


In [3]:
# Test on small dataframe
df_test = pd.read_csv("../test_new_CN_column.csv", sep='\t', encoding='utf-8')
df_test

Unnamed: 0.1,Unnamed: 0,PT,AU,TI,SO,DE,AB,C1,EM,TC,...,AI,big_data,IOT,computing_infrastructure,blockchain,robotics,additive_manufacturing,Society,Economy,Environment
0,0,C,"Das, S; Mukhopadhyay, P",Multi-hazard disaster resilient housing with b...,7TH INTERNATIONAL CONFERENCE ON BUILDING RESIL...,Amphibious foundation; Assam-type building; Ba...,The state of Assam in the Brahmaputra valley i...,"[Das, Sutapa] Indian Inst Technol Kharagpur, D...",sutapa@arp.iitkgp.ernet.in,8,...,False,False,False,False,False,False,False,True,False,True
1,1,J,"Das, T",Does credit access lead to expansion of income...,INTERNATIONAL JOURNAL OF SOCIAL ECONOMICS,Rural credit; Heckit procedure; Income poverty...,Purpose The purpose of this paper is to evalua...,"[Das, Tiken] Indian Inst Publ Hlth, Shillong, ...",tikenhyd@gmail.com,6,...,False,False,False,False,False,False,False,True,True,False
2,2,J,"Paulson, K; Brazauskas, R; Khera, N; He, N; Ma...",Inferior Access to Allogeneic Transplant in Di...,BIOLOGY OF BLOOD AND MARROW TRANSPLANTATION,Access to transplantation; Health services res...,Allogeneic hematopoietic cell transplantation ...,"[Paulson, Kristjan; Szwajcer, David; Seftel, M...",kpaulson@cancercare.mb.ca,15,...,False,False,False,False,False,False,False,True,False,False
3,3,J,"Sudharsanan, N; Romano, S; Cunningham, SA",School Breakfast Receipt and Obesity among Ame...,JOURNAL OF THE ACADEMY OF NUTRITION AND DIETETICS,Childhood obesity; School breakfast; Nutrition,Background School breakfast consumption can im...,"[Sudharsanan, Nikkil] Univ Penn, Ctr Populat S...",nsud@sas.upenn.edu,6,...,False,False,False,False,False,False,False,True,False,False
4,4,J,"Abdel-Rahman, O",Socioeconomic predictors of suicide risk among...,CANCER EPIDEMIOLOGY,Suicide; Socioeconomic status; SES; United States,Objective: To assess the socioeconomic predict...,"[Abdel-Rahman, Omar] Univ Alberta, Cross Canc ...",omar.abdelsalam@ahs.ca,20,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,J,"Musal, M; Aktekin, T",Bayesian spatial modeling of HIV mortality via...,STATISTICS IN MEDICINE,HIV mortality; Bayesian Inference; spatial eff...,"In this paper, we investigate the effects of p...","[Aktekin, Tevfik] Univ New Hampshire, Dept Dec...",tevfik.aktekin@unh.edu,13,...,False,False,False,False,False,False,False,True,False,False
196,196,J,"Neckerman, KM; Garfinkel, I; Teitler, JO; Wald...",Beyond Income Poverty: Measuring Disadvantage ...,ACADEMIC PEDIATRICS,children; material hardship; poverty,The New York City (NYC) Longitudinal Study of ...,"[Neckerman, Kathryn M.; Garfinkel, Irwin; Teit...",kmn2@columbia.edu,39,...,False,False,False,False,False,False,False,True,False,False
197,197,J,"Aksman, E",How much would it cost to eliminate the at-ris...,ECONOMIC RESEARCH-EKONOMSKA ISTRAZIVANJA,the at-risk-of-poverty rate; cost of closing t...,The aim of this paper is to assess the cost of...,"[Aksman, Ewa] Warsaw Univ, Fac Econ, Warsaw, P...",aksman@wne.uw.edu.pl,0,...,False,False,False,False,False,False,False,True,False,False
198,198,C,"Sadyrtdinov, R; Rodnyansky, D; Strelnikova, T;...",Distribution of Households by Equivalent Incom...,PROCEEDINGS OF THE INTERNATIONAL CONFERENCE ON...,economics; equivalent income; poverty; region;...,Article presents the results of the Russian Fe...,"[Sadyrtdinov, Ruslan; Rodnyansky, Dmitry] Kaza...",s_ryslan@mail.ru; drodnyansky@gmail.com; kod21...,0,...,False,False,False,False,False,False,False,True,False,False


In [4]:
for i, row in df_test.iterrows():
    print(row.C1)
    print(row.CN)
    print('\n')

[Das, Sutapa] Indian Inst Technol Kharagpur, Dept Architecture & Reg Planning, Kharagpur 721302, W Bengal, India; [Mukhopadhyay, Parthasarathi] Indian Inst Engn Sci & Technol Shibpur, Dept Architecture Town & Reg Planning, Howrah 711103, India
India, India


[Das, Tiken] Indian Inst Publ Hlth, Shillong, Meghalaya, India
India


[Paulson, Kristjan; Szwajcer, David; Seftel, Matthew] Univ Manitoba, CancerCare Manitoba, 675 McDermot Ave, Winnipeg, MB R3E 0V9, Canada; [Brazauskas, Ruta] Med Coll Wisconsin, Inst Hlth & Soc, Div Biostat, Milwaukee, WI 53226 USA; [Brazauskas, Ruta; He, Naya; Saber, Wael] Med Coll Wisconsin, Ctr Int Blood & Marrow Transplant Res, Dept Med, Milwaukee, WI 53226 USA; [Khera, Nandita] Mayo Clin, Dept Hematol Oncol, Phoenix, AZ USA; [Majhail, Navneet] Cleveland Clin, Taussig Canc Inst, Blood & Marrow Transplant Program, Cleveland, OH 44106 USA; [Akpek, Gorgun] Rush Univ, Med Ctr, Dept Internal Med, Stem Cell Transplantat & Cell Therapy, Chicago, IL 60612 USA; [Aljur

In [5]:
for i, row in df_test.iterrows():
    if row.CN == "":
        print(row.C1)