# Data cleaning

## Main dataset

In [2]:
import pandas as pd
from os.path import join
import numpy as np

### Read the data

In [29]:
# Raw input and cleaned output live in the same folder; the raw export is
# stored as "<fname>_raw.csv".
src = dst = "data"
fname = "mapOSR_data_V5_9_3_220419_coded"
raw_path = join(src, fname + "_raw.csv")
data = pd.read_csv(raw_path)

### Clean the data

In [30]:
# Rename the coded columns (strip the "V=" prefix) and drop the columns
# that are not needed for the analysis.
data = data\
    .rename(columns={'V=Action':'Action',
                     'V=Method':'Method',
                     'V=Discipline':'Discipline',
                     'V=Group':'Group',
                     'V=Geo-Scope':'Geo'})\
    .drop(columns=['Unnamed: 25', "Coder", "IS_Kommentar",
                   "DuplicateTitle", "DuplicateDOI",
                   "Comments", "Book Author Full Names", "Group Authors",
                   "Publication Type", "Cited References", "Book DOI",
                   "Book Editors"])

# Remove line breaks from the abstracts. The vectorised string accessor
# leaves missing abstracts as NaN instead of raising AttributeError the way
# a plain `x.replace(...)` inside `apply` does on a float NaN.
data["Abstract"] = data["Abstract"].str.replace("\n", "", regex=False)

In [31]:
# Correction tables: every key is a wrong or non-canonical code found in the
# raw data, every value is the code it is replaced with. A value of NaN
# marks an entry that is dropped during cleaning.

# corrections for the "Action" column
miscodings_action = {
    'policies': 'openpolicies',
    'openacces': 'openaccess',
    'missing': np.nan,  # general missing entry code
}

# corrections for the "Method" column
miscodings_method = {
    'interviews': 'interview',
    'other(casestudies)': 'othermeth',
    'desk-review': 'othermeth',
    'deskresearch': 'othermeth',
    'bibliometric': 'biblio',
    'documentanalysis': 'othermeth',
    'other': 'othermeth',
    'missing': np.nan,  # general missing entry code
}

# corrections for the "Discipline" column
miscodings_discipline = {
    'socsci': 'socscie',
    'socscie(LIS)': 'socscie',
    'socscie(psychology)': 'socscie',
    'socscie?(informationsciences)': 'socscie',
    'soscie': 'socscie',
    'socsie': 'socscie',
    'nonspecific': 'nonspecificdisc',
    'missing': np.nan,  # general missing entry code
}

# corrections for the "Group" column
miscodings_group = {
    'librarians': 'librarian',
    'publishers': 'publisher',
    'reseaercher': 'researcher',
    'researchers': 'researcher',
    'all': 'othergroup',
    'none': 'othergroup',
    'other': 'othergroup',
    'missing': np.nan,  # general missing entry code
}

# corrections for the "Geo" column
miscodings_geo = {
    'TW': 'TWN',
    'IR': 'IRN',
    'IN': 'IND',
    'Italy': 'ITA',
    'PK': 'PAK',
    'SI': 'SVN',
    'LA': 'SA',
    'CHI': 'CHN',
    'CND': 'CAN',
    'DEN': 'DNK',
    'NDL': 'NLD',
    'SUI': 'CHE',
    'HNK': 'HKG',
    'GNR': 'GRL',
    'nonspecific': 'nonspecificgeo',
    'missing': np.nan,  # general missing entry code
}

# lookup from column name to the correction table of that column
miscodings = dict(
    Action=miscodings_action,
    Method=miscodings_method,
    Discipline=miscodings_discipline,
    Group=miscodings_group,
    Geo=miscodings_geo,
)

In [32]:
def split_list(list_string, cat_col, codings=None):
    '''
    Takes a string containing a list of encodings separated by semicolons
    (colons and commas are accepted as separators too), cleans the list,
    splits it into different entries and returns a list of entries. Also
    corrects misspellings along the way.

    Parameters
    ----------
    list_string : str or NaN
        Raw cell content, e.g. "a=openaccess; a=opendata".
    cat_col : str
        Name of the coded column; selects the correction table.
    codings : dict, optional
        Mapping column name -> {wrong code: corrected code or NaN}.
        Defaults to the notebook-level ``miscodings`` dictionary.

    Returns
    -------
    list of str, or NaN if ``list_string`` is NaN. Empty entries and entries
    whose correction is NaN are dropped.

    Raises
    ------
    KeyError
        If ``cat_col`` has no correction table.
    '''
    if list_string != list_string:  # NaN is the only value unequal to itself
        return np.nan

    if codings is None:
        codings = miscodings  # looked up at call time, like the original
    corrections = codings[cat_col]

    # ':' and ',' are treated as alternative separators for ';'
    normalized = list_string.replace(':', ';').replace(',', ';')

    entries = []
    for raw in normalized.split(';'):
        # remove every kind of whitespace (spaces, tabs, line breaks)
        e = ''.join(raw.split())
        # remove the leading column code ("a=", "m=", ...); with several
        # '=' only the part after the last one is kept instead of silently
        # leaving the entry untouched
        e = e.rpartition('=')[2]
        # clean up wrong encodings
        e = corrections.get(e, e)
        # skip empty entries and entries corrected to NaN
        if e != '' and e == e:
            entries.append(e)
    return entries

In [33]:
# Convert the raw code string of each of the five coded columns into a
# cleaned list of codes (missing cells stay NaN).
cols = ['Action', 'Method', 'Discipline', 'Group', 'Geo']
for column in cols:
    data[column] = data[column].apply(split_list, args=(column,))

### Sanity checks

Look at the remaining categories in each coded column

In [34]:
# The cleaned "Action" column must contain exactly these codes.
expected_actions = {
    'openaccess',
    'opendata',
    'openeducation',
    'openevaluation',
    'openmethod',
    'openparticipation',
    'openpolicies',
    'openscience',
    'opensoftware',
    'opentools',
}

# Collect every code that occurs in any row; NaN rows (codes != codes)
# are skipped.
actions = {code for codes in data['Action'] if codes == codes for code in codes}

assert actions == expected_actions

In [35]:
# The cleaned "Method" column must contain exactly these codes.
expected_methods = {'biblio', 'documentreview', 'interview', 'othermeth', 'survey'}

# Collect every code that occurs in any row; NaN rows are skipped.
methods = {code for codes in data['Method'] if codes == codes for code in codes}

assert methods == expected_methods

In [36]:
# The cleaned "Discipline" column must contain exactly these codes.
expected_disciplines = {
    'natscie', 'engtech', 'med', 'agric', 'socscie', 'hum', 'nonspecificdisc',
}

# Collect every code that occurs in any row; NaN rows are skipped.
disciplines = {
    code for codes in data['Discipline'] if codes == codes for code in codes
}

assert disciplines == expected_disciplines

In [37]:
# The cleaned "Group" column must contain exactly these codes.
expected_groups = {
    'researcher',
    'librarian',
    'university',
    'unisupportstaff',
    'publisher',
    'policy',
    'funder',
    'business',
    'practitioner',
    'othergroup',
}

# Collect every code that occurs in any row; NaN rows are skipped.
groups = {code for codes in data['Group'] if codes == codes for code in codes}

assert groups == expected_groups

In [38]:
# Valid geo codes: the "iso_a3" column of the country code table plus the
# placeholder for studies without a specific region. Missing iso_a3 cells
# are filled with "NA" (presumably because pandas parses a literal "NA" as
# missing -- TODO confirm against country_codes.csv).
country_codes = pd.read_csv(
    join(src, "country_codes.csv"),
    usecols=["iso_a3"],
    na_values=["none"],
)["iso_a3"]
expected_geo = set(country_codes.fillna("NA").values) | {"nonspecificgeo"}

# Collect every code that occurs in any row; NaN rows are skipped.
geo = {code for codes in data['Geo'] if codes == codes for code in codes}

# Every geo code in the data has to be a known one (not every known code
# has to occur).
assert not geo - expected_geo

### Dummy code the coded columns

In [39]:
# Build one 0/1 indicator column "<column>_<code>" for every code found in
# each coded column. All indicators are collected first and attached in a
# single concat: inserting them one at a time fragments the DataFrame and
# makes pandas emit a PerformanceWarning.
dummy_columns = {}
for entries, colname in zip([actions, methods, disciplines, groups, geo], cols):
    # sorted() makes the column order reproducible (set order is not);
    # NaN codes are skipped
    for entry in sorted(e for e in entries if e == e):
        # `entry=entry` binds the current code to the lambda; rows without
        # a code list (NaN) get 0
        dummy_columns[f'{colname}_{entry}'] = data[colname].apply(
            lambda codes, entry=entry: int(isinstance(codes, list) and entry in codes)
        )

# Drop indicators left over from an earlier run of this cell so that
# re-running it overwrites instead of duplicating columns.
data = data.drop(columns=list(dummy_columns), errors="ignore")
data = pd.concat([data, pd.DataFrame(dummy_columns, index=data.index)], axis=1)


### Export the cleaned data

In [40]:
# Write the cleaned table next to the raw one, without the index column.
clean_path = join(dst, fname + "_clean.csv")
data.to_csv(clean_path, index=False)

## Interrater reliability dataset

### Read the data

In [42]:
# Raw input and cleaned output live in the same folder; the raw export is
# stored as "<fname>_raw.csv".
src = dst = "data"
fname = "mapOSR_interrater_reliability"
raw_path = join(src, fname + "_raw" + ".csv")
data = pd.read_csv(raw_path)

### Clean the data

In [43]:
def split_list(list_string, cat_col, codings=None):
    '''
    Takes a string containing a list of encodings separated by semicolons
    (colons and commas are accepted as separators too), cleans the list,
    splits it into different entries and returns a list of entries. Also
    corrects misspellings along the way.

    Parameters
    ----------
    list_string : str or NaN
        Raw cell content, e.g. "a=openaccess; a=opendata".
    cat_col : str
        Name of the coded column; selects the correction table.
    codings : dict, optional
        Mapping column name -> {wrong code: corrected code or NaN}.
        Defaults to the notebook-level ``miscodings`` dictionary.

    Returns
    -------
    list of str, or NaN if ``list_string`` is NaN. Empty entries and entries
    whose correction is NaN are dropped.

    Raises
    ------
    KeyError
        If ``cat_col`` has no correction table.
    '''
    if list_string != list_string:  # NaN is the only value unequal to itself
        return np.nan

    if codings is None:
        codings = miscodings  # looked up at call time, like the original
    corrections = codings[cat_col]

    # ':' and ',' are treated as alternative separators for ';'
    normalized = list_string.replace(':', ';').replace(',', ';')

    entries = []
    for raw in normalized.split(';'):
        # remove every kind of whitespace (spaces, tabs, line breaks)
        e = ''.join(raw.split())
        # remove the leading column code ("a=", "m=", ...); with several
        # '=' only the part after the last one is kept instead of silently
        # leaving the entry untouched
        e = e.rpartition('=')[2]
        # clean up wrong encodings
        e = corrections.get(e, e)
        # skip empty entries and entries corrected to NaN
        if e != '' and e == e:
            entries.append(e)
    return entries

In [44]:
# Correction tables for the interrater data: every key is a wrong or
# non-canonical code found in the raw data, every value is the code it is
# replaced with. A value of NaN marks an entry that is dropped during
# cleaning.

# corrections for the action columns
miscodings_action = {
    'openpolicy': 'openpolicies',
    'openacces': 'openaccess',
    'opentool': 'opentools',
    'missing': np.nan,  # general missing entry code
}

# corrections for the method columns
miscodings_method = {
    'documentanalysis': 'documentreview',
    'bibliometric': 'biblio',
    'other': 'othermeth',
    'none': np.nan,
    'missing': np.nan,  # general missing entry code
}

# corrections for the discipline columns
miscodings_discipline = {
    'soscie': 'socscie',
    'socscie(psychology)': 'socscie',
    'all': 'nonspecificdisc',
    'nonspecific': 'nonspecificdisc',
    'none': np.nan,
    'missing': np.nan,  # general missing entry code
}

# corrections for the group columns
miscodings_group = {
    'librarians': 'librarian',
    'publishers': 'publisher',
    'researchers': 'researcher',
    'other': 'othergroup',
    'none': np.nan,
    'missing': np.nan,  # general missing entry code
}

# corrections for the geo columns
miscodings_geo = {
    'IN': 'IND',
    '?': np.nan,
    'LA': 'SA',
    'nonspecific': 'nonspecificgeo',
    'all': 'nonspecificgeo',
    'none': np.nan,
    'missing': np.nan,  # general missing entry code
}

# Both raters (c1, c2) share the same correction table per category, so the
# column -> table lookup is generated instead of written out.
miscodings = {
    f'{rater}_{category}': table
    for category, table in (
        ('action', miscodings_action),
        ('method', miscodings_method),
        ('discipline', miscodings_discipline),
        ('group', miscodings_group),
        ('geo', miscodings_geo),
    )
    for rater in ('c1', 'c2')
}

In [45]:
# Convert the raw code strings of the five coded categories, for both raters
# (ten columns in total), into cleaned lists of codes.
cols = ['c1_action', 'c1_method', 'c1_discipline', 'c1_group', 'c1_geo',
        'c2_action', 'c2_method', 'c2_discipline', 'c2_group', 'c2_geo']
for column in cols:
    data[column] = data[column].apply(split_list, args=(column,))

### Sanity checks

Look at the remaining categories in each coded column

In [46]:
# Codes that are allowed in the action columns.
expected_actions = {
    'openaccess',
    'opendata',
    'openeducation',
    'openevaluation',
    'openmethod',
    'openparticipation',
    'openpolicies',
    'openscience',
    'opensoftware',
    'opentools',
}

# For each rater: every code that occurs must be an allowed one
# (NaN rows are skipped).
for rater in ('c1', 'c2'):
    coded = data[f'{rater}_action']
    actions = {code for codes in coded if codes == codes for code in codes}

    assert not actions - expected_actions

In [47]:
# Codes that are allowed in the method columns.
expected_methods = {'biblio', 'documentreview', 'interview', 'othermeth', 'survey'}

# For each rater: every code that occurs must be an allowed one
# (NaN rows are skipped).
for rater in ('c1', 'c2'):
    coded = data[f'{rater}_method']
    methods = {code for codes in coded if codes == codes for code in codes}

    assert not methods - expected_methods

In [48]:
# Codes that are allowed in the discipline columns.
expected_disciplines = {
    'natscie', 'engtech', 'med', 'agric', 'socscie', 'hum', 'nonspecificdisc',
}

# For each rater: every code that occurs must be an allowed one
# (NaN rows are skipped).
for rater in ('c1', 'c2'):
    coded = data[f'{rater}_discipline']
    disciplines = {code for codes in coded if codes == codes for code in codes}

    assert not disciplines - expected_disciplines

In [49]:
# Codes that are allowed in the group columns.
expected_groups = {
    'researcher',
    'librarian',
    'university',
    'unisupportstaff',
    'publisher',
    'policy',
    'funder',
    'business',
    'practitioner',
    'othergroup',
    'nonspecific',
}

# For each rater: every code that occurs must be an allowed one
# (NaN rows are skipped).
for rater in ('c1', 'c2'):
    coded = data[f'{rater}_group']
    groups = {code for codes in coded if codes == codes for code in codes}

    assert not groups - expected_groups

In [50]:
# Valid geo codes: the "iso_a3" column of the country code table plus the
# placeholder for studies without a specific region. Missing iso_a3 cells
# are filled with "NA" (presumably because pandas parses a literal "NA" as
# missing -- TODO confirm against country_codes.csv).
expected_geo = set(pd.read_csv(join(src, "country_codes.csv"),
                               usecols=["iso_a3"],
                               na_values=["none"])["iso_a3"].fillna("NA").values) \
    | {"nonspecificgeo"}

# Check the geo codes of BOTH raters. The column name used to be hard-coded
# to 'c1_geo', so the codes of rater c2 were never validated.
for rater in ['c1', 'c2']:
    geo = []
    for g in data[f'{rater}_geo']:
        if g == g:  # skip NaN rows
            geo += g
    geo = set(geo)

    unexpected = geo.difference(expected_geo)
    assert len(unexpected) == 0, f"unexpected {rater} geo codes: {sorted(unexpected)}"

In [51]:
def join_entries(entry_list, category):
    '''
    Turns a list of codes back into a single string in which every code is
    prefixed with "<category>=" and the codes are separated by "; ".
    A NaN input is passed through as NaN.
    '''
    is_missing = entry_list != entry_list  # only NaN is unequal to itself
    if is_missing:
        return np.nan
    return "; ".join(f"{category}={code}" for code in entry_list)

In [52]:
# Serialise the cleaned code lists back into strings. Each category uses its
# own prefix in front of every code ("a=", "m=", "d=", "t=", "geo=").
category_prefixes = {
    "action": "a",
    "method": "m",
    "discipline": "d",
    "group": "t",
    "geo": "geo",
}
for category_name, prefix in category_prefixes.items():
    for rater in ("c1", "c2"):
        column = f"{rater}_{category_name}"
        data[column] = data[column].apply(join_entries, args=(prefix,))

### Export the cleaned data

In [53]:
# Write the cleaned table next to the raw one, without the index column.
clean_path = join(dst, fname + "_clean.csv")
data.to_csv(clean_path, index=False)