# Data cleaning

In [127]:
import pandas as pd
from os.path import join
import numpy as np

## Read the data

In [153]:
src = "../data"
dst = "../data"
fname = "coded_V1.csv"
data = pd.read_csv(join(src, fname))

## Clean the data

In [155]:
# rename columns & drop empty columns
data = data\
    .rename(columns={'V=Action':'Action',
                     'V=Method':'Method',
                     'V=Discipline':'Discipline',
                     'V=Group':'Group',
                     'V=Geo-Scope':'Geo'})\
    .drop(columns=['Unnamed: 25'])

In [130]:
# list of wrong codings and their respective corrections
miscodings = {
        'policies':'openpolicies', # actions
        'openacces':'openaccess', # actions
        'interviews':'interview', # methods
        'other(casestudies)':'other', # methods
        'desk-review':'other', # methods
        'deskresearch':'other', # methods
        'bibliometric':'biblio', # methods
        'documentanalysis':'other', # methods
        'socsci':'socscie', # disciplines
        'socscie(LIS)':'socsie', # disciplines
        'socscie(psychology)':'socscie', # disciplines
        'socscie?(informationsciences)':'socscie', # disciplines
        'soscie':'socscie', # disciplines
        'various':'all', # disciplines
        'clinicalmedicine':'med', # disciplines
        'librarians':'librarian', # group
        'publishers':'publisher', # group
        'researchers':'researcher', # group
        'reseearcher':'researcher', # group
        'reseaercher':'researcher', # group
        '262':np.nan, # group
        'AF':'Africa', # geo, spell out world regions
        'AS':'Asia', # geo, spell out world regions
        'EU':'Europe', # geo, spell out world regions
        'LA':'Latin America', # geo, spell out world regions
        'SA':'Latin America', # geo, spell out world regions
        'NA':'North America', # geo, spell out world regions
        'Canada':'CND', # geo
        'Italy':'ITA', # geo
        'Iran':'IRN', # geo
        'UG':'UGA', # geo
        'BG':'BGR', # geo
        'CN':'CHN', # geo
        'ES':'ESP', # geo
        'IN':'IND', # geo
        'IR':'IRN', # geo
        'GH':'GHA', # geo
        'KR':'KOR', # geo
        'NG':'NGA', # geo
        'PA':'PRY', # geo
        'SI':'SVN', # geo
        'TW':'TWN', # geo
        'TZ':'TZA', # geo
        'nan':np.nan, # general missing entry code
        'none':np.nan, # general missing entry code
        'missing':np.nan # general missing entry code
}

In [133]:
def split_list(list_string):
    '''
    Takes a string containing a list of encodings separated by semicolons,
    cleans the list, splits it into different entries and returns a list
    of entries. Also corrects misspellings along the way.
    '''
    if list_string != list_string: # NaN check
        return np.nan
    
    list_string = list_string.strip(';') # remove trailing ";"
    raw_entries = list_string.split(';') # split list along ";"
    
    entries = []
    for e in raw_entries:
        e = e.replace(' ', '') # remove white spaces
        # exception for "policy" entries in the "action" column
        if e == 'a=policies':
            e = 'a=openpolicies'
        # exception for "openacces" spelling mistake
        if e == "a=openacces":
            e = "a=openaccess"
        # exception for "bibliometric", should be "biblio"
        if e == "m=bibliometric":
            e = "m=biblio"
        # exception for socsie (LIS)
        if e == "d=socscie(LIS)":
            e = "d=socscie"
        # exception for "reseaercher" spelling mistake
        if e == "t=reseaercher":
            e = "t=researcher"
        if '=' in e: # remove leading column code letter
            e = e.split('=')[-1]
        if e in miscodings.keys(): # clean up wrong encodings
            e = miscodings[e]
        if e != '' and e == e:
            entries.append(e)
    return entries

In [156]:
# clean up the five coded columns
cols = ['Action', 'Method', 'Discipline', 'Group', 'Geo']
for col in cols:
    data[col] = data[col].apply(split_list)

## Sanity checks

Look at the remaining categories in each coded column

In [158]:
expected_actions = {
 'openaccess',
 'opendata',
 'openeducation',
 'openevaluation',
 'openmethod',
 'openparticipation',
 'openpolicies',
 'openscience',
 'opensoftware',
 'opentools'
}
actions = []
for a in data['Action']:
    if a == a:
        actions += a
actions = set(actions)

assert actions == expected_actions

In [159]:
expected_methods = {
    'biblio',
    'documentreview',
    'interview',
    'other',
    'survey'
}
methods = []
for m in data['Method']:
    if m == m:
        methods += m
methods = set(methods)

assert methods == expected_methods

In [160]:
expected_disciplines = {
    'agric',
    'all',
    'engtech',
    'hum',
    'med',
    'natscie',
    'socscie'
}

disciplines = []
for d in data['Discipline']:
    if d == d:
        disciplines += d
disciplines = set(disciplines)

assert disciplines == expected_disciplines

In [161]:
expected_groups = {
 'all',
 'business',
 'funder',
 'librarian',
 'other',
 'policy',
 'practitioner',
 'publisher',
 'researcher',
 'unisupportstaff',
 'university'
}
groups = []
for g in data['Group']:
    if g == g:
        groups += g
groups = set(groups)

assert groups == expected_groups

In [162]:
geo = []
for g in data['Geo']:
    if g == g:
        geo += g
geo = set(geo)

In [144]:
geo

{'?',
 'ARE',
 'ARG',
 'ARM',
 'AT',
 'AUS',
 'AUT',
 'Africa',
 'Asia',
 'BEL',
 'BGD',
 'BGR',
 'BOL',
 'BRA',
 'BWA',
 'CAN',
 'CH',
 'CHE',
 'CHI',
 'CHL',
 'CHN',
 'CND',
 'COL',
 'CRI',
 'CUB',
 'CZE',
 'DE',
 'DEN',
 'DEU',
 'DNK',
 'DZA',
 'EGY',
 'ESP',
 'EST',
 'ETH',
 'Europe',
 'FIN',
 'FRA',
 'GBR',
 'GHA',
 'GNR',
 'GRC',
 'HKG',
 'HNK',
 'HRV',
 'HUN',
 'IDN',
 'IND',
 'IRL',
 'IRN',
 'IRQ',
 'ISR',
 'ITA',
 'JAM',
 'JPN',
 'KEN',
 'KOR',
 'LBY',
 'LKA',
 'LTU',
 'Latin America',
 'MAR',
 'MEX',
 'MYS',
 'NDL',
 'NGA',
 'NLD',
 'NOR',
 'NZL',
 'North America',
 'PAK',
 'PER',
 'PHL',
 'PK',
 'PNG',
 'POL',
 'PRK',
 'PRT',
 'ROU',
 'RUS',
 'SAU',
 'SDN',
 'SEN',
 'SGP',
 'SRB',
 'SUI',
 'SVK',
 'SVN',
 'SWE',
 'THA',
 'TUN',
 'TUR',
 'TWN',
 'TZA',
 'UGA',
 'URY',
 'USA',
 'VEN',
 'VNM',
 'ZAF',
 'ZAR',
 'ZWE',
 'all'}

## Dummy code the coded columns

In [150]:
data

Unnamed: 0,ID,Database source,Coder,Article Title,Author Keywords,Abstract,action,method,discipline,group,...,Volume,Issue,Start Page,End Page,DOI,DuplicateDOI,Book DOI,UT (Unique WOS ID),Pubmed Id,Date of Export
0,5,WOS,stefan_skupien,Spanish Scholarly Journals in WoS and Scopus: ...,scholarly journals; Spain; open access; self-a...,The aim of this study was to determine the imp...,[openaccess],"[biblio, documentreview]",[all],[publisher],...,47,1,77.0,96,10.3138/jsp.47.1.04,,,WOS:000086362200007,,2021-01-27
1,6,WOS,stefan_skupien,Open access policies in Spanish universities,Open access; universities; Spain; OpenCourseWa...,The aim of this paper is to analyze the curren...,"[openaccess, openpolicies]","[survey, documentreview]",[],[university],...,36,2,,,10.3989/redc.2013.2.933,,,WOS:000087254500006,,2021-01-27
2,7,WOS,stefan_skupien,Open access monographs published by university...,University publishers; University presses; Ope...,The open access model for publishing monograph...,[openaccess],"[survey, interview]",[],"[publisher, university]",...,27,2,300.0,311,10.3145/epi.2018.mar.08,,,WOS:000088414100022,,2021-01-27
3,8,WOS,stefan_skupien,The Impact of Funding Insulin Research on Open...,Insulin; fund; open access; bibliometry; World...,Background: The recent report of The World Hea...,[openaccess],[biblio],[med],[researcher],...,16,5,497.0,502,10.2174/1573399815666190712201635,,,WOS:000088639900012,31538901.0,2021-01-27
4,9,WOS,stefan_skupien,Citation indicators' comparison of LIS open ac...,Library and information science; Open access; ...,Purpose There are different opinions about the...,[openaccess],[biblio],[socscie],"[publisher, researcher]",...,68,4-5,288.0,299,10.1108/GKMC-02-2018-0016,,,WOS:000089667600003,,2021-01-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,2948,ZOTERO,tamara_heck,Survey on open peer review: Attitudes and expe...,,Open peer review (OPR) is a cornerstone of the...,[openevaluation],[survey],[all],[researcher],...,12,12,,,https://doi.org/10.1371/journal.pone.0189311,,,,,2021-04-21
696,2949,ZOTERO,tamara_heck,Early-Career Researchers’ Perceptions of the P...,,Young researchers of today will shape the fiel...,[openscience],[survey],[socscie],[researcher],...,48,6,,,https://doi.org/10.1027/1864-9335/a000324,,,,,2021-04-21
697,2950,ZOTERO,tamara_heck,A Case Study of Scholars’ Open and Sharing Pra...,,Although the open scholarship movement has suc...,"[openaccess, opendata, openeducation]",[documentreview],[all],[researcher],...,7,3,,,http://dx.doi.org/10.5944/openpraxis.7.3.206,,,,,2021-04-21
698,2951,ZOTERO,tamara_heck,Open Science now: A systematic literature revi...,,Open Science is a disruptive phenomenon that i...,[openscience],[documentreview],[],[researcher],...,88,,,,https://doi.org/10.1016/j.jbusres.2017.12.043,,,,,2021-04-21


In [151]:
for entries, colname in zip([actions, methods, disciplines, groups, geo], cols):
    for entry in entries:
        if entry == entry:
            data['{}_{}'.format(colname, entry)] = data[colname].apply(lambda x: entry in x if x == x else False)
            data['{}_{}'.format(colname, entry)] = data['{}_{}'.format(colname, entry)].replace({True:1, False:0})

  data['{}_{}'.format(colname, entry)] = data[colname].apply(lambda x: entry in x if x == x else False)


## Export the cleaned data

In [112]:
data.to_csv(join(dst, 'V5_8_6_coded_V1_clean.csv'), index=False)