### Data wrangling and validation

In [None]:
import itertools
import joblib

import numpy as np
import pandas as pd

from scipy import sparse, stats
from mlutils import *

In [None]:
# Set to True to also write the intermediate CSV files (per-source master files
# and the round-tripped merged dataset) to disk
SAVE_INTERMEDIATE_FILES = False
# Random seed (NOTE(review): not referenced in the visible part of this notebook)
RANDOM_SEED = 56

In [None]:
# Load a previously saved copy of the merged dataset.
# NOTE(review): `df` is rebuilt from the cleaned source frames further down, and
# "dataset.csv" is only written by later cells, so this read fails unless the file
# already exists on disk. Confirm whether this cell is still needed.
df = pd.read_csv(r"dataset.csv")

### Data merging

In [None]:
# Column types enforced while parsing the raw source CSV files
dtypes = dict(
    Abstract=str,
    Title=str,
    year=int,
    documentType=str,
    StoreId=str,
    disc1=str,
    disc2=str,
)

# One raw frame per source file, loaded in a fixed order with the same dtypes
socab_df, eric_df, econlit_df = [
    pd.read_csv(csv_path, dtype=dtypes)
    for csv_path in (
        'Datasets/SocAbstracts.csv',
        'Datasets/ERIC.csv',
        'Datasets/EconLit.csv',
    )
]

### Data cleaning and relabeling

Get clean and relabeled dataframes for each set:

In [None]:
# Run every raw frame through the shared cleaning/relabeling helper
socab_clean, eric_clean, econlit_clean = [
    clean_df(raw_frame) for raw_frame in (socab_df, eric_df, econlit_df)
]

if SAVE_INTERMEDIATE_FILES:
    # Optionally keep one master CSV per source (without the index column)
    for cleaned_frame, target_path in (
        (socab_clean, "SocAbstracts_master.csv"),
        (eric_clean, "ERIC_master.csv"),
        (econlit_clean, "EconLit_master.csv"),
    ):
        cleaned_frame.to_csv(target_path, index=False)

In [None]:
# Inspect which columns the cleaned frame carries (displayed as the cell output)
socab_clean.columns

In [None]:
# Stack the three cleaned sources into one frame and drop the columns that are
# not used from here on.
# NOTE(review): the concatenation keeps each source's own index, so index labels
# may repeat across sources — confirm nothing downstream needs a unique index.
df = pd.concat([socab_clean, eric_clean, econlit_clean])
df = df.drop(columns=['year', 'disc1_x', 'disc1_counts', 'disc2_counts'])

if SAVE_INTERMEDIATE_FILES:
    # Lists do not survive a CSV round trip, so store them as
    # semicolon-separated strings prior to saving
    df['disc2_x'] = df['disc2_x'].apply(';'.join)
    df.to_csv("dataset.csv", index=False)
    # Read the file back and restore the list format
    df = pd.read_csv("dataset.csv")
    df['disc2_x'] = df['disc2_x'].str.split(';')

# No further write here: the next cell (re)writes "dataset.csv" unconditionally,
# so an additional identical to_csv call in this cell would only duplicate that work.

In [None]:
# Write the merged dataset to disk. This runs regardless of SAVE_INTERMEDIATE_FILES,
# and the index is written as well because index=False is not passed.
# NOTE(review): `disc2_x` appears to hold Python lists at this point, which CSV
# stores as their string representation — confirm this is the intended format.
df.to_csv("dataset.csv")

In [None]:
# Combined text column: the abstract followed by the title, separated by one space
df['text'] = df['Abstract'].str.cat(others=df['Title'], sep=' ')

Great, now we have the textual data to train and test the machine learning modules.

### Checking the inter-indexer consistency

In [None]:
# Expert evaluation sheet and the discipline code list; every cell is read as text
socab_eval = pd.read_excel(
    "ExpertEvaluation/soc_ab_indexerconsis.xlsx",
    dtype=str,
)
vods = pd.read_excel(
    "ExpertEvaluation/Vlaamse onderzoeksdisciplinelijst_V2018.xlsx",
    dtype=str,
)

In [None]:
# The string '0' stands for "no value" in the sheet; turn it into a real missing value
socab_eval = socab_eval.replace(to_replace='0', value=np.nan)

In [None]:
# Check that every assigned discipline code occurs in the official discipline
# code list (i.e. there are no typos).
codes = set(vods['Unnamed: 6'])

# Missing labels (the former '0' cells) are dropped before the membership test:
# an empty slot is not a typo, and without the dropna() the result would depend
# on whether the code-list column itself happens to contain a missing value.
print('Are all labels in the original vods codelist?')
print('Expert labels:', all(socab_eval[f'expert_label{i}'].dropna().isin(codes).all() for i in range(1, 6)))
print('Expected labels:', all(socab_eval[f'expected_label{i}'].dropna().isin(codes).all() for i in range(1, 6)))

In [None]:
# Create the level 3 columns: a level 3 code is the label with its last two
# characters removed. Rows without a label stay missing in the new column.
for i in range(1, 6):
    expected, expert = f'expected_label{i}', f'expert_label{i}'

    # Each source column is handled on its own, so a column for which the .str
    # accessor is unavailable (AttributeError, e.g. no string values at all)
    # only empties its own level 3 column and never discards the other one.
    for source, target in (
        (expected, f'expected_lv3label{i}'),
        (expert, f'expert_lv3label{i}'),
    ):
        labels = socab_eval[source]
        try:
            level3 = labels[labels.notna()].str[:-2]
        except AttributeError:
            # Explicit dtype: an empty Series without one relies on a
            # version-dependent default dtype
            level3 = pd.Series(dtype=object)
        socab_eval[target] = level3

In [None]:
# Group the label columns by origin (expected / expert) and level (4 / 3) using
# their name prefixes
eval_columns = socab_eval.columns.tolist()

expected_lv4 = [name for name in eval_columns if name.startswith('expected_label')]
expert_lv4 = [name for name in eval_columns if name.startswith('expert_label')]
expected_lv3 = [name for name in eval_columns if name.startswith('expected_lv3label')]
expert_lv3 = [name for name in eval_columns if name.startswith('expert_lv3label')]

In [None]:
def set_without_nan(row, cols):
    """Return the distinct non-missing values that ``row`` holds at the labels ``cols``."""
    selected = row[cols]
    return set(selected.dropna())

def consistency_score(row, level):
    """Overlap between the expected and the expert label sets of one row.

    The score is ``2 * |expected & expert| / (|expected| + |expert|)``, where
    both sets contain the non-missing labels of the row at the requested level.
    It is 0 when the sets share no label and 1 when they are identical.

    Parameters
    ----------
    row : pandas.Series
        One row of the evaluation frame (as passed by ``DataFrame.apply`` with
        ``axis=1``).
    level : int
        ``4`` compares the ``expected_lv4`` / ``expert_lv4`` columns,
        ``3`` compares the ``expected_lv3`` / ``expert_lv3`` columns.

    Returns
    -------
    float
        The consistency score of the row.

    Raises
    ------
    ValueError
        If ``level`` is neither 3 nor 4.
    ZeroDivisionError
        If the row has neither an expected nor an expert label at that level.
    """
    if level == 4:
        expected, expert = expected_lv4, expert_lv4
    elif level == 3:
        expected, expert = expected_lv3, expert_lv3
    else:
        raise ValueError(f"level must be 3 or 4, got {level!r}")

    # Build each label set once instead of once per use
    expected_labels = set_without_nan(row, expected)
    expert_labels = set_without_nan(row, expert)

    return (
        2 * len(expected_labels & expert_labels)
        / (len(expected_labels) + len(expert_labels))
    )

# Score every row at level 4 first, then at level 3
for score_column, score_level in (('consistency_lvl4', 4), ('consistency_lvl3', 3)):
    socab_eval[score_column] = socab_eval.apply(consistency_score, axis=1, level=score_level)

In [None]:
# Average the per-row scores over all rows of the evaluation frame
n_rows = len(socab_eval)
mean_lvl3 = sum(socab_eval['consistency_lvl3']) / n_rows
mean_lvl4 = sum(socab_eval['consistency_lvl4']) / n_rows

print("Inter-indexer consistency on level 3 = {}".format(mean_lvl3))
print("Inter-indexer consistency on level 4 = {}".format(mean_lvl4))