In [75]:
import json
import os
import pandas as pd
import numpy as np
import folium
import ast 
import csv
from hill_name import clean
import re
from itertools import cycle

In [77]:
def extract_value(string):
    """Extract the numeric value(s) from a raw quantity string.

    En dashes and minus signs are normalised to an ASCII hyphen, everything
    from the first "±" onwards is discarded, and the remaining text is split
    into numeric and non-numeric tokens. Every token that ``float`` accepts
    is returned, in order of appearance (``"1.5-1.6"`` gives ``[1.5, 1.6]``).

    :param string: Raw value text, or ``None``.
    :returns: ``None`` when ``string`` is ``None``; otherwise a list of
        floats, which is empty when nothing parses as a number (including
        empty or whitespace-only input).
    """
    if string is None:
        return None

    # Raw strings keep ``\d`` / ``\.`` from being read as (invalid) string escapes.
    unsigned_number = r'(\d+\.?(?:\d+)?)'
    signed_number = r'(-?\d+\.?(?:\d+)?)'

    string = string.replace("–", "-")
    string = string.replace("−", "-")
    string = string.split("±")[0]

    # Split on spaces (dropped) and hyphens (kept as their own token).
    split_by_space = [r for r in re.split(' |(-)', string) if r]
    tokens = []
    for elem in split_by_space:
        tokens.extend([r for r in re.split(unsigned_number, elem) if r])

    # Empty or whitespace-only input produces no tokens at all.
    if not tokens:
        return []

    # A leading hyphen is a sign: fuse it with the token that follows, if any.
    if tokens[0] == "-" and len(tokens) > 1:
        tokens[0] = "-" + tokens.pop(1)

    # ``pending`` counts consecutive non-numeric tokens that have been held
    # back and not yet emitted.
    pending = 0
    merged = []
    for index, token in enumerate(tokens):
        if pending == 2:
            # Two tokens held back: emit the first unchanged and glue the
            # second onto the current token (so "n = 1.5" ends up as
            # ["n", "=1.5"] and yields no value).
            merged.append(tokens[index - 2])
            merged.append(tokens[index - 1] + token)
            pending = 0
        elif pending == 1 and re.match(signed_number, token):
            # One token held back and now a number: emit both separately.
            merged.append(tokens[index - 1])
            merged.append(token)
            pending = 0
        elif not re.match(signed_number, token):
            pending += 1
        else:
            merged.append(token)

    values = []
    for token in merged:
        try:
            values.append(float(token))
        except ValueError:
            # Non-numeric fragments (units, range hyphens, ...) are skipped.
            pass

    return values

def extract_error(string):
    """Extract the uncertainty that follows a "±" sign in a raw quantity string.

    Spaces are removed and the text is split into numbers, "±" signs and
    leftover fragments. For each "±", the token immediately after it is
    converted with ``float``; when several "±" signs are present the last
    successful conversion wins.

    :param string: Raw value text such as ``"1.52 ± 0.02"``, or ``None``.
    :returns: The error as a float, or ``None`` when ``string`` is ``None``
        or no "±" is directly followed by a number.
    """
    if string is None:
        return None

    string = string.replace("–", "-")
    string = string.replace("−", "-")
    string = string.replace(" ", "")

    # Raw string: ``\d`` and ``\.`` are regex escapes, not string escapes.
    tokens = [t for t in re.split(r'(\d+\.?(?:\d+)?)|(±)', string) if t]

    error = None
    for index, token in enumerate(tokens):
        if token != '±':
            continue
        try:
            error = float(tokens[index + 1])
        except (ValueError, IndexError):
            # "±" is last, or is not followed by a number: keep what we have.
            pass

    return error

def remove_2013(string):
    """Return ``string`` with U+2013, U+E5F8 and U+2212 each replaced by an ASCII hyphen."""
    dash_to_hyphen = str.maketrans({'\u2013': '-', '\ue5f8': '-', '\u2212': '-'})
    return string.translate(dash_to_hyphen)

# Abbreviation table: the file is read line by line and each line is parsed
# as one JSON record.
ABBREVIATIONS_PATH = r'C:\Users\Jiuyang Zhao\Documents\Chemical_Abbreviations.json'

abb = []
# The ``with`` block closes the file handle even if a line fails to parse.
with open(ABBREVIATIONS_PATH, 'r', encoding="utf-8") as abbreviation_file:
    abb.extend(json.loads(line) for line in abbreviation_file)

In [19]:
# Extracted records: every line of the file is parsed as one JSON document.
raw_data = []
with open(r'F:\papers\refractive_index\rsc_refractive_index\demo\test.json', encoding='utf-8') as records_file:
    raw_data.extend(json.loads(record_line) for record_line in records_file)

In [48]:
def _mean_extracted_value(raw_value):
    """Average the numbers ``extract_value`` finds in ``raw_value``; NaN when it finds none."""
    values = extract_value(raw_value)
    if not values:
        # ``None`` or ``[]``: no number could be extracted.
        return np.nan
    return np.mean(values)


def post_process_data(raw_data,property_name='RefractiveIndex'):
    """Flatten one extracted record into a dict of output columns.

    Two record layouts are recognised:

    * Nested: ``raw_data[property_name]`` holds ``compound`` ->
      ``Compound`` -> ``names``, ``specifier``, ``raw_value`` and optionally
      ``measured_wavelength``; ``raw_data['metadata']`` is either a dict or
      the string form of a dict with optional ``doi``, ``journal``, ``date``
      and ``title`` entries.
    * Flat: the record has a ``confidence`` key and carries ``compound`` ->
      ``names``, ``specifier``, ``raw_value`` and optionally
      ``measured_wavelength`` and ``DOI`` at the top level.

    :param raw_data: One record (a dict) in either layout.
    :param property_name: Key of the nested property entry.
    :returns: Dict with ``compound``, ``specifier``, ``extracted_value``
        (mean of the numbers in ``raw_value``, NaN when there are none),
        ``raw_value``, ``measurement_wavelength`` and ``DOI``; nested records
        also get ``Journal``, ``Date`` and ``Title``. An empty dict is
        returned when the record matches neither layout.
    :raises KeyError: When a required key of the matched layout is missing.
    """
    new_dic = {}

    if property_name in raw_data.keys():
        record = raw_data[property_name]
        metadata = raw_data['metadata']

        new_dic['compound'] = record['compound']['Compound']['names']
        new_dic['specifier'] = record['specifier']
        new_dic['extracted_value'] = _mean_extracted_value(record['raw_value'])
        new_dic['raw_value'] = record['raw_value']
        new_dic['measurement_wavelength'] = record.get('measured_wavelength',None)

        # Metadata stored as text is parsed once and then read like a dict.
        if not isinstance(metadata, dict):
            metadata = ast.literal_eval(metadata)
        new_dic['DOI'] = metadata.get('doi',None)
        new_dic['Journal'] = metadata.get('journal',None)
        new_dic['Date'] = metadata.get('date',None)
        new_dic['Title'] = metadata.get('title',None)

    elif 'confidence' in raw_data.keys():
        # NOTE(review): ``names`` is wrapped in a one-element list here, unlike
        # the nested layout above - confirm what type ``names`` has in this layout.
        new_dic['compound'] = [raw_data['compound']['names']]
        new_dic['specifier'] = raw_data['specifier']
        new_dic['extracted_value'] = _mean_extracted_value(raw_data['raw_value'])
        new_dic['raw_value'] = raw_data['raw_value']
        new_dic['measurement_wavelength'] = raw_data.get('measured_wavelength',None)
        new_dic['DOI'] = raw_data.get('DOI',None)

    return new_dic


In [68]:
# Drop compound names that are a single character or contain an invalid symbol,
# then keep only the records that still have at least one compound name.
dictionary_symbol = ['>','<','`','!','#','%','^','|','*','=',' -','- ',' –','– ']


def _is_valid_compound_name(name):
    """Return False for a one-character name or one containing any entry of ``dictionary_symbol``."""
    if not isinstance(name, str):
        # Only strings can be screened for symbols; anything else is kept as is.
        # NOTE(review): the 'confidence' layout may deliver a nested list here - confirm.
        return True
    if len(name) == 1:
        return False
    return not any(symbol in name for symbol in dictionary_symbol)


processed_data = []
for data in raw_data:
    try:
        new_data = post_process_data(data)
        if 'compound' not in new_data:
            # The record matched no known layout; nothing to keep.
            continue
        # Build a new list rather than removing items from the one being iterated.
        new_data['compound'] = [name for name in new_data['compound'] if _is_valid_compound_name(name)]
        if len(new_data['compound']) >= 1:
            processed_data.append(new_data)
    except Exception as e:
        # Best effort: report the malformed record and carry on with the rest.
        print(e)

In [71]:
# Build the table, then keep only the first of any rows whose string forms are identical.
df = pd.DataFrame(processed_data)
df = df[~df.astype(str).duplicated()]

In [72]:
# Keep only records whose refractive index lies in the closed interval [1, 10];
# anything smaller, larger or NaN is treated as an extreme value and removed.

df = df[df['extracted_value'].between(1, 10)]

In [73]:
# Remove records that have abnormal specifiers.

# Substrings that mark a specifier as acceptable.
specifier_symbols = [ ' R i ',' n F ','R I','nF','ri','nlit','nE','refractive','Refractive','refraction','Refraction',
                     'R.I.','RI','R.I','r.i.','r.i','Lit. R.I.','nm','μm','μm','nD','nav','n D','Real part (n)',
                     'Ref. index','n e','no','ne']

# Substrings that reject a specifier outright; checked before ``specifier_symbols``.
specifier_remove = ['n=','n =','n = ',]

def remove_specifier(string):
    """Return ``string`` when it is an accepted specifier, otherwise ``None``.

    A specifier is rejected when it is not a string (e.g. ``None`` or NaN) or
    contains any entry of ``specifier_remove``. It is accepted when it equals
    ``'n'`` or ``' n '`` or contains any entry of ``specifier_symbols``.

    :param string: Specifier text taken from a record.
    :returns: The unchanged specifier, or ``None`` when it is rejected.
    """
    if not isinstance(string, str):
        # Missing specifiers cannot be substring-tested; treat them as rejected.
        return None
    if any(i in string for i in specifier_remove):
        return None
    if string in ('n', ' n ') or any(i in string for i in specifier_symbols):
        return string
    return None
    
# ``assign`` returns a new frame, so the column is not written into a filtered
# slice of an earlier frame (which pandas flags with SettingWithCopyWarning).
df = df.assign(specifier=df['specifier'].apply(remove_specifier))
df = df[df['specifier'].notnull()]

In [83]:
# Remove records whose refractive index was extracted from an article whose title
# mentions a binary/ternary system or mixture. Titles are upper-cased and stripped
# of whitespace before matching, so e.g. "Binary mixtures of ..." hits 'BINARYMIXTURE'.
mixture_title_keywords = ['BINARYSYSTEM','BINARYMIXTURE','TERNARYSYSTEM','TERNARYMIXTURE']


def _is_mixture_title(title):
    """Return True when the normalised ``title`` contains any entry of ``mixture_title_keywords``."""
    normalised = re.sub(r'\s+', '', title).upper()
    return any(keyword in normalised for keyword in mixture_title_keywords)


df = df[df['Title'].astype(str).apply(lambda x : not _is_mixture_title(x))]

In [85]:
'''Convert the extracted chemical names into standard formats. For organic chemicals,
this uses the NLP tool "Open Parser for Systematic IUPAC Nomenclature" (OPSIN) to convert compound names 
to their simplified molecular-input line-entry system (SMILES) notation. For inorganic chemicals, 
the subroutine uses the National Cancer Institute’s Chemical Identifier Resolver (CIR) through its Python wrapper, CIRpy, 
to convert the inorganic compound names into Hill notation.'''

def convert_name(names):
    """Convert the first entry of ``names`` into a normalised name.

    The first name is looked up in the ``abb`` table; when an entry's
    ``'Abbreviation'`` equals it, that entry's ``'formula'`` is returned.
    Otherwise the name is passed through ``remove_2013`` and then ``clean``,
    and the first element of ``clean``'s result is returned.

    :param names: Sequence of compound names; only ``names[0]`` is used.
    :returns: The normalised name, or ``[]`` when ``names`` is empty or the
        conversion raises an ordinary exception.
    """
    try:
        first_name = names[0]
        for j in abb:
            if first_name == j['Abbreviation']:
                return j['formula']
        return clean([remove_2013(first_name)])[0]
    except Exception:
        # Best effort: a failed conversion is reported as "no name". Catching
        # ``Exception`` rather than everything lets Ctrl-C still interrupt.
        return []

In [86]:
# ``assign`` returns a new frame instead of writing a column into a filtered
# slice (which pandas flags with SettingWithCopyWarning).
df = df.assign(normalised_name=df['compound'].apply(convert_name))

In [90]:
# Keep only the rows whose normalised name is non-empty.
df = df[df['normalised_name'].apply(lambda converted: len(converted) > 0)]