In [3]:
import os
import json
from tqdm import tqdm

# Build the name-normalisation table: every inner key found in
# final_pairs.json is mapped to the outer key it is listed under.
with open('final_pairs.json', 'r', encoding='utf-8') as pairs_file:
    pairs = json.load(pairs_file)
full_norm = {
    variant: canonical
    for canonical, variants in pairs.items()
    for variant in variants.keys()
}

In [4]:
# Symbols and lowercase names are kept as two position-aligned lists of 120
# entries each: ELEMENTS[i] is the symbol for ELEMENT_NAMES[i].  The last two
# entries pair 'Uue' with 'ununennium' and the compound 'SiO2' with 'silica'.
ELEMENTS = (
    'H He Li Be B C N O F Ne Na Mg Al Si P S Cl Ar K '
    'Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Br Kr '
    'Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I '
    'Xe Cs Ba La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb '
    'Lu Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn Fr '
    'Ra Ac Th Pa U Np Pu Am Cm Bk Cf Es Fm Md No Lr Rf '
    'Db Sg Bh Hs Mt Ds Rg Cn Nh Fl Mc Lv Ts Og Uue SiO2'
).split()

ELEMENT_NAMES = (
    'hydrogen helium lithium beryllium boron carbon nitrogen oxygen fluorine '
    'neon sodium magnesium aluminium silicon phosphorus sulfur chlorine argon '
    'potassium calcium scandium titanium vanadium chromium manganese iron '
    'cobalt nickel copper zinc gallium germanium arsenic selenium bromine '
    'krypton rubidium strontium yttrium zirconium niobium molybdenum technetium '
    'ruthenium rhodium palladium silver cadmium indium tin antimony tellurium '
    'iodine xenon cesium barium lanthanum cerium praseodymium neodymium '
    'promethium samarium europium gadolinium terbium dysprosium holmium erbium '
    'thulium ytterbium lutetium hafnium tantalum tungsten rhenium osmium '
    'iridium platinum gold mercury thallium lead bismuth polonium astatine '
    'radon francium radium actinium thorium protactinium uranium neptunium '
    'plutonium americium curium berkelium californium einsteinium fermium '
    'mendelevium nobelium lawrencium rutherfordium dubnium seaborgium bohrium '
    'hassium meitnerium darmstadtium roentgenium copernicium nihonium flerovium '
    'moscovium livermorium tennessine oganesson ununennium silica'
).split()
# Register every element name as an alias of its symbol, both as written
# (lowercase) and with the first letter upper-cased.
for symbol, element_name in zip(ELEMENTS, ELEMENT_NAMES):
    full_norm[element_name] = symbol
    full_norm[element_name[0].upper() + element_name[1:]] = symbol

In [5]:
def return_m(full_norm, m):
    """Return the normalised form of ``m``.

    ``full_norm`` maps known name variants to their normalised form; a name
    that is not a key of the mapping is returned unchanged.
    """
    return full_norm.get(m, m)

In [6]:
def reserve_material(dir_name):
    """Load the whitelist of material names for one corpus.

    Reads ``manual check/<dir_name>_checked.txt`` (tab-separated) and takes
    the second column of each line as a material name.  Every name is
    returned together with a copy whose first letter has the opposite case,
    so that both ``'silica'`` and ``'Silica'`` are accepted by callers.

    Lines without a second column (for example a trailing blank line) and
    lines whose name column is empty are skipped instead of raising
    ``IndexError``.  The line terminator is stripped before splitting so a
    name stored in the last column does not carry a ``'\\n'``.

    Parameters
    ----------
    dir_name : str
        Corpus identifier used to build the file name.

    Returns
    -------
    list of str
        Names in file order, each immediately followed by its case-swapped
        variant.  Duplicates are not removed.
    """
    check_dir = 'manual check/'+dir_name+'_checked.txt'
    new = []
    with open(check_dir, 'r', encoding='utf-8') as f:
        for line in f:
            columns = line.rstrip('\r\n').split('\t')
            if len(columns) < 2 or not columns[1]:
                # Malformed or blank line: nothing to whitelist.
                continue
            name = columns[1]
            new.append(name)
            if name[0].isupper():
                new.append(name[0].lower()+name[1:])
            else:
                new.append(name[0].upper()+name[1:])
    return new

In [8]:
# material_rank (frequency & doi)
# For each corpus: count how often every whitelisted material is mentioned
# and collect the DOIs it is mentioned in, then write the materials sorted by
# descending frequency to doi_show/material_rank/<corpus>.json.
# NOTE: 'SC_Elesever_2044' is used verbatim to build file paths; keep the spelling.
dir_names = ['Hydrophilic_Elsevier_8921', 'Hydrophobic_Elsevier_13677', 'Oleophobic_Elsevier_581', 'Omniphobic_Elsevier_143',
             'Photocatalytic_Elsevier_13437', 'SC_Elesever_2044']

for dir_name in dir_names:
    with open('material_names/'+dir_name+'/sen_dict.json', 'r', encoding='utf-8') as f:
        sen_dict = json.load(f)
    reserve = reserve_material(dir_name)
    print(reserve[:10])
    # The whitelist is consulted once per material mention, so use a set
    # instead of scanning the list every time.
    reserve_lookup = set(reserve)
    mat_rank = {}
    # material -> DOIs already listed for it; keeps de-duplication O(1) while
    # the JSON-facing 'doi' list preserves first-seen order.
    seen_dois = {}
    for entry in tqdm(sen_dict.values()):
        doi = entry['doi']
        for raw_name in entry['materials'].keys():
            # return_m leaves unknown names unchanged, so no pre-check is needed.
            m = return_m(full_norm, raw_name)
            if m not in reserve_lookup:
                continue
            record = mat_rank.get(m)
            if record is None:
                mat_rank[m] = {'freq': 1, 'doi': [doi]}
                seen_dois[m] = {doi}
            else:
                record['freq'] += 1
                if doi not in seen_dois[m]:
                    seen_dois[m].add(doi)
                    record['doi'].append(doi)
    # sorted() is stable, so materials with equal frequency keep first-seen order.
    ranked_ = sorted(mat_rank.items(), key=lambda item: item[1]['freq'], reverse=True)
    json_str = json.dumps(ranked_, indent=4)
    with open('doi_show/material_rank/'+dir_name+'.json', 'w', encoding='utf-8') as json_file:
        json_file.write(json_str)

  1%|▍                                                                        | 4312/676568 [00:00<00:32, 20990.26it/s]

['TiO2', 'tiO2', 'silica', 'Silica', 'PEG', 'pEG', 'SiO2', 'siO2', 'PVDF', 'pVDF']


100%|███████████████████████████████████████████████████████████████████████| 676568/676568 [00:37<00:00, 18001.16it/s]
  0%|▎                                                                       | 5013/1023296 [00:00<00:40, 25132.03it/s]

['PDMS', 'pDMS', 'silica', 'Silica', 'TiO2', 'tiO2', 'SiO2', 'siO2', 'ZnO', 'znO']


100%|█████████████████████████████████████████████████████████████████████| 1023296/1023296 [00:58<00:00, 17475.74it/s]
  8%|█████▋                                                                    | 3272/42823 [00:00<00:01, 32314.47it/s]

['TiO2', 'tiO2', 'SiO2', 'siO2', 'PVDF', 'pVDF', 'PDMS', 'pDMS', 'silica', 'Silica']


100%|█████████████████████████████████████████████████████████████████████████| 42823/42823 [00:01<00:00, 34630.71it/s]
 53%|██████████████████████████████████████▉                                   | 6206/11784 [00:00<00:00, 61822.60it/s]

['PVDF', 'pVDF', 'PDMS', 'pDMS', 'silica', 'Silica', 'SiO2', 'siO2', 'PTFE', 'pTFE']


100%|█████████████████████████████████████████████████████████████████████████| 11784/11784 [00:00<00:00, 58118.87it/s]
  0%|▏                                                                       | 2765/1350480 [00:00<00:49, 27472.91it/s]

['TiO2', 'tiO2', 'ZnO', 'znO', 'WO3', 'wO3', 'titania', 'Titania', 'CdS', 'cdS']


100%|██████████████████████████████████████████████████████████████████████| 1350480/1350480 [02:52<00:00, 7814.50it/s]
  2%|█▌                                                                       | 3543/163769 [00:00<00:09, 17441.65it/s]

['TiO2', 'tiO2', 'SiO2', 'siO2', 'PDMS', 'pDMS', 'ZnO', 'znO', 'silica', 'Silica']


100%|███████████████████████████████████████████████████████████████████████| 163769/163769 [00:09<00:00, 16628.59it/s]


In [9]:
# material_dictionary (abb as key, full names-freq-doi)
# For each corpus: group every surface form of a whitelisted material under
# its normalised name and record, per surface form, the mention count and the
# DOIs it occurs in.  Written to doi_show/material_dictionary/<corpus>.json.
dir_names = ['Hydrophilic_Elsevier_8921', 'Hydrophobic_Elsevier_13677', 'Oleophobic_Elsevier_581', 'Omniphobic_Elsevier_143',
             'Photocatalytic_Elsevier_13437', 'SC_Elesever_2044']

# `m in full_norm.values()` scans every value on each call; build the set of
# normalised names once instead.
known_normalised = set(full_norm.values())

for dir_name in dir_names:
    with open('material_names/'+dir_name+'/sen_dict.json', 'r', encoding='utf-8') as f:
        sen_dict = json.load(f)
    reserve = reserve_material(dir_name)
    reserve_lookup = set(reserve)
    mat_dict = {}
    # surface form -> DOIs already listed for it (a surface form always maps
    # to the same normalised name, so the surface form alone is a unique key).
    seen_dois = {}
    for entry in tqdm(sen_dict.values()):
        doi = entry['doi']
        for m in entry['materials'].keys():
            # Only names known to the normalisation table, either as a
            # variant (key) or as a normalised name (value), are considered.
            if m not in full_norm and m not in known_normalised:
                continue
            abb_m = return_m(full_norm, m)
            if abb_m not in reserve_lookup:
                continue
            variants = mat_dict.setdefault(abb_m, {})
            stats = variants.get(m)
            if stats is None:
                variants[m] = {'freq': 1, 'doi': [doi]}
                seen_dois[m] = {doi}
            else:
                stats['freq'] += 1
                if doi not in seen_dois[m]:
                    seen_dois[m].add(doi)
                    stats['doi'].append(doi)
    json_str = json.dumps(mat_dict, indent=4)
    with open('doi_show/material_dictionary/'+dir_name+'.json', 'w', encoding='utf-8') as json_file:
        json_file.write(json_str)

100%|███████████████████████████████████████████████████████████████████████| 676568/676568 [00:33<00:00, 19901.05it/s]
100%|█████████████████████████████████████████████████████████████████████| 1023296/1023296 [00:51<00:00, 19940.63it/s]
100%|█████████████████████████████████████████████████████████████████████████| 42823/42823 [00:01<00:00, 25433.34it/s]
100%|█████████████████████████████████████████████████████████████████████████| 11784/11784 [00:00<00:00, 31575.91it/s]
100%|█████████████████████████████████████████████████████████████████████| 1350480/1350480 [01:19<00:00, 17078.65it/s]
100%|███████████████████████████████████████████████████████████████████████| 163769/163769 [00:10<00:00, 15088.23it/s]


In [10]:
# durability, based on rank, mat as key, positive as sub key, freq+doi
# For each corpus: aggregate the durability records per whitelisted material
# and per label, then emit the materials in the order of the previously
# written doi_show/material_rank/<corpus>.json (that file must already exist).
dir_names = ['Hydrophilic_Elsevier_8921', 'Hydrophobic_Elsevier_13677', 'Oleophobic_Elsevier_581', 'Omniphobic_Elsevier_143',
             'Photocatalytic_Elsevier_13437', 'SC_Elesever_2044']

for dir_name in dir_names:
    du_dict = {}
    with open('durability/'+dir_name+'_op.json', 'r', encoding='utf-8') as f:
        du = json.load(f)
    reserve = reserve_material(dir_name)
    reserve_lookup = set(reserve)
    for m in tqdm(du.keys()):
        # Normalisation and the whitelist test depend only on the material,
        # so do them once per material instead of once per record.
        sm = return_m(full_norm, m)
        if sm not in reserve_lookup:
            continue
        for v, records in du[m].items():  # driver, barrier
            if v == 'rate':
                continue
            for s in records:
                stats = du_dict.setdefault(sm, {}).setdefault(v, {'freq': 0, 'doi': []})
                stats['freq'] += 1
                # One DOI entry per record; repeated DOIs are kept on purpose
                # to match the existing output format.
                stats['doi'].append(s[1])
    with open('doi_show/material_rank/'+dir_name+'.json', 'r', encoding='utf-8') as f:
        rank = json.load(f)
    # Each rank entry is a [material, info] pair; keep the ranking order.
    new_dict = {}
    for r in rank:
        mat = r[0]
        if mat in du_dict:
            new_dict[mat] = du_dict[mat]
    json_str = json.dumps(new_dict, indent=4)
    with open('doi_show/durability/'+dir_name+'.json', 'w', encoding='utf-8') as json_file:
        json_file.write(json_str)


100%|███████████████████████████████████████████████████████████████████████████| 1426/1426 [00:00<00:00, 13535.98it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2065/2065 [00:00<00:00, 14652.22it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 230/230 [00:00<00:00, 45977.02it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 56516.13it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1642/1642 [00:00<00:00, 13625.06it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 494/494 [00:00<00:00, 13138.12it/s]


In [18]:
# doi-centered, including nano state
# For each corpus: build one record per DOI holding the whitelisted materials
# mentioned in that paper (with mention counts), then let the update_*
# helpers fill in the remaining fields.  Result goes to
# doi_show/doi_center/<corpus>.json.
# NOTE: update_type, update_ca, update_sa, update_idx, update_tran and
# update_du must already be defined in the kernel when this cell runs.
dir_names = ['Hydrophilic_Elsevier_8921', 'Hydrophobic_Elsevier_13677', 'Oleophobic_Elsevier_581', 'Omniphobic_Elsevier_143',
             'Photocatalytic_Elsevier_13437', 'SC_Elesever_2044']

for dir_name in dir_names:
    reserve = reserve_material(dir_name)
    # Set copy of the whitelist: it is consulted once per material mention.
    reserve_lookup = set(reserve)
    doi_dict = {}
    with open('material_names/'+dir_name+'/sen_dict.json', 'r', encoding='utf-8') as f:
        sen_dict = json.load(f)
    for entry in tqdm(sen_dict.values()):
        doi = entry['doi']
        if doi not in doi_dict:
            # Key order is kept as-is because it determines the JSON layout.
            doi_dict[doi] = {'material': {}, 'ca': [], 'sa': [], 'idx': [], 'tr': [], 'du': {}}
        paper_materials = doi_dict[doi]['material']
        for raw_name in entry['materials'].keys():
            m = return_m(full_norm, raw_name)
            if m not in reserve_lookup:
                continue
            if m in paper_materials:
                paper_materials[m]['freq'] += 1
            else:
                paper_materials[m] = {'freq': 1}
    print('start type')
    doi_dict = update_type(doi_dict, sen_dict)
    print('start ca')
    doi_dict = update_ca(doi_dict, dir_name)
    print('start sa')
    doi_dict = update_sa(doi_dict, dir_name)
    print('start idx')
    doi_dict = update_idx(doi_dict, dir_name)
    print('start tran')
    doi_dict = update_tran(doi_dict, dir_name)
    print('start du')
    doi_dict = update_du(doi_dict, dir_name)
    json_str = json.dumps(doi_dict, indent=4)
    with open('doi_show/doi_center/'+dir_name+'.json', 'w', encoding='utf-8') as json_file:
        json_file.write(json_str)

100%|███████████████████████████████████████████████████████████████████████| 676568/676568 [00:31<00:00, 21231.75it/s]
  8%|█████▉                                                                 | 56481/676568 [00:00<00:02, 279821.39it/s]

start type


100%|██████████████████████████████████████████████████████████████████████| 676568/676568 [00:02<00:00, 276504.95it/s]


start ca
start sa
start idx
start tran
start du


100%|█████████████████████████████████████████████████████████████████████| 1023296/1023296 [00:43<00:00, 23276.06it/s]
  3%|█▊                                                                    | 26513/1023296 [00:00<00:03, 260336.65it/s]

start type


100%|████████████████████████████████████████████████████████████████████| 1023296/1023296 [00:03<00:00, 275124.57it/s]


start ca
start sa
start idx
start tran
start du


100%|█████████████████████████████████████████████████████████████████████████| 42823/42823 [00:01<00:00, 36752.28it/s]
100%|████████████████████████████████████████████████████████████████████████| 42823/42823 [00:00<00:00, 264159.60it/s]


start type
start ca
start sa
start idx
start tran
start du


100%|█████████████████████████████████████████████████████████████████████████| 11784/11784 [00:00<00:00, 60168.75it/s]
100%|████████████████████████████████████████████████████████████████████████| 11784/11784 [00:00<00:00, 279803.89it/s]


start type
start ca
start sa
start idx
start tran
start du


100%|█████████████████████████████████████████████████████████████████████| 1350480/1350480 [00:50<00:00, 26946.73it/s]
  4%|██▊                                                                   | 54204/1350480 [00:00<00:04, 269934.46it/s]

start type


100%|████████████████████████████████████████████████████████████████████| 1350480/1350480 [00:05<00:00, 267262.00it/s]


start ca
start sa
start idx
start tran
start du


100%|███████████████████████████████████████████████████████████████████████| 163769/163769 [00:09<00:00, 16552.66it/s]
 33%|███████████████████████▍                                               | 54160/163769 [00:00<00:00, 269601.29it/s]

start type


100%|██████████████████████████████████████████████████████████████████████| 163769/163769 [00:00<00:00, 269366.71it/s]


start ca
start sa
start idx
start tran
start du


In [25]:
# measure rank value as key, doi list
# For each corpus: invert the index data so that every numeric value maps to
# the distinct (doi, material) pairs reporting it, sorted by descending value.
# Written to doi_show/idx/<corpus>.json (tuples are serialised as JSON arrays).
dir_names = ['Hydrophilic_Elsevier_8921', 'Hydrophobic_Elsevier_13677', 'Oleophobic_Elsevier_581', 'Omniphobic_Elsevier_143',
             'Photocatalytic_Elsevier_13437', 'SC_Elesever_2044']
for dir_name in dir_names:
    value_doi = {}
    with open('index_update/'+dir_name+'_idx.json', 'r', encoding='utf-8') as f:
        idx = json.load(f)
    reserve = reserve_material(dir_name)
    reserve_lookup = set(reserve)
    # value -> pairs already recorded, so de-duplication does not rescan the list.
    seen_pairs = {}
    for m in idx.keys():
        # Normalise and whitelist-check once per material, not once per record.
        sm = return_m(full_norm, m)
        if sm not in reserve_lookup:
            continue
        for v, records in idx[m].items():
            if not records:
                # Nothing to record; do not create an empty entry for v.
                continue
            value = float(v)
            pairs = value_doi.setdefault(value, [])
            known = seen_pairs.setdefault(value, set())
            for s in records:
                pair = (s[1], sm)  # s[1] is the DOI of the record
                if pair not in known:
                    known.add(pair)
                    pairs.append(pair)
    ranked = sorted(value_doi.items(), key=lambda x: x[0], reverse=True)
    json_str = json.dumps(ranked, indent=4)
    with open('doi_show/idx/'+dir_name+'.json', 'w', encoding='utf-8') as json_file:
        json_file.write(json_str)

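# Support functions (update_type, update_ca, update_sa, update_idx, update_tran, update_du) — these cells must be run before the doi-centered cell above, which calls them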

In [12]:
# type
import re


def update_type(doi_dict, sen_dict):
    """Count nano-morphology keywords per material and per DOI.

    For every occurrence of a keyword from ``nano_types`` in a sentence, the
    material whose recorded position (first element of its entry in
    ``sen_dict[sentence]['materials']``; presumably a character offset) lies
    closest to the keyword, and strictly less than 100 away, is selected.  If
    the normalised name of that material is already present in
    ``doi_dict[doi]['material']``, the counter stored under the keyword is
    incremented.

    Relies on the notebook-level ``full_norm`` table and ``return_m``.

    Returns the same ``doi_dict`` object, modified in place.
    """
    nano_types = ['nanoparticle', 'nanofiber', 'nanotube', 'nanorod', 'nanoplate', 'nanosheet', 'nanoribbon', 'nanocomposite', 'nanofoam',
                  'nanoporous', 'nanopores', 'nanocrystalline', 'nanocrystals', 'nanowires', 'nanotexture']
    for sentence in tqdm(list(sen_dict.keys())):
        info = sen_dict[sentence]
        doi = info['doi']
        for keyword in nano_types:
            if keyword not in sentence:
                continue
            for hit in re.finditer(keyword, sentence):
                keyword_pos = hit.start()
                # Nearest material wins; ties go to the one listed first, and
                # nothing at distance 100 or more is accepted.
                nearest, best_gap = '', 100
                for mat in info['materials'].keys():
                    gap = abs(info['materials'][mat][0] - keyword_pos)
                    if gap < best_gap:
                        nearest, best_gap = mat, gap
                m = return_m(full_norm, nearest)
                paper_materials = doi_dict[doi]['material']
                if m in paper_materials.keys():
                    paper_materials[m][keyword] = paper_materials[m].get(keyword, 0) + 1
    return doi_dict

In [13]:
# ca
def update_ca(doi_dict, dir_name):
    """Fill the ``'ca'`` list of each DOI record.

    Reads ``angle_update/<dir_name>_ca.json`` (material -> value -> list of
    records, where ``record[1]`` is a DOI) and appends each value once to
    ``doi_dict[doi]['ca']`` when the normalised material is already listed
    under ``doi_dict[doi]['material']``.  "ca" presumably stands for contact
    angle — confirm against the extraction step.

    Returns the same ``doi_dict`` object, modified in place.
    """
    source_path = 'angle_update/' + dir_name + '_ca.json'
    with open(source_path, 'r', encoding='utf-8') as handle:
        records_by_material = json.load(handle)
    for material, by_value in records_by_material.items():
        for value, records in by_value.items():
            for record in records:
                # record[0] (presumably the source sentence) is not used here.
                _sentence, doi = record[0], record[1]
                normalised = return_m(full_norm, material)
                paper = doi_dict[doi]
                if normalised not in paper['material'].keys():
                    continue
                if value not in paper['ca']:
                    paper['ca'].append(value)
    return doi_dict

In [14]:
# sa
def update_sa(doi_dict, dir_name):
    """Fill the ``'sa'`` list of each DOI record.

    Reads ``angle_update/<dir_name>_sa.json`` (material -> value -> list of
    records, where ``record[1]`` is a DOI) and appends each value once to
    ``doi_dict[doi]['sa']`` when the normalised material is already listed
    under ``doi_dict[doi]['material']``.  "sa" presumably stands for sliding
    angle — confirm against the extraction step.

    Returns the same ``doi_dict`` object, modified in place.
    """
    source_path = 'angle_update/' + dir_name + '_sa.json'
    with open(source_path, 'r', encoding='utf-8') as handle:
        records_by_material = json.load(handle)
    for material, by_value in records_by_material.items():
        for value, records in by_value.items():
            for record in records:
                # record[0] (presumably the source sentence) is not used here.
                _sentence, doi = record[0], record[1]
                normalised = return_m(full_norm, material)
                paper = doi_dict[doi]
                if normalised not in paper['material'].keys():
                    continue
                if value not in paper['sa']:
                    paper['sa'].append(value)
    return doi_dict

In [15]:
# index
def update_idx(doi_dict, dir_name):
    """Fill the ``'idx'`` list of each DOI record.

    Reads ``index_update/<dir_name>_idx.json`` (material -> value -> list of
    records, where ``record[1]`` is a DOI).  A value is appended to
    ``doi_dict[doi]['idx']`` when the normalised material is already listed
    under ``doi_dict[doi]['material']``, the value is not yet in the list and
    its numeric value is at least 1.  Values are stored as read from the
    file (the JSON keys), not converted to numbers.

    Returns the same ``doi_dict`` object, modified in place.
    """
    source_path = 'index_update/' + dir_name + '_idx.json'
    with open(source_path, 'r', encoding='utf-8') as handle:
        records_by_material = json.load(handle)
    for material, by_value in records_by_material.items():
        for value, records in by_value.items():
            for record in records:
                # record[0] (presumably the source sentence) is not used here.
                _sentence, doi = record[0], record[1]
                normalised = return_m(full_norm, material)
                paper = doi_dict[doi]
                if normalised not in paper['material'].keys():
                    continue
                already_listed = value in paper['idx']
                if not already_listed and float(value) >= 1:
                    paper['idx'].append(value)
    return doi_dict

In [16]:
# transmittance
def update_tran(doi_dict, dir_name):
    """Fill the ``'tr'`` (transmittance) list of each DOI record.

    Reads ``transmittance_update/<dir_name>_tr.json`` (material -> value ->
    list of records, where ``record[1]`` is a DOI) and appends each value
    once to ``doi_dict[doi]['tr']`` when the normalised material is already
    listed under ``doi_dict[doi]['material']``.

    Returns the same ``doi_dict`` object, modified in place.
    """
    source_path = 'transmittance_update/' + dir_name + '_tr.json'
    with open(source_path, 'r', encoding='utf-8') as handle:
        records_by_material = json.load(handle)
    for material, by_value in records_by_material.items():
        for value, records in by_value.items():
            for record in records:
                # record[0] (presumably the source sentence) is not used here.
                _sentence, doi = record[0], record[1]
                normalised = return_m(full_norm, material)
                paper = doi_dict[doi]
                if normalised not in paper['material'].keys():
                    continue
                if value not in paper['tr']:
                    paper['tr'].append(value)
    return doi_dict

In [17]:
# durability
def update_du(doi_dict, dir_name):
    """Fill the ``'du'`` (durability) counters of each DOI record.

    Reads ``durability/<dir_name>_op.json`` (material -> label -> list of
    records, where ``record[1]`` is a DOI).  The ``'rate'`` label is skipped.
    For every remaining record whose normalised material is already listed
    under ``doi_dict[doi]['material']``, the counter
    ``doi_dict[doi]['du'][material][label]`` is incremented.

    Returns the same ``doi_dict`` object, modified in place.
    """
    source_path = 'durability/' + dir_name + '_op.json'
    with open(source_path, 'r', encoding='utf-8') as handle:
        records_by_material = json.load(handle)
    for material, by_label in records_by_material.items():
        for label, records in by_label.items():  # driver, barrier
            if label == 'rate':
                continue
            for record in records:
                # record[0] (presumably the source sentence) is not used here.
                _sentence, doi = record[0], record[1]
                normalised = return_m(full_norm, material)
                paper = doi_dict[doi]
                if normalised not in paper['material'].keys():
                    continue
                label_counts = paper['du'].setdefault(normalised, {})
                label_counts[label] = label_counts.get(label, 0) + 1
    return doi_dict