In [4]:
import os
from tqdm import tqdm

def get_text(dir_name):
    """Read every article file in ``dir_name`` and extract its DOI and body.

    Each file is expected to carry the DOI on its first line and the complete
    article text on its second line.  The body is trimmed so that it starts at
    a known front-matter heading and stops before the last occurrence of
    'Reference'.

    Args:
        dir_name: Path of the directory that holds the article text files.

    Returns:
        dict: ``{file_name: {'text': body, 'doi': doi}}`` with one entry per
        regular file that contains at least two lines.  Sub-directories and
        shorter files are skipped (a warning is issued for short files).
    """
    import warnings

    start_words = ['Graphical abstract', 'Abstract', 'Corresponding author',
                   'Correspondence to:', 'Introduction', 'Keywords']
    text_dic = {}
    for e in tqdm(os.listdir(dir_name)):
        path = os.path.join(dir_name, e)
        if not os.path.isfile(path):
            # e.g. checkpoint folders created next to the data files
            continue
        with open(path, 'r', encoding='utf-8') as f:
            data = f.readlines()
        if len(data) < 2:
            # Without a second line there is no article text to slice.
            warnings.warn('skipping %s: expected a DOI line and a text line' % path)
            continue

        doi = data[0].rstrip('\n')
        body = data[1]

        # Every start word is tried in list order and a later hit overrides an
        # earlier one, so the LAST listed word that occurs decides the start.
        start = 0
        for s in start_words:
            pos = body.find(s)
            if pos != -1:
                start = pos

        # Cut at the last 'Reference'; keep everything when it is absent.
        end = body.rfind('Reference')
        if end == -1:
            end = len(body)

        text_dic[e] = {'text': body[start:end], 'doi': doi}
    return text_dic

In [5]:
from chemdataextractor import Document
from tqdm import tqdm
from chemdataextractor.doc import Paragraph
import re

def get_material(sens):
    """Find material names and abbreviation definitions in a piece of text.

    The text is split into sentences with ChemDataExtractor's ``Paragraph``.
    Each sentence is parsed again as a ``Document`` to obtain its chemical
    entity mentions (``cems``) and abbreviation definitions.  A mention that
    is only a part of a longer token joined by one of '–', ':', '/', '-' is
    replaced by that whole token (the "full name").

    Args:
        sens: Text to analyse; may contain many sentences.

    Returns:
        tuple: ``(sen_mat, abb_sum)``.
            ``sen_mat`` maps sentence text to
            ``{'materials': {name: [start, end]}}`` for every sentence in
            which ChemDataExtractor reported at least one mention; offsets are
            the ones reported by ChemDataExtractor / the regex match.
            ``abb_sum`` maps a material name to a list holding ``tup[0][0]``
            of every abbreviation definition whose ``tup[1][0]`` occurs in
            that name (presumably: first abbreviation token / first long-form
            token - TODO confirm against the ChemDataExtractor docs).
    """
    punc = ['–', ':', '/', '-']
    sen_mat = {}
    abb_sum = {}
    para = Paragraph(sens)
    for sen in para.sentences:
        doc = Document(sen.text)
        cems = doc.cems
        if not cems:
            continue

        # start offset -> mention; mentions with 5 or more spaces are ignored
        cems_dic = {}
        for c in cems:
            if c.text.count(' ') < 5:
                cems_dic[c.start] = {'name': c.text, 'end': c.end}
        # Offsets of mentions not yet replaced by a full name.  Built from the
        # dict keys so that an offset can never be listed twice.
        tmp = list(cems_dic)

        # collect fullname position and replace
        for tok in sen.tokens:
            # Only a token with joining punctuation before its last character
            # can be a full name.
            if not any(p in tok.text[:-1] for p in punc):
                continue
            # Iterate over a snapshot: `tmp` shrinks inside the loop, and
            # iterating the live list would skip the mention that follows a
            # replaced one.
            for t in list(tmp):
                name = cems_dic[t]['name']
                # The token must contain the mention and be longer than it.
                if name not in tok.text or name == tok.text:
                    continue
                # Escape the token so characters such as '(', '+', '[' or '.'
                # are matched literally instead of being read as regex syntax.
                for pos in re.finditer(re.escape(tok.text), sen.text):
                    if pos.start() <= t and pos.end() >= cems_dic[t]['end']:
                        del cems_dic[t]
                        tmp.remove(t)
                        cems_dic[pos.start()] = {'name': tok.text, 'end': pos.end()}
                        # Later matches start after this mention; stop here.
                        break

        materials = {}
        for start, entry in cems_dic.items():
            materials[entry['name']] = [start, entry['end']]
        sen_mat[sen.text] = {'materials': materials}

        abb = doc.abbreviation_definitions
        if abb:
            for tup in abb:
                for name in materials:
                    if tup[1][0] in name:
                        abb_sum.setdefault(name, []).append(tup[0][0])
    return sen_mat, abb_sum

In [6]:
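# One dictionary stores each original sentence together with the material names found in it and their positions.
# The other dictionary aggregates the (material name, abbreviation) pairs together with their DOIs.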

def return_dict(topic_dic):
    """Run ``get_material`` on every article and aggregate the results.

    Args:
        topic_dic: Mapping ``{key: {'text': ..., 'doi': ...}}``; only the
            'text' and 'doi' fields of each value are read.

    Returns:
        tuple: ``(sen_sum, pair_sum)``.
            ``sen_sum`` maps sentence text to the per-sentence dict returned
            by ``get_material`` with an added 'doi' field.  If the same
            sentence occurs in several articles the last one wins.
            ``pair_sum`` maps material name to ``{abbreviation: [doi, ...]}``.
            Every DOI is listed at most once per (name, abbreviation) pair,
            so the list length is the number of distinct DOIs.
    """
    sen_sum = {}
    pair_sum = {}
    for o in tqdm(list(topic_dic.keys())):
        doi = topic_dic[o]['doi']
        sen_mat, abb_mat = get_material(topic_dic[o]['text'])

        for s, info in sen_mat.items():
            info['doi'] = doi
            sen_sum[s] = info

        for name, abbreviations in abb_mat.items():
            doi_by_abb = pair_sum.setdefault(name, {})
            for new in abbreviations:
                dois = doi_by_abb.setdefault(new, [])
                # An abbreviation can be reported several times within one
                # article; count that article only once.
                if doi not in dois:
                    dois.append(doi)
    return sen_sum, pair_sum

In [8]:
# Anti-soiling_Elsevier_53
# Antistatic_Elsevier_156
# Hydrophilic_Elsevier_8921
# Hydrophobic_Elsevier_13677
# Oleophobic_Elsevier_581
# Omniphobic_Elsevier_143
# Photocatalytic_Elsevier_13437
# SC_Elesever_2044
import os
import json

dir_name = 'Oleophobic_Elsevier_581'
ol_dict = get_text(dir_name)
sen_dict, pair_dict = return_dict(ol_dict)

# Count in how many sentences each material name occurs (abbreviations are
# not merged yet) and keep a ranked copy on disk.
full_mat_dict = {}
for sentence_info in sen_dict.values():
    for mat in sentence_info['materials']:
        full_mat_dict[mat] = full_mat_dict.get(mat, 0) + 1
sorted_full = sorted(full_mat_dict.items(), key=lambda item: item[1], reverse=True)

out_dir = 'material_names/' + dir_name
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Both dictionaries are dumped the same way, one JSON file each.
for file_name, payload in (('sen_dict.json', sen_dict), ('pair_dict.json', pair_dict)):
    json_str = json.dumps(payload, indent=4)
    with open(out_dir + '/' + file_name, 'w', encoding='utf-8') as json_file:
        json_file.write(json_str)

# Tab-separated ranking: rank, material name, sentence count.
with open(out_dir + '/full_mat_dict.txt', 'w', encoding='utf-8') as f:
    for rank, (name, count) in enumerate(sorted_full, start=1):
        f.write(str(rank) + '\t' + name + '\t' + str(count) + '\n')


100%|██████████| 581/581 [00:00<00:00, 928.14it/s]
100%|██████████| 581/581 [13:33<00:00,  1.40s/it]


# Combine material names and abbreviations

### summarize pairs

In [8]:
import os
import json

# Minimum support: a (name, abbreviation) pair is kept only when it is backed
# by MORE than this many distinct DOIs.
MIN_PAIR_DOIS = 5

# Merge the per-topic pair dictionaries into
#   material name -> {abbreviation: [distinct DOIs]}
# Topics are visited in sorted order and DOIs are always de-duplicated, so the
# result does not depend on the order in which the OS lists the directories.
large_pairs = {}
for topic in sorted(os.listdir('material_names')):
    pair_path = os.path.join('material_names', topic, 'pair_dict.json')
    if not os.path.isfile(pair_path):
        # stray file, or a topic whose pairs have not been extracted yet
        continue
    with open(pair_path, 'r', encoding='utf-8') as f:
        pair_dict = json.load(f)
    for p, abbreviations in pair_dict.items():
        merged = large_pairs.setdefault(p, {})
        for abb, dois in abbreviations.items():
            known = merged.setdefault(abb, [])
            for doi in dois:
                if doi not in known:
                    known.append(doi)

# For every material name keep only its best-supported abbreviation (first one
# wins on ties) and regroup as
#   abbreviation -> {material name: number of supporting DOIs}
final_pairs = {}
for lp, abbreviations in large_pairs.items():
    max_len = 0
    max_abb = None
    for abb, dois in abbreviations.items():
        if len(dois) > max_len:
            max_len = len(dois)
            max_abb = abb
    if max_len > MIN_PAIR_DOIS:
        final_pairs.setdefault(max_abb, {})[lp] = max_len
            
''' 
for f in final_pairs.keys():
    for abb in final_pairs[f].keys():
        flag = 1
        for c in f:
            if c.lower() not in abb.lower():
                flag = 0
        if flag == 0:
            print(f, abb)
    # print(f, final_pairs[f])
 
json_str = json.dumps(final_pairs, indent=4)
with open('final_pairs.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_str)
'''

" \nfor f in final_pairs.keys():\n    for abb in final_pairs[f].keys():\n        flag = 1\n        for c in f:\n            if c.lower() not in abb.lower():\n                flag = 0\n        if flag == 0:\n            print(f, abb)\n    # print(f, final_pairs[f])\n \njson_str = json.dumps(final_pairs, indent=4)\nwith open('final_pairs.json', 'w', encoding='utf-8') as json_file:\n    json_file.write(json_str)\n"

In [9]:
# Chemical element symbols, listed from 'H' to 'Uue'.
# Index-aligned with ELEMENT_NAMES below: combine_dic() finds a symbol with
# ELEMENTS.index(symbol) and uses the same index to fetch the element's name.
ELEMENTS = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K',
                'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr',
                'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I',
                'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb',
                'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr',
                'Ra', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr', 'Rf',
                'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og', 'Uue']

# Lower-case element names, index-aligned with ELEMENTS.
# combine_dic() compares these strings (and their capitalised form) verbatim
# with the extracted material names, so only these exact spellings are
# recognised (e.g. 'aluminium', 'cesium', 'sulfur').
ELEMENT_NAMES = ['hydrogen', 'helium', 'lithium', 'beryllium', 'boron', 'carbon', 'nitrogen', 'oxygen', 'fluorine',
                     'neon', 'sodium', 'magnesium', 'aluminium', 'silicon', 'phosphorus', 'sulfur', 'chlorine', 'argon',
                     'potassium', 'calcium', 'scandium', 'titanium', 'vanadium', 'chromium', 'manganese', 'iron',
                     'cobalt', 'nickel', 'copper', 'zinc', 'gallium', 'germanium', 'arsenic', 'selenium', 'bromine',
                     'krypton', 'rubidium', 'strontium', 'yttrium', 'zirconium', 'niobium', 'molybdenum', 'technetium',
                     'ruthenium', 'rhodium', 'palladium', 'silver', 'cadmium', 'indium', 'tin', 'antimony', 'tellurium',
                     'iodine', 'xenon', 'cesium', 'barium', 'lanthanum', 'cerium', 'praseodymium', 'neodymium',
                     'promethium', 'samarium', 'europium', 'gadolinium', 'terbium', 'dysprosium', 'holmium', 'erbium',
                     'thulium', 'ytterbium', 'lutetium', 'hafnium', 'tantalum', 'tungsten', 'rhenium', 'osmium',
                     'iridium', 'platinum', 'gold', 'mercury', 'thallium', 'lead', 'bismuth', 'polonium', 'astatine',
                     'radon', 'francium', 'radium', 'actinium', 'thorium', 'protactinium', 'uranium', 'neptunium',
                     'plutonium', 'americium', 'curium', 'berkelium', 'californium', 'einsteinium', 'fermium',
                     'mendelevium', 'nobelium', 'lawrencium', 'rutherfordium', 'dubnium', 'seaborgium', 'bohrium',
                     'hassium', 'meitnerium', 'darmstadtium', 'roentgenium', 'copernicium', 'nihonium', 'flerovium',
                     'moscovium', 'livermorium', 'tennessine', 'oganesson', 'ununennium']

In [15]:
def combine_dic(sen_dict, pairs):
    """Group material-name counts so that a name and its aliases share an entry.

    A material name is counted once for every sentence that lists it.  Names
    that are keys of ``pairs`` are grouped with those of their ``pairs``
    entries that were also seen; names that are element symbols (see
    ``ELEMENTS``) are grouped with the lower-case and capitalised element
    name.  Every remaining name that was not absorbed into a group gets an
    entry of its own.

    Args:
        sen_dict: Mapping sentence -> dict with a 'materials' collection of
            material names (only the names are used).
        pairs: Mapping name -> {alias: ...}; only the keys of the inner dicts
            are used.

    Returns:
        dict: ``{name: group}``.  A group built from ``pairs`` or from an
        element symbol is ``{member: count, ..., 'num': sum of the counts}``;
        a stand-alone name maps to ``{'num': count}``.
    """
    # Number of sentences in which each material name occurs.
    full_mat_dict = {}
    for info in sen_dict.values():
        for mat in info['materials']:
            full_mat_dict[mat] = full_mat_dict.get(mat, 0) + 1

    tmp_dict = {}
    # Names already placed in a group.  A set keeps the membership test below
    # constant-time; a list made this step quadratic in the number of names.
    store_ = set()
    for f, count in full_mat_dict.items():
        if f in pairs:
            candidates = list(pairs[f])
        elif f in ELEMENTS:
            full_f = ELEMENT_NAMES[ELEMENTS.index(f)]
            candidates = [full_f, full_f[0].upper() + full_f[1:]]
        else:
            continue
        group = {f: count}
        store_.add(f)
        for alias in candidates:
            if alias in full_mat_dict:
                group[alias] = full_mat_dict[alias]
                store_.add(alias)
        tmp_dict[f] = group

    # Names that neither head a group nor were absorbed into one.
    for f, count in full_mat_dict.items():
        if f not in store_:
            tmp_dict[f] = {'num': count}

    # Total of a group = sum of the counts of its members.
    for group in tmp_dict.values():
        if 'num' not in group:
            group['num'] = sum(group.values())
    return tmp_dict

In [26]:
# Anti-soiling_Elsevier_53
# Antistatic_Elsevier_156
# Hydrophilic_Elsevier_8921
# Hydrophobic_Elsevier_13677
# Oleophobic_Elsevier_581
# Omniphobic_Elsevier_143
# Photocatalytic_Elsevier_13437
# SC_Elesever_2044
import os
import json

dir_name = 'SC_Elesever_2044'
with open('material_names/'+dir_name+'/sen_dict.json', 'r', encoding='utf-8') as f:
    sen_dict = json.load(f)
# NOTE: final_pairs.json must already exist on disk; the code that dumps
# `final_pairs` in the pair-summary cell is currently disabled (it sits inside
# a string literal).
with open('final_pairs.json', 'r', encoding='utf-8') as f:
    pairs = json.load(f)
comb_dict = combine_dic(sen_dict, pairs)

json_str = json.dumps(comb_dict, indent=4)
with open('material_names/'+dir_name+'/com_dict.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_str)

# Ranked, tab-separated report: rank, name, total count and, for grouped
# names, the per-member counts.  Names with a total of 1 are left out.
sorted_com = sorted(comb_dict.items(), key=lambda item: item[1]['num'], reverse=True)
with open('material_names/'+dir_name+'/com_mat_dict.txt', 'w', encoding='utf-8') as f:
    for i, (name, group) in enumerate(sorted_com):
        total = group['num']
        if total <= 1:
            continue
        line = str(i+1)+'\t'+name+'\t'+str(total)
        # Work on a filtered copy: deleting 'num' from the group itself would
        # corrupt comb_dict and make this cell fail when it is run again.
        members = {k: v for k, v in group.items() if k != 'num'}
        if members:
            line += '\t'+str(members)
        f.write(line+'\n')