# 原理：
三个版本：一个是基于mat2vec的预测，一个是基于自己训练的vec的预测，一个是基于mat2vec+继续训练的预测  
方法：在词向量空间中，选取既与 self-cleaning 相近、又与 hydrophobic 等相关性质词相近的词作为候选，并按这两个相似度排名的平均值排序。

In [4]:
import os
from tqdm import tqdm

def get_text(dir_name):
    """Read every file in ``dir_name`` and extract its DOI and trimmed body.

    Each file is read as lines: the first line is stored as the DOI and the
    second line is treated as the full text (presumably one article per
    file, body on a single line -- verify against the corpus dump).

    The body is trimmed as follows:
      * start: position of a start marker found in the body.  Markers are
        tried in list order and a later marker overrides an earlier one, so
        the LAST matching marker in ``start_words`` wins; 0 if none match.
      * end: position of the last occurrence of 'Reference', or the end of
        the body when it does not occur.

    Args:
        dir_name: directory containing the text files.

    Returns:
        dict mapping file name -> {'text': trimmed body, 'doi': first line}.
        A file that lacks a second line yields an empty 'text'; an empty
        file additionally yields an empty 'doi'.
    """
    start_words = ['Graphical abstract', 'Abstract', 'Corresponding author',
                   'Correspondence to:', 'Introduction', 'Keywords']
    text_dic = {}
    for e in tqdm(os.listdir(dir_name)):
        with open(os.path.join(dir_name, e), 'r', encoding='utf-8') as f:
            data = f.readlines()

        # Strip only the line terminator; slicing off the last character
        # would eat a real DOI character when the line has no newline.
        doi = data[0].rstrip('\n') if data else ''
        body = data[1] if len(data) > 1 else ''

        start = 0
        for marker in start_words:
            position = body.find(marker)
            if position != -1:
                start = position

        end = body.rfind('Reference')
        if end == -1:
            end = len(body)

        text_dic[e] = {'text': body[start:end], 'doi': doi}
    return text_dic

# mat2vec版本

In [1]:
from gensim.models import Word2Vec


# Anti-soiling_Elsevier_53
# Antistatic_Elsevier_156
# Hydrophilic_Elsevier_8921
# Hydrophobic_Elsevier_13677
# Oleophobic_Elsevier_581
# Omniphobic_Elsevier_143
# Photocatalytic_Elsevier_13437
# SC_Elesever_2044
model = Word2Vec.load('mat2vec/training/models/pretrained_embeddings')
word_vectors = model.wv
# The vocabulary has no 'self-cleaning' token, so avg('self', 'cleaning') stands in for it.
# The vocabulary has no 'anti-soiling' token, so avg('anti', 'soiling') stands in for it.
proper = ['soiling', 'antistatic', 'hydrophilic', 'hydrophobic', 'oleophobic', 'omniphobic', 'photocatalytic']
vec1 = (word_vectors['self'] + word_vectors['cleaning']) / 2
similar1 = word_vectors.most_similar(positive=[vec1], topn=5000)
# Rank of every neighbour of the self-cleaning vector, so each property
# neighbour can be matched with one dict lookup instead of a 5000-item scan.
rank_in_similar1 = {entry[0]: j for j, entry in enumerate(similar1)}

# store[property name][word] = mean of the word's rank in both neighbour lists
# (lower means closer to both self-cleaning and the property).
store = {}
for p in proper:
    if p == 'soiling':
        vec2 = (word_vectors['anti'] + word_vectors['soiling']) / 2
        name = 'anti-soiling'
    else:
        vec2 = word_vectors[p]
        name = p
    store[name] = {}
    similar2 = word_vectors.most_similar(positive=[vec2], topn=5000)
    for i, s2 in enumerate(similar2):
        j = rank_in_similar1.get(s2[0])
        if j is not None:
            store[name][s2[0]] = (i+j)/2

In [5]:
from mat2vec.processing import MaterialsTextProcessor
import nltk

# Build the vocabulary of the self-cleaning corpus: every distinct token the
# mat2vec text processor emits for any sentence, in first-seen order.
text_processor = MaterialsTextProcessor()
dir_name = 'SC_Elesever_2044'
ol_dict = get_text(dir_name)
voc = []
# Companion set for O(1) duplicate checks; `voc` itself stays an ordered list
# so later cells that read it are unaffected.
voc_seen = set()
for o in tqdm(ol_dict.keys()):
    text = ol_dict[o]['text']
    sens = nltk.sent_tokenize(text)
    for s in sens:
        processed, _ = text_processor.process(s)
        for p in processed:
            if p not in voc_seen:
                voc_seen.add(p)
                voc.append(p)

100%|██████████| 2044/2044 [00:01<00:00, 1511.82it/s]
100%|██████████| 2044/2044 [30:33<00:00,  1.11it/s] 


In [28]:
# Write one tab-separated file per property: word, mean rank, and a 0/1 flag
# telling whether the word occurs in the self-cleaning corpus vocabulary.
voc_lookup = set(voc)  # built once; membership on the list is O(len(voc)) per row
for s in store.keys():
    with open('prediction/mat2vec/'+s+'_prediction.txt', 'w', encoding='utf-8') as f:
        # Ascending mean rank: best candidates first.
        sorted_store = sorted(store[s].items(), key=lambda item:item[1])
        for ss in sorted_store:
            judge = 1 if ss[0] in voc_lookup else 0
            f.write(ss[0]+'\t'+str(ss[1])+'\t'+str(judge)+'\n')

# 新训练版本

In [1]:
import json

# final_pairs is a two-level mapping: outer key -> {inner key: ...}.
# full_ab inverts it so that every inner key points back to its outer key
# (presumably variant spelling -> main material name; verify against the data).
with open('final_pairs.json', 'r', encoding='utf-8') as file:
    final_pairs = json.load(file)
full_ab = {
    inner_key: outer_key
    for outer_key, inner_map in final_pairs.items()
    for inner_key in inner_map.keys()
}

In [2]:
import string
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')


def simple(words):
    """Return ``words`` without punctuation characters and English stop words.

    A one-character token is kept unless it is a punctuation character; any
    other token is kept unless it appears in the ``stop`` list.
    """
    def keep(token):
        if len(token) == 1:
            return token not in string.punctuation
        return token not in stop

    return [token for token in words if keep(token)]

In [5]:
# Collect all text as token lists. Inside sentences that have annotated
# materials, each material mention is replaced by its main name (via full_ab)
# and kept as ONE token even when the mention spans several words.
all_sens = []
sc_voc = []
all_materials = []
# Companion set for O(1) duplicate checks; sc_voc stays an ordered list.
sc_voc_seen = set()
for topic in os.listdir('material_names'):
    materials = []
    materials_seen = set()
    with open('material_names/'+topic+'/sen_dict.json', 'r', encoding='utf-8') as f:
        mat_sens = json.load(f)
    ol_dict = get_text(topic)
    # Only the self-cleaning corpus contributes to sc_voc.
    is_sc_topic = topic == 'SC_Elesever_2044'
    for o in tqdm(ol_dict.keys()):
        text = ol_dict[o]['text']
        for s in nltk.sent_tokenize(text):
            if s in mat_sens:
                spans = mat_sens[s]['materials']
                starts = []
                ends = []
                for m in spans:
                    starts.append(spans[m][0])
                    ends.append(spans[m][1])
                    canonical = full_ab.get(m, m)
                    if canonical not in materials_seen:
                        materials_seen.add(canonical)
                        materials.append(canonical)
                # Starts and ends are sorted independently and paired by
                # position. NOTE(review): this assumes mentions never nest or
                # overlap -- confirm against sen_dict.json.
                starts.sort()
                ends.sort()
                tmp = []
                tmp_st = 0
                for st, en in zip(starts, ends):
                    # Ordinary text before the mention is word-tokenized ...
                    tmp.extend(nltk.word_tokenize(s[tmp_st:st]))
                    # ... and the mention itself becomes a single token.
                    mention = s[st:en]
                    tmp.append(full_ab.get(mention, mention))
                    tmp_st = en
                tmp.extend(nltk.word_tokenize(s[tmp_st:]))
            else:
                tmp = nltk.word_tokenize(s)
            sim_tmp = simple(tmp)
            all_sens.append(sim_tmp)
            if is_sc_topic:
                for token in sim_tmp:
                    if token not in sc_voc_seen:
                        sc_voc_seen.add(token)
                        sc_voc.append(token)
    all_materials.extend(materials)
                

100%|██████████| 52/52 [00:00<00:00, 669.66it/s]
100%|██████████| 52/52 [00:02<00:00, 20.01it/s]
100%|██████████| 155/155 [00:00<00:00, 898.04it/s]
100%|██████████| 155/155 [00:06<00:00, 25.24it/s]
100%|██████████| 8921/8921 [00:10<00:00, 870.91it/s] 
100%|██████████| 8921/8921 [13:23<00:00, 11.10it/s]
100%|██████████| 13677/13677 [00:15<00:00, 875.55it/s]
100%|██████████| 13677/13677 [24:05<00:00,  9.46it/s]
100%|██████████| 581/581 [00:00<00:00, 864.74it/s]
100%|██████████| 581/581 [00:30<00:00, 18.82it/s]
100%|██████████| 143/143 [00:00<00:00, 688.63it/s]
100%|██████████| 143/143 [00:07<00:00, 20.00it/s]
100%|██████████| 13405/13405 [00:15<00:00, 891.81it/s] 
100%|██████████| 13405/13405 [22:38<00:00,  9.87it/s] 
100%|██████████| 2044/2044 [00:02<00:00, 908.45it/s]
100%|██████████| 2044/2044 [50:42<00:00,  1.49s/it]  


In [33]:
import os

# Rebuild the material list straight from the annotation files: for every
# topic, collect each annotated material once, mapped through full_ab when a
# mapping exists and kept as-is otherwise.
all_materials = []
for topic in os.listdir('material_names'):
    materials = []
    materials_seen = set()  # O(1) duplicate check; `materials` keeps first-seen order
    with open('material_names/'+topic+'/sen_dict.json', 'r', encoding='utf-8') as f:
        mat_sens = json.load(f)
    for s in tqdm(mat_sens.keys()):
        for m in mat_sens[s]['materials'].keys():
            # The fallback to `m` is the fix: previously the `else` hung off
            # the duplicate check, so materials without a full_ab entry were
            # silently dropped.
            canonical = full_ab.get(m, m)
            if canonical not in materials_seen:
                materials_seen.add(canonical)
                materials.append(canonical)
    all_materials.extend(materials)

100%|██████████| 2516/2516 [00:00<?, ?it/s]
100%|██████████| 10594/10594 [00:00<00:00, 706127.05it/s]
100%|██████████| 676568/676568 [00:01<00:00, 486521.31it/s]
100%|██████████| 1023296/1023296 [00:02<00:00, 482379.17it/s]
100%|██████████| 42823/42823 [00:00<00:00, 552858.53it/s]
100%|██████████| 11784/11784 [00:00<00:00, 743153.88it/s]
100%|██████████| 1350480/1350480 [00:02<00:00, 631072.21it/s]
100%|██████████| 163769/163769 [00:00<00:00, 666588.03it/s]


In [19]:
# Number of distinct material names across all topics.
unique_materials = set(all_materials)
tmp = list(unique_materials)
print(len(tmp))

226599


In [16]:
# Rewrite every sentence, replacing each token that has an entry in `change`
# by its replacement and leaving all other tokens untouched.
new_sens = [
    [change[token] if token in change else token for token in sentence]
    for sentence in tqdm(all_sens)
]

100%|██████████| 8036846/8036846 [02:02<00:00, 65413.79it/s] 


In [2]:
# The string below is the disabled dump step that produced the three JSON
# files; it is kept verbatim and is not executed.
'''
json_str = json.dumps(new_sens, indent=4)
with open('all_sens.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_str)
json_str = json.dumps(new_vc, indent=4)
with open('sc_voc.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_str)
json_str = json.dumps(new_materials, indent=4)
with open('all_materials.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_str)
'''


def _read_json(path):
    """Load and return the JSON document stored at ``path`` (UTF-8)."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Reload the cached results instead of recomputing them.
all_sens = _read_json('all_sens.json')
sc_voc = _read_json('sc_voc.json')
all_materials = _read_json('all_materials.json')


In [3]:
from gensim.models import Word2Vec
import datetime

# Train a Word2Vec model on the collected sentences, save it, and report the
# elapsed wall-clock time in whole seconds.
starttime = datetime.datetime.now()
# NOTE(review): `size` is the gensim < 4.0 keyword (renamed `vector_size` in
# gensim 4) -- confirm the installed gensim version before upgrading.
our_model = Word2Vec(all_sens, size=300, window=12, workers=4)
our_model.save("our_embedding/word2vec_our.model")
# our_model = Word2Vec.load("our_embedding/word2vec_our.model")
endtime = datetime.datetime.now()
# The subtraction must sit inside print(): `print (a - b).seconds` is Python 2
# syntax and raises AttributeError on Python 3. total_seconds() also keeps
# counting past 24 hours, unlike the `.seconds` field.
print(int((endtime - starttime).total_seconds()))

In [6]:
from tqdm import tqdm

word_vectors = our_model.wv
# print(word_vectors.vocab['TiO2 [2–4]'].count)
proper = ['anti-soiling', 'antistatic', 'hydrophilic', 'hydrophobic', 'oleophobic', 'omniphobic', 'photocatalytic']
vec1 = word_vectors['self-cleaning']
similar1 = word_vectors.most_similar(positive=[vec1], topn=10000)
# Rank of each self-cleaning neighbour, for O(1) matching below.
rank_in_similar1 = {entry[0]: j for j, entry in enumerate(similar1)}
# Set view of the material list; membership on the list is a linear scan.
material_lookup = set(all_materials)

# store[property][word] = mean of the word's rank in both neighbour lists,
# restricted to words that are known materials.
store = {}
for p in proper:
    vec2 = word_vectors[p]
    store[p] = {}
    similar2 = word_vectors.most_similar(positive=[vec2], topn=10000)
    for i, s2 in tqdm(enumerate(similar2)):
        j = rank_in_similar1.get(s2[0])
        if j is not None and s2[0] in material_lookup:
            store[p][s2[0]] = (i+j)/2

10000it [00:39, 255.45it/s]
10000it [00:36, 271.96it/s]
10000it [00:29, 343.10it/s]
10000it [00:30, 323.06it/s]
10000it [00:34, 292.34it/s]
10000it [00:34, 286.28it/s]
10000it [00:33, 295.89it/s]


In [13]:
# Disabled cell, kept as a string literal so it does not run. When enabled it
# keeps the materials accepted by filt(), cleans them with filt2(), records
# every rename in `change`, and builds `new_vc` by applying those renames to
# sc_voc.
'''
new_materials = []
change = {}
for m in tqdm(all_materials):
# for m in sc_voc:
    if filt(m):
        nm = filt2(m)
        if nm!='delete':
            if nm not in new_materials:
                new_materials.append(nm)
            if nm!=m:
                change[m] = nm
print(len(sc_voc))
new_vc = []
for m in sc_voc:
    if m in change.keys():
        new_vc.append(change[m])
    else:
        new_vc.append(m)
'''


100%|██████████| 325823/325823 [27:54<00:00, 194.59it/s] 


In [10]:
def filt(word):
    """Return True when ``word`` is an acceptable candidate string.

    A word is rejected when it contains no alphabetic character at all, or
    when it contains the substring ' or ' (with surrounding spaces).
    """
    has_letter = any(ch.isalpha() for ch in word)
    return has_letter and ' or ' not in word

In [11]:
def filt2(word):
    """Clean a bracketed fragment out of ``word`` when it is not meaningful.

    * Without both '[' and ']' the word is returned unchanged.
    * When the first ']' comes before the first '[', the marker string
      'delete' is returned.
    * Otherwise the text between the first '[' and the first ']' is checked
      with filt(): if it passes, the word is returned unchanged; if not,
      every occurrence of that bracketed fragment (brackets included) is
      removed and the result is stripped of surrounding whitespace.
    """
    if '[' not in word or ']' not in word:
        return word

    open_at = word.index('[')
    close_at = word.index(']')
    if open_at > close_at:
        return 'delete'

    if filt(word[open_at + 1:close_at]):
        return word

    bracketed = word[open_at:close_at + 1]
    return word.replace(bracketed, '').strip()

In [7]:
# Write one tab-separated file per property: word, mean rank, and a 0/1 flag
# telling whether the word occurs in the self-cleaning corpus vocabulary.
sc_voc_lookup = set(sc_voc)  # built once; membership on the list is a linear scan
for s in tqdm(store.keys()):
    with open('prediction/ourvec/'+s+'_prediction.txt', 'w', encoding='utf-8') as f:
        # Ascending mean rank: best candidates first.
        sorted_store = sorted(store[s].items(), key=lambda item:item[1])
        for ss in sorted_store:
            judge = 1 if ss[0] in sc_voc_lookup else 0
            f.write(ss[0]+'\t'+str(ss[1])+'\t'+str(judge)+'\n')

100%|██████████| 7/7 [00:01<00:00,  3.82it/s]


# 继续训练版本

In [9]:
# normalize原来的材料词
from mat2vec.processing import MaterialsTextProcessor

# Normalise each material name with the mat2vec text processor. A name is
# replaced only when the processor returns exactly one token; `pro` records
# original -> normalised token for those names.
text_processor = MaterialsTextProcessor()
pro_materials = []
pro = {}
for m in tqdm(all_materials):
    processed, _ = text_processor.process(m)
    if len(processed) == 1:
        pro_materials.append(processed[0])
        pro[m] = processed[0]
    else:
        # Several tokens, or none at all: keep the original name. The empty
        # case used to fall through to processed[0] and raise IndexError.
        pro_materials.append(m)

100%|██████████| 216096/216096 [00:36<00:00, 5914.83it/s]


In [12]:
# Rewrite every sentence, replacing each token that has a normalised form in
# `pro` by that form and leaving all other tokens untouched.
pro_sens = [
    [pro[token] if token in pro else token for token in sentence]
    for sentence in tqdm(all_sens)
]

100%|██████████| 8036846/8036846 [00:49<00:00, 163091.21it/s]


In [13]:
# Continue training from the model file written by the training cell
# ("our_embedding/word2vec_our.model"), using the normalised sentences.
new_model = Word2Vec.load("our_embedding/word2vec_our.model")
# update=True: extend the existing vocabulary with pro_sens rather than
# rebuilding it (per the gensim build_vocab API -- confirm for the installed version).
new_model.build_vocab(pro_sens, update=True)
# One additional pass over pro_sens.
new_model.train(pro_sens,total_examples=new_model.corpus_count,epochs=1)
new_vectors = new_model.wv


In [22]:
proper = ['anti-soiling', 'antistatic', 'hydrophilic', 'hydrophobic', 'oleophobic', 'omniphobic', 'photocatalytic']
vec1 = new_vectors['self-cleaning']
similar1 = new_vectors.most_similar(positive=[vec1], topn=10000)
# Rank of each self-cleaning neighbour, for O(1) matching below.
rank_in_similar1 = {entry[0]: j for j, entry in enumerate(similar1)}

# store[property][word] = mean of the word's rank in both neighbour lists,
# restricted to words that if_material() accepts.
store = {}
for p in proper:
    vec2 = new_vectors[p]
    store[p] = {}
    similar2 = new_vectors.most_similar(positive=[vec2], topn=10000)
    for i, s2 in tqdm(enumerate(similar2)):
        j = rank_in_similar1.get(s2[0])
        # if_material() is only called for words present in both lists,
        # exactly as before.
        if j is not None and if_material(s2[0]):
            store[p][s2[0]] = (i+j)/2

10000it [00:31, 321.61it/s]
10000it [00:29, 333.54it/s]
10000it [00:27, 369.31it/s]
10000it [00:28, 350.07it/s]
10000it [00:28, 348.23it/s]
10000it [00:28, 348.19it/s]
10000it [00:28, 355.37it/s]


In [23]:
# Kept for backward compatibility with any later cell that reads it. It holds
# only ONE original spelling per normalised token, so it must not be used to
# decide corpus membership.
change_pro={v:k for k, v in pro.items()}

# `pro` is many-to-one (several original spellings can normalise to the same
# token), so collect ALL originals for each normalised token.
originals_by_token = {}
for original, token in pro.items():
    originals_by_token.setdefault(token, []).append(original)

sc_voc_lookup = set(sc_voc)  # built once; membership on the list is a linear scan

# Write one tab-separated file per property: word, mean rank, and a 0/1 flag
# telling whether the word occurs in the self-cleaning corpus vocabulary.
for s in tqdm(store.keys()):
    with open('prediction/ourvec_keep/'+s+'_prediction.txt', 'w', encoding='utf-8') as f:
        # Ascending mean rank: best candidates first.
        sorted_store = sorted(store[s].items(), key=lambda item:item[1])
        for ss in sorted_store:
            token = ss[0]
            # sc_voc holds un-normalised tokens, so the word counts as seen
            # when it occurs there itself or through any original spelling.
            seen_in_sc = token in sc_voc_lookup or any(
                original in sc_voc_lookup
                for original in originals_by_token.get(token, ())
            )
            judge = 1 if seen_in_sc else 0
            f.write(ss[0]+'\t'+str(ss[1])+'\t'+str(judge)+'\n')

100%|██████████| 7/7 [00:04<00:00,  1.54it/s]


In [16]:
from chemdataextractor import Document

def if_material(word):
    """Return True when ChemDataExtractor finds a chemical mention in ``word``.

    The word is wrapped in a ``Document`` and the result is the truthiness
    of its ``cems`` attribute.
    """
    return bool(Document(word).cems)

In [21]:
# Spot check of if_material() on the token 'PDRC'; the recorded output is True.
# NOTE(review): confirm whether 'PDRC' should count as a material here.
print(if_material('PDRC'))

True
