## Get full texts and abstracts

In [14]:
# Parse the XML files into a dict of per-article metadata
import os
from chemdataextractor import Document
from chemdataextractor.scrape import Selector
from chemdataextractor.reader.markup import XmlReader
from chemdataextractor.doc import Document, Title, Heading, Paragraph, Citation, Table, Figure, Caption, Footnote
import xml.etree.ElementTree as ET
from tqdm import tqdm

# Build xml_dict: one record per article XML file in 'xml_files/', keyed by
# the article id (the DOI when the file declares one).
xml_list = os.listdir('xml_files')
xml_dict = {}
for x in tqdm(xml_list):
    xml_path = os.path.join('xml_files', x)

    # --- Full-text structure, read through ChemDataExtractor ---------------
    # The context manager closes the handle; the original leaked one open
    # file per article.
    with open(xml_path, 'rb') as f:
        doc = Document.from_file(f)

    # Reset per file so a document without a Title element neither inherits
    # the previous article's title nor raises NameError on the first file.
    str_title = ''
    headings = []
    heading_idx = []
    paras = []
    para_idx = []
    for i, e in enumerate(doc.elements):
        # Exact type checks are kept on purpose (same matching as before).
        if type(e) is Title:
            str_title = str(e).replace('\n', ' ')
        elif type(e) is Heading:
            headings.append(str(e).replace('\n', ' '))
            heading_idx.append(i)
        elif type(e) is Paragraph:
            str_para = str(e).replace('\n', ' ')
            # Keep only paragraphs with more than 50 space-separated tokens.
            if len(str_para.split(' ')) > 50:
                paras.append(str_para)
                para_idx.append(i)

    # --- Metadata, read directly from the XML tree ---------------------------
    tree = ET.parse(xml_path)

    # Prefer the article-id explicitly typed as a DOI; fall back to the first
    # article-id (the original behaviour) when no typed DOI is present.
    id_node = tree.find("./front/article-meta/article-id[@pub-id-type='doi']")
    if id_node is None:
        id_node = tree.find('./front/article-meta/article-id')
    if id_node is None or not id_node.text:
        tqdm.write('skipping ' + x + ': no article-id found')
        continue
    x_doi = id_node.text
    record = {}

    # Abstract: join every <p> child instead of keeping only the last one.
    ab_find = tree.find('./front/article-meta/abstract')
    if ab_find is not None:
        ab_paras = [
            ''.join(node.itertext()).replace('\n', ' ')
            for node in ab_find
            if node.tag == 'p'
        ]
        if ab_paras:
            record['abstract'] = ' '.join(ab_paras)

    # Received date: the third and second children of the 'received' entry,
    # in that order.
    # NOTE(review): presumably [year, month] given a <day><month><year>
    # layout -- confirm against the XML schema in use.
    time = []
    history = tree.find('./front/article-meta/history')
    if history is not None:
        for node in history:
            if node.get('date-type') == 'received':
                time.append(int(node[2].text))
                time.append(int(node[1].text))
    record['time'] = time

    # Authors: each name is the text of every child of <name>, each followed
    # by a single space (trailing space kept for output compatibility).
    authors = []
    contrib_group = tree.find('./front/article-meta/contrib-group')
    if contrib_group is not None:
        for node in contrib_group:
            if node.get('contrib-type') == 'author':
                name_find = node.find('name')
                if name_find is not None:
                    # 'n.text or ""' guards against empty name parts.
                    authors.append(
                        ''.join((n.text or '') + ' ' for n in name_find)
                    )
    if authors:
        record['authors'] = authors

    # Keywords: stored (possibly as an empty list) whenever a kwd-group exists.
    key_find = tree.find('./front/article-meta/kwd-group')
    if key_find is not None:
        record['keywords'] = [
            ''.join(node.itertext()).replace('\n', ' ')
            for node in key_find
            if node.tag == 'kwd'
        ]

    # Main title
    record['title'] = str_title
    # Section headings and their element indices within the document
    record['headings'] = headings
    record['heading_idx'] = heading_idx
    # Long paragraphs and their element indices within the document
    record['paras'] = paras
    record['para_idx'] = para_idx

    xml_dict[x_doi] = record

# key=doi; always present: 'title', 'headings', 'heading_idx', 'paras',
# 'para_idx', 'time'; optional: 'abstract', 'authors', 'keywords'

100%|██████████| 10321/10321 [13:00<00:00, 13.22it/s] 


In [3]:
import json
# Load the parsed full-text records and the abstract-only corpus, then drop
# from the abstract corpus every entry whose key also exists in xml_dict.
with open('xml_dict.json', 'r', encoding='utf-8') as xml_file:
    xml_dict = json.load(xml_file)
with open('passivating_abstract.json', 'r', encoding='utf-8') as abstract_file:
    abstract_dict = json.load(abstract_file)

print(len(abstract_dict.keys()))
for shared_key in xml_dict:
    abstract_dict.pop(shared_key, None)
print(len(abstract_dict.keys()))

# All abstracts: those found in the XML records first, followed by the
# remaining abstract-only entries.
all_abstract = [
    record['abstract'] for record in xml_dict.values() if 'abstract' in record
]
all_abstract.extend(abstract_dict.values())
print(len(all_abstract))

89046
86614
96850


## Dictionary for Normalization

In [8]:
# Run over all abstracts to build a dictionary that maps each compound's full name to its abbreviations: key=full_name, values=[abbs]
from chemdataextractor import Document
from tqdm import tqdm

# abb_dict maps a full name to the list of distinct abbreviations that were
# defined for it anywhere in the abstracts.
abb_dict = {}
for abstract in tqdm(all_abstract):
    for definition in Document(abstract).abbreviation_definitions:
        # Only definitions whose last field is 'CM' are kept.
        if definition[-1] != 'CM':
            continue
        abb_name = ' '.join(definition[0])
        # Tokens are re-joined with spaces; undo the spacing around hyphens.
        quan_name = ' '.join(definition[1]).replace(' - ', '-')
        known_abbs = abb_dict.setdefault(quan_name, [])
        if abb_name not in known_abbs:
            known_abbs.append(abb_name)
# The result of this step is saved as JSON

100%|██████████| 96850/96850 [54:43<00:00, 29.50it/s]  


In [17]:
with open('abb_dict.json', 'r', encoding='utf-8') as f:
    abb_dict = json.load(f)

# Merge entries that share an abbreviation into groups.
# pair_list = [group, ...], each group = {'full': [full names], 'abb': [abbs]}
#
# Snapshot keys and values once; the original rebuilt list(abb_dict.keys())
# inside the innermost loop. abb_lists holds the SAME list objects as
# abb_dict, so the in-place growth below is still visible through abb_dict.
full_names = list(abb_dict)
abb_lists = list(abb_dict.values())

pair_list = []
already = set()  # indices of full names already assigned to a group
for i, full in enumerate(full_names):
    if i in already:
        continue
    already.add(i)
    full_list = [full]
    # Deliberate aliasing: abb_list IS abb_dict[full]. Abbreviations appended
    # while iterating are visited by the same loop, so the list acts as a
    # worklist. The grown list also stays in abb_dict afterwards.
    abb_list = abb_lists[i]
    for abb in abb_list:
        for j, value in enumerate(abb_lists):
            if j == i or abb not in value:
                continue
            # A full name that shares several abbreviations with this group
            # used to be appended once per shared abbreviation; add it once.
            if full_names[j] not in full_list:
                full_list.append(full_names[j])
            for v in value:
                if v not in abb_list:
                    abb_list.append(v)
            already.add(j)
    tmp_dict = {'full': full_list, 'abb': abb_list}
    pair_list.append(tmp_dict)

In [25]:
# For every group choose the normalized abbreviation 'max_abb': the
# abbreviation that occurs in the largest number of abb_dict value lists
# (ties go to the one listed first). Groups with a single abbreviation use it.
for group in pair_list:
    candidates = group['abb']
    if len(candidates) == 1:
        group['max_abb'] = candidates[0]
        continue
    usage = {}
    for candidate in candidates:
        usage[candidate] = sum(
            1 for abb_values in abb_dict.values() if candidate in abb_values
        )
    ranked = sorted(usage.items(), key=lambda item: item[1], reverse=True)
    group['max_abb'] = ranked[0][0]

# Order groups by the length of their first full name, longest first.
pair_list.sort(key=lambda x: len(x['full'][0]), reverse=True)
# The result of this step is saved as pair_list.json

In [5]:
# normalize function
import re


def _swap_token(text, token, replacement):
    """Replace every space-delimited occurrence of ``token`` in ``text``.

    An occurrence counts when each side is either a space or the edge of the
    text. Plain ``text.replace(' ' + token + ' ', ...)`` misses a token at the
    very start or end of the text, and misses the second of two adjacent
    occurrences because they share a single separating space.
    """
    # Cheap substring test first: most tokens are absent from most texts,
    # so the regex is only built when it can actually match.
    if not token or token not in text:
        return text
    pattern = r'(?<![^ ])' + re.escape(token) + r'(?![^ ])'
    # A callable replacement keeps backslashes in ``replacement`` literal.
    return re.sub(pattern, lambda _match: replacement, text)


def normalize(text, pairs):
    """Rewrite compound names in ``text`` to one abbreviation per group.

    Args:
        text: The raw text to normalize.
        pairs: Iterable of dicts with keys ``'full'`` (list of full names),
            ``'abb'`` (list of abbreviations) and ``'max_abb'`` (the
            abbreviation every name in the group is rewritten to). Groups are
            applied in the order given.

    Returns:
        The text in which, for every group, each space-delimited full name and
        each space-delimited non-canonical abbreviation is replaced by
        ``max_abb``, and each parenthesised abbreviation together with its
        leading space (``' (ABB)'``) is removed.
    """
    for d in pairs:
        canonical = d['max_abb']
        for full in d['full']:
            text = _swap_token(text, full, canonical)

        for abb in d['abb']:
            # Drop inline definitions such as "full name (ABB)".
            text = text.replace(' (' + abb + ')', '')
            if abb != canonical:
                text = _swap_token(text, abb, canonical)
    return text

## Preprocessing

In [1]:
import nltk

def sen_seg(data):
    """Split ``data`` into sentences with ``nltk.sent_tokenize``.

    Before tokenizing, each listed abbreviation is swapped for a placeholder
    (the abbreviation minus its last two characters, plus ``'####@'``); the
    placeholder is swapped back in every resulting sentence.

    Args:
        data: The text to split.

    Returns:
        A list of sentence strings.
    """
    to_replace = ['et al. ', 'Fig. ', 'e.g. ', 'i.e. ', 'Ref. ', 'Figs. ', ' ca. ', 'approx. ', '(ca. ', 'etc.) ']
    masks = [(abbr, abbr[:-2] + '####@') for abbr in to_replace]

    for abbr, mask in masks:
        data = data.replace(abbr, mask)

    restored = []
    for sentence in nltk.sent_tokenize(data):
        for abbr, mask in masks:
            sentence = sentence.replace(mask, abbr)
        restored.append(sentence)
    return restored

In [20]:
from chemdataextractor import Document

def process_text(text, text_processor, pairs, max_abbs):
    """Normalize, sentence-split and process one text.

    Args:
        text: Raw text.
        text_processor: Object whose ``process(sentence)`` returns a pair
            ``(processed, hits)``; each hit is indexed as
            ``hit[0]`` (original form) and ``hit[1]`` (normalized form).
        pairs: Abbreviation groups passed on to ``normalize``.
        max_abbs: Abbreviations to count directly in the normalized text.

    Returns:
        ``(pro_sens, single_dict)``: the processed sentences, and a dict
        mapping each normalized name to ``{'num': count, 'ori': [originals]}``.
        Entries created from ``max_abbs`` carry only ``'num'``.
    """
    text = normalize(text, pairs)

    seen_originals = []
    processed_sentences = []
    materials = {}
    for sentence in sen_seg(text):
        processed, hits = text_processor.process(sentence)
        processed_sentences.append(processed)
        for hit in hits:
            original, normalized = hit[0], hit[1]
            if original not in seen_originals:
                seen_originals.append(original)
            entry = materials.get(normalized)
            if entry is None:
                materials[normalized] = {'num': 1, 'ori': [original]}
                continue
            entry['num'] += 1
            if original not in entry['ori']:
                entry['ori'].append(original)

    # Count space-delimited abbreviations that the processor did not report
    # as an original form; such an entry replaces any existing one.
    for abb in max_abbs:
        spaced = ' ' + abb + ' '
        if spaced in text and abb not in seen_originals:
            materials[abb] = {'num': text.count(spaced)}

    return processed_sentences, materials
    

In [22]:
from mat2vec.processing import MaterialsTextProcessor
import json
from tqdm import tqdm

with open('pair_list.json', 'r', encoding='utf-8') as f:
    pairs = json.load(f)
print(len(pairs))
# Abbreviation-normalization groups.
# NOTE(review): expected to be ordered longest full name first, as produced
# by the pair_list sorting step -- confirm the JSON was saved after sorting.

# Base dictionary of materials: the normalized abbreviation of every group.
max_abbs = [p['max_abb'] for p in pairs]

# Corpus-wide accumulators:
#   sentences: every processed sentence, in processing order
#   mat_dict:  name -> {'Y': count, 'N': count}, split by the text's flag
#   ori_dict:  name -> distinct original surface forms seen for it
sentences = []
mat_dict = {}
ori_dict = {}
text_processor = MaterialsTextProcessor()
judge_words = ['photovol', 'solar']


def _judge_flag(judge_text):
    """Return 'Y' if any judge word occurs in ``judge_text``, else 'N'."""
    return 'Y' if any(word in judge_text for word in judge_words) else 'N'


def _collect(texts, flag):
    """Process one text and fold its results into the accumulators.

    Replaces the accumulation code that was duplicated in both corpus loops.
    """
    sens, single_dict = process_text(texts, text_processor, pairs, max_abbs)
    sentences.extend(sens)
    for name, info in single_dict.items():
        counts = mat_dict.setdefault(name, {})
        counts[flag] = counts.get(flag, 0) + info['num']
        # Entries created from max_abbs carry no 'ori' list.
        if 'ori' in info:
            known = ori_dict.setdefault(name, [])
            for o in info['ori']:
                if o not in known:
                    known.append(o)


# Full-text articles: the flag is judged on the abstract (or on the title
# when there is no abstract); the processed text is all kept paragraphs
# followed by the title.
for i in tqdm(xml_dict.keys()):
    record = xml_dict[i]
    judge = record['abstract'] if 'abstract' in record else record['title']
    texts = ' '.join(record['paras']) + ' ' + record['title']
    _collect(texts, _judge_flag(judge))

# Abstract-only entries: the abstract is both judged and processed.
for i in tqdm(abstract_dict.keys()):
    texts = abstract_dict[i]
    _collect(texts, _judge_flag(texts))

4249


100%|██████████| 10320/10320 [1:25:21<00:00,  2.02it/s]
100%|██████████| 86614/86614 [36:04<00:00, 40.02it/s]  


In [23]:
# Persist the three corpus-level results as indented JSON files.
outputs = (
    (sentences, 'sentences.json'),
    (ori_dict, 'ori_dict.json'),
    (mat_dict, 'mat_dict.json'),
)
for payload, path in outputs:
    # Serialize before opening so a failed dump never truncates the file.
    json_str = json.dumps(payload, indent=4)
    with open(path, 'w', encoding='utf-8') as json_file:
        json_file.write(json_str)

## Different configuration

In [None]:
import json

# Reload the processed sentences and the material frequency table from disk.
with open('sentences.json', 'r', encoding='utf-8') as sentences_file:
    sentences = json.load(sentences_file)

with open('mat_dict.json', 'r', encoding='utf-8') as mat_file:
    mat_dict = json.load(mat_file)

In [28]:
from gensim.models import Word2Vec

# Train Word2Vec on the processed sentences with sg=1 and save the model
# under skip_vec/.
model = Word2Vec(
    sentences,
    vector_size=200,
    window=8,
    min_count=4,
    workers=4,
    sg=1,
)
model.save("skip_vec/word2vec_our.model")

In [24]:
from gensim.models import Word2Vec

# Load a previously saved Word2Vec model from the cbow_vec/ directory.
# NOTE(review): presumably trained with the default CBOW setting -- the
# training cell above saves to skip_vec/ instead; confirm which run made this.
cbow_model = Word2Vec.load('cbow_vec/word2vec_our.model')

In [25]:
# Sanity-check the embeddings: print the vocabulary size, then the
# similarity of a few hand-picked word pairs, one value per line.
word_vectors = cbow_model.wv
print(len(word_vectors.key_to_index))

probe_pairs = [
    ('conduction', 'valence'),
    ('PV', 'solar'),
    ('PV', 'photovoltaics'),
    ('contact', 'conduction'),
    ('molecular', 'orbitals'),
    ('PV', 'thermoelectric'),
    ('conduction', 'deposition'),
    ('water', 'piezoelectric'),
]
for first_word, second_word in probe_pairs:
    similarity = word_vectors.similarity(first_word, second_word)
    print(similarity)

403285
0.6740291
0.67648166
0.5696485
0.25458235
0.09250669
0.5027627
0.05856296
-0.05344326


In [46]:
# Rank the whole vocabulary against single query words and against the mean
# of several query vectors.
# topn is taken from the loaded model instead of the hard-coded 403285, so
# the rankings still cover the full vocabulary when a different model is used.
vocab_size = len(word_vectors.key_to_index)

passivation_sims = word_vectors.most_similar('passivation', topn=vocab_size)
contact_sims = word_vectors.most_similar('contact', topn=vocab_size)
conductivity_sims = word_vectors.most_similar('conductivity', topn=vocab_size)

passivation = word_vectors['passivation']
contact = word_vectors['contact']
conductivity = word_vectors['conductivity']

# Mean of two query vectors (passivation, contact)
combine_vec = (passivation + contact) / 2
combine_sims = word_vectors.most_similar(positive=[combine_vec], topn=vocab_size)

# Mean of three query vectors (passivation, contact, conductivity)
combine_vec3 = (passivation + contact + conductivity) / 3
combine_sims3 = word_vectors.most_similar(positive=[combine_vec3], topn=vocab_size)

In [48]:
import csv

# Export the ranked candidates that are known materials (present in mat_dict)
# and have a combined 'N' + 'Y' frequency of at least 3.
# Columns: running number, rank within combine_sims3, name, freq_N, freq_Y.
with open('skip_cbm3.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['num', 'rank', 'name', 'freq_N', 'freq_Y'])
    row_number = 1
    for rank, (name, *_rest) in enumerate(combine_sims3):
        if name not in mat_dict:
            continue
        frequencies = mat_dict[name]
        freq_N = frequencies.get('N', 0)
        freq_Y = frequencies.get('Y', 0)
        if freq_N + freq_Y < 3:
            continue
        writer.writerow([str(row_number), str(rank), name, str(freq_N), str(freq_Y)])
        row_number += 1