In [None]:
import pandas as pd
# Columns to load from the exported CSV; every other column in the file is
# skipped by `usecols`.
use = [
    'PT', 'AU', 'TI', 'SO', 'J9', 'DT', 'DE', 'ID', 'AB', 'CR',
    'NCR', 'TC', 'BP', 'EP', 'PY', 'DI', 'UT', 'LCS', 'LCR',
]
# keep_default_na=False leaves empty cells as '' instead of converting them
# to NaN, so string methods can be applied to every cell later on.
dtfm = pd.read_csv(
    'Data/20210825_LCS100+allBIYC.csv',
    usecols=use,
    header=0,
    index_col=False,
    keep_default_na=False,
)

In [None]:
# customized extraction from WoS txt to lst
import re

def extract_infos_by_readlines(f):
    '''
    Usage: collect selected tagged fields from a WoS plain-text export, line by line;

    A tagged line starts with a two-letter tag plus a space ('AU ', 'CR ', ...);
    a line starting with three spaces continues the AU/CR field opened just before it.

    :param f:
        seekable text file object; it is rewound with f.seek(0) before reading;
    :return:
        tuple (lst_AU, lst_J9, lst_CR, lst_PY, lst_SO, lst_DI):
        lst_AU / lst_CR hold one list of lines per AU / CR field,
        lst_J9 holds [record_number, value_or_None] pairs,
        lst_PY / lst_SO / lst_DI hold the text of each PY / SO / DI line;
    '''
    f.seek(0)
    lst_AU, lst_J9, lst_CR, lst_PY, lst_SO, lst_DI = [], [], [], [], [], []
    # count_SO = number of SO lines seen so far; count_J9 = number the next J9
    # entry will get. SO is counted when J9 is reached, so for a complete
    # record both are equal at its J9 line.
    count_SO, count_J9 = 0, 1
    open_field = None  # list currently receiving continuation lines, if any
    for raw in f:
        # rstrip instead of [:-1]: the last line may have no trailing newline
        line = raw.rstrip('\r\n')
        if open_field is not None and line.startswith('   '):
            open_field.append(line[3:])
            continue
        # Any other line closes the multi-line field and is then dispatched
        # normally, so a tag directly following an AU/CR block is not lost.
        open_field = None
        tag, value = line[:3], line[3:]
        if tag == 'AU ':
            open_field = [value]
            lst_AU.append(open_field)
        elif tag == 'CR ':
            open_field = [value]
            lst_CR.append(open_field)
        elif tag == 'J9 ':
            # Every SO without a J9 since the previous J9 gets a None
            # placeholder, however many records in a row lacked one.
            while count_J9 < count_SO:
                lst_J9.append([count_J9, None])
                count_J9 += 1
            lst_J9.append([count_J9, value])
            count_J9 += 1
        elif tag == 'PY ':
            lst_PY.append(value)
        elif tag == 'SO ':
            lst_SO.append(value)
            count_SO += 1
        elif tag == 'DI ':
            lst_DI.append(value)
    # Records at the end of the file that had an SO but no J9.
    while count_J9 <= count_SO:
        lst_J9.append([count_J9, None])
        count_J9 += 1
    return lst_AU, lst_J9, lst_CR, lst_PY, lst_SO, lst_DI

def _first_match(pattern, text, head=0, tail=0, flags=0):
    '''Return the first match of pattern in text with head/tail characters trimmed, or None if there is no match.'''
    found = re.search(pattern, text, flags=flags)
    if found is None:
        return None
    hit = found.group(0)
    return hit[head:len(hit) - tail]


def extract_infos_by_re_search(f):
    '''
    Usage: split a WoS plain-text export into records and pull fields out with regular expressions;

    :param f:
        seekable text file object; it is rewound with f.seek(0) and read completely;
    :return:
        list with one [au, jn, cr, py, di] entry per record:
        au - text from the start of the record up to the 'AF' line (tag not included),
        jn - the 'SO ' line together with the line that follows it,
        cr - the 'CR ' field up to (not including) the 'NR' line,
        py - the 'PY ' line,
        di - the 'DI ' line;
        tags are kept in jn/cr/py/di; a field that is absent from a record is None;
    '''
    f.seek(0)
    text = f.read()
    # Records are cut at every line starting with 'AU '; the chunk before the
    # first one is the file header and is dropped. The split consumes the
    # 'AU ' tag itself, so each record starts with the first author.
    papers = re.split(r'\nAU ', string=text)[1:]
    lst_titles = []
    for paper in papers:
        au = _first_match(r'^.*\nAF', paper, tail=3, flags=re.S)
        jn = _first_match(r'\nSO .*?\n.*?\n', paper, head=1, tail=1)
        # Anchored at line starts and non-greedy: stops at the NR tag itself
        # rather than at the last "NR" occurring anywhere later in the record.
        cr = _first_match(r'\nCR .*?(?=\nNR\b)', paper, head=1, flags=re.S)
        py = _first_match(r'\nPY .*', paper, head=1)
        di = _first_match(r'\nDI .*', paper, head=1)
        lst_titles.append([au, jn, cr, py, di])
    return lst_titles


# Forward slashes work on both Windows and POSIX and avoid the invalid '\C' /
# '\D' escape sequences a backslash-separated literal would contain.
# The `with` block closes the file once both extraction passes are done; each
# extractor rewinds the file itself, so the same handle can be reused.
with open('Data/CiteAnalysisData/Della_2018.txt', 'rt', encoding='UTF_8', newline=None) as f:
    lst_titles = extract_infos_by_re_search(f)
    lst_AU, lst_J9, lst_CR, lst_PY, lst_SO, lst_DI = extract_infos_by_readlines(f)

In [1]:
def formating_from_csv_to_md(csv_path: str, use_cols: list, templ:str):
    '''
    Usage: extract information from csv and write one markdown file per row according to the given template;

    :param csv_path:
        path of the csv file to read;
    :param use_cols:
        columns to load from the csv; the first ten are passed, in order, to
        the template as fields {0}..{9};
        should include all information that you intend to show in the markdown;
        should always include 'AU', 'PY', 'J9', 'CR' (put 'J9', 'CR' at the end of the list);
    :param templ:
        str.format template; field {10} receives the cited references,
        one '[[...]]' link per line (empty string when the row has none);
    :return:
        (lst_output, dtfm): list of (filename, filecontent) tuples and the
        DataFrame read from csv_path;
    :raises FileExistsError:
        when a file with the same name already exists (files are opened in
        exclusive-creation mode 'x', nothing is overwritten);

    Notes: Output path is assigned to: './Projects/' (created if missing)
    '''

    import os
    import re
    import pandas as pd

    dtfm = pd.read_csv(
        csv_path,
        header=0, index_col=False, usecols=use_cols, keep_default_na=False
    )

    lst_output = []
    for row in range(dtfm.shape[0]):
        record = dtfm.loc[row]  # fetch the row once instead of once per field

        # file name: "<first author without commas>, <year>, <journal abbrev>.md"
        first_author = record['AU'].split('; ')[0].replace(',', '')
        filename = first_author + ', ' + str(record['PY']) + ', ' + record['J9'] + '.md'

        links = []
        for each_CR in record['CR'].split('; '):
            cleaned = re.sub(r'[\[\]]+', repl='', string=each_CR)
            # drop volume / page parts ("V12", "P345") and the DOI part so the
            # link text has the same shape as the file names built above
            kept = [
                part for part in cleaned.split(', ')
                if not re.match(r'^[PV][\d]*$|^DOI ', part)
            ]
            joined = ', '.join(kept)
            if joined:  # an empty CR cell must not produce a bogus '[[]]' link
                links.append('[[' + joined + ']]')
        output_CR = '\n'.join(links)

        # indexing use_cols[0..9] keeps the IndexError for lists that are too short
        fields = [record[use_cols[i]] for i in range(10)]
        filecontent = templ.format(*fields, output_CR)

        lst_output.append((filename, filecontent))

    # save files:
    os.makedirs('./Projects', exist_ok=True)
    for filename, filecontent in lst_output:
        with open(f'./Projects/{filename}', 'xt', encoding='UTF_8') as f:
            f.write(filecontent)

    return lst_output, dtfm





# Markdown layout of one note: fields {0}..{9} receive the first ten columns
# of `use` in order, {10} receives the block of cited-reference links.
template = '''---
Show:off
---
**Title**: {0}
**Authors**: {1}
**PubYear**: #PY{2}
**DocType**: {3}
**Journal**: {4}
> **Abstract**:
> {5}

**AuthKW**: {6}
**Keywords+**: {7}
**DOI**: {8}
**WoSNo**: {9}

#### CitedRefs:
{10}'''

# Column order matters: it decides which template slot each column fills.
# 'J9' and 'CR' stay at the end because they are not shown directly.
use = [
    'TI', 'AU', 'PY', 'DT', 'SO', 'AB', 'DE', 'ID', 'DI', 'UT',
    'J9', 'CR',
]
path = 'Data/20210825_LCS100+allBIYC.csv'

# cklst: (filename, filecontent) pairs that were written; dtfm: the loaded frame.
cklst, dtfm = formating_from_csv_to_md(path, use, template)

In [None]:
# prototype of formatting_from_csv...
import re

# Work on a copy so that experiments in this cell cannot alter `dtfm`.
test_dtfm = dtfm.copy()


view_CR = test_dtfm['CR']
views = test_dtfm[['AU', 'J9', 'PY']]

lst_filename = []
lst_CR = []
lst_filecontent = []

for row in range(test_dtfm.shape[0]):
    if not view_CR[row] == '':
        templst_1 = []
        for each_CR in view_CR[row].split('; '):
            # strip square brackets so they cannot break the [[...]] link syntax
            temp_CR = re.sub(r'[\[\]]+', repl='', string=each_CR)
            templst_2 = []
            # split the bracket-free text (temp_CR), not the raw reference
            for part in temp_CR.split(', '):
                # drop volume / page parts ("V12", "P345") and the DOI part
                if not re.match(r'^[PV][\d]*$|^DOI ', part):
                    templst_2.append(part)
            templst_1.append('[[' + ', '.join(templst_2) + ']]')
        lst_CR.append('\n'.join(templst_1))
    else:
        lst_CR.append(None) # substitute '' with None

    # first author without commas, followed by the separator used in file names
    au = views.loc[row]['AU'].split('; ')[0].replace(',', '') + ', '
    lst_filename.append(au + str(views.loc[row]['PY']) +', '+ views.loc[row]['J9'] +'.md')

    lst_filecontent.append(
f'''
---
Show: off
title: {au + str(views.loc[row]['PY']) +', '+ views.loc[row]['J9']}
---
Title:: {dtfm.loc[row]['TI']}
Authors:: {dtfm.loc[row]['AU']}
PubYear:: #PY{dtfm.loc[row]['PY']}
Journal:: {dtfm.loc[row]['SO']}
DOI:: {dtfm.loc[row]['DI']}
Keywords_I:: {dtfm.loc[row]['DE']}
Keywords_II:: {dtfm.loc[row]['ID']}
Abstract:: {dtfm.loc[row]['AB']}
CitedRefs:
{lst_CR[row]}
'''
    )

In [None]:
# Old version: Formatting info lists
# First author of every record, commas removed, followed by the ', ' separator.
lst_resAU = [authors[0].replace(',', '') + ', ' for authors in lst_AU]

# Keep the [number, value] pairing; a falsy abbreviation is stored as None.
lst_resJ9 = [
    [pair[0], pair[1] + ', ' if pair[1] else None]
    for pair in lst_J9
]

# Where the abbreviation is missing, fall back to the SO line at the same position.
for pos, pair in enumerate(lst_resJ9):
    if not pair[1] and lst_SO[pos]:
        pair[1] = lst_SO[pos] + ', '

lst_resPY = [year + ', ' for year in lst_PY]

# Runs of characters that are invalid in file names (and whitespace) become '_'.
lst_resDI = [
    'DOI_' + re.sub(r'[\\\/:*?"<>|\s]+', '_', doi)
    for doi in lst_DI
]

# Formating for File Saving
# Each citation string is split on ', ' so unwanted parts can be dropped:
# parts matching ^[PV][\d]*$ are removed, DOI parts get their invalid
# characters replaced, and the remainder is re-joined into one [[...]] link;
# the links of one record are joined with '\n'.
lst_resCR = []
for refs in lst_CR:
    links = []
    for ref in refs:
        kept = [
            re.sub(r'[\\\/:*?"<>|\s]+', '_', piece) if re.match(r'^DOI ', piece) else piece
            for piece in ref.split(', ')
            if not re.match(r'^[PV][\d]*$', piece)
        ]
        links.append('[[' + ', '.join(kept) + ']]')
    lst_resCR.append('\n'.join(links))

# File name = author + year + journal + DOI (all lists are indexed in parallel).
lst_filename = [
    lst_resAU[k] + lst_resPY[k] + lst_resJ9[k][1] + lst_resDI[k]
    for k in range(len(lst_resAU))
]

# All author lines of a record, one per line, for the file body.
lst_file_AU = ['\n'.join(authors) for authors in lst_AU]

# NOTE(review): every lst_resDI entry starts with 'DOI_' and contains no
# whitespace, so .replace('DOI ','') below never matches anything - confirm
# whether the prefix was meant to be stripped here.
lst_filecontent = [
    'Auther:\n' + lst_file_AU[k]
    + '\nPublish Year:\n' + lst_resPY[k]
    + '\nJournal:\n' + lst_resJ9[k][1]
    + '\nDOI:\n' + lst_resDI[k].replace('DOI ','')
    + '\nCitation:\n' + lst_resCR[k]
    for k in range(len(lst_resAU))
]

In [None]:
# save files:

# Mode 'xt' creates each file exclusively: an already existing file raises
# FileExistsError instead of being overwritten. The `with` block closes the
# file on exit.
for idx, name in enumerate(lst_filename):
    with open(f'./Projects/{name}', 'xt', encoding='UTF_8') as f:
        f.write(lst_filecontent[idx])