In [11]:
import os
import bibtexparser 
import pandas as pd
import numpy as np
import yaml

In [8]:
# parametrsos globais
entrada = "input/"
saida = "output/"

# criando um lista com as colunas do escopo do projeto
col_types = [["author"],["title"],["keywords"],["abstract"],["year"],
                 ["type_publication","ENTRYTYPE"],["doi"]]

In [None]:
# carregando os arquivos da pasta entrada
dir_list = os.listdir(entrada)
files = []
for file in dir_list:
    
    if '.BIB' not in file.upper():
        continue
        
    files.append(file)

In [9]:
## fazer a leitura de todos os arquivos
# criando um DF em branco para armazenar todos os arquivos
df_final = pd.DataFrame()

for file in files:
    
    ################################################################
    # iniciando a leitura do arquivo
    with open(entrada + file, encoding = "utf_8" ) as bibtex_file:
        
        bib_database = bibtexparser.load(bibtex_file)
        
        # convertendo o conteudo do arquivo para DF
        df = pd.DataFrame(bib_database.entries)
    ###############################################

    ################################################################################
    ## padronizando colunas
    # criando DF em branco para receber as colunas selecionadas do arquivo da vez
    new_df = pd.DataFrame()

    # interar lista de todos os tipos de colunas
    for col_type in col_types:

        # interar uma lista de um tipo de coluna apenas
        for col in col_type:

            # verificar se um item do tipo existe no DF
            if col in df.columns:
                # se existir criar um nova coluna no new_DF com o rótulo do primeiro item da lista do tipo da coluna
                new_df[col_type[0]] = df[col]
                break
            else:
                # se a coluna não existir então verificar se é a última coluna da lista do tipo
                # isso significa que não existe a coluna no DF, então criar um coluna com o rótulo do primeiro item 
                # com conteúdo em branco
                if col == col_type[-1]:
                    new_df[col_type[0]] = np.nan

    new_df["file"] = file
    ############################################
        
    # concatenando df 
    df_final = pd.concat([df_final, new_df]) 



In [10]:
# carregando a configuração do formato para exportação
with open('config/config.yaml', 'r') as config:
    formato = yaml.load(config, Loader=yaml.FullLoader)["export_to"]

Unnamed: 0,author,title,keywords,abstract,year,type_publication,doi,file
0,Jorge Merino and Ismael Caballero and Bibiano ...,A Data Quality in Use model for Big Data,"Data Quality, Big Data, Measurement, Quality-i...","Beyond the hype of Big Data, something within ...",2016,article,https://doi.org/10.1016/j.future.2015.11.024,sdFull.bib
1,Yuri A.W. Shardt and Xu Yang and Kevin Brooks ...,Data Quality Assessment for System Identificat...,"data quality assessment, system identification...",As the amount of data stored from industrial p...,2020,article,https://doi.org/10.1016/j.ifacol.2020.12.103,sdFull.bib
2,Victor O.K. Li and Jacqueline C.K. Lam and Yan...,A Big Data and Artificial Intelligence Framewo...,"Air Pollution Monitoring, Health Management, A...",All people in the world are entitled to enjoy ...,2021,article,https://doi.org/10.1016/j.envsci.2021.06.011,sdFull.bib
3,Yi-Chia Lee and Ying-Ting Chao and Pei-Ju Lin ...,Quality assurance of integrative big data for ...,"Big data, Electronic health record, Evidence b...",Background\nThe need is growing to create medi...,2022,article,https://doi.org/10.1016/j.jfma.2021.12.024,sdFull.bib
4,Conor John Cremin and Sabyasachi Dash and Xiao...,Big data: Historic advances and emerging trend...,"Big data, Big data in biomedicine, Data analyt...",Big data is transforming biomedical research b...,2022,article,https://doi.org/10.1016/j.crbiot.2022.02.004,sdFull.bib
...,...,...,...,...,...,...,...,...
95,"Rathore, Purva and Shukla, Deepak",Analysis and performance improvement of K-mean...,Image recognition;Data visualization;Breast;Im...,The big data environment is used to support th...,2015,inproceedings,10.1109/ICCN.2015.9,ieee_full.bib
96,"Qi, Cui and Mingyue, Sun and Na, Mi and Hongga...",Regional Electricity Sales Forecasting Researc...,Recurrent neural networks;Power supplies;Time ...,Regional monthly electricity sales forecast is...,2020,inproceedings,10.1109/ICECE51594.2020.9352886,ieee_full.bib
97,"Adnan, Kiran and Akbar, Rehan and Wang, Khor Siak",Towards Improved Data Analytics Through Usabil...,Bridges;Analytical models;Data analysis;Data i...,A high volume of unstructured data is being ge...,2021,inproceedings,10.1109/ICCOINS49721.2021.9497187,ieee_full.bib
98,"Serhani, Mohamed Adel and El Kassabi, Hadeel T...",Quality Profile-Based Cloud Service Selection ...,Big Data;Quality of service;Task analysis;Clou...,Big data has emerged as promising technology t...,2017,inproceedings,10.1109/SC2.2017.30,ieee_full.bib
