In [1]:
import pandas as pd 
import operations as op
import specific_operations as sop
import yaml

# Directory passed to the export step at the end of the notebook.
OUTPUT_FOLDER = './output'

# Input files, resolved relative to the notebook's working directory:
# two journal-metric CSVs and one JSON dump per publication source.
JCS_FILEPATH = "./../data/jcs_2020.csv"
SCIMAGO_FILEPATH = "./../data/scimagojr 2020.csv"
ACM_JSON_FILEPATH = "./../data/acm.json"
IEE_JSON_FILEPATH = "./../data/iee.json"
SCIENCE_DIRECT_JSON_FILEPATH = "./../data/science_direct.json"


# Journal-metric tables. Both files are semicolon-separated; low_memory=False
# turns off pandas' chunked parsing so each column gets a single inferred dtype.
JCS_DF: pd.DataFrame = (
    pd.read_csv(JCS_FILEPATH, sep=";", low_memory=False)
    .convert_dtypes()
)
SCIMAGO_DF: pd.DataFrame = (
    pd.read_csv(SCIMAGO_FILEPATH, sep=";", low_memory=False)
    .convert_dtypes()
)

# Publication records, one frame per source, converted to pandas' nullable dtypes.
ACM_DF: pd.DataFrame = (
    pd.read_json(ACM_JSON_FILEPATH)
    .convert_dtypes()
)
IEE_DF: pd.DataFrame = (
    pd.read_json(IEE_JSON_FILEPATH)
    .convert_dtypes()
)
SCIENCE_DIRECT_DF: pd.DataFrame = (
    pd.read_json(SCIENCE_DIRECT_JSON_FILEPATH)
    .convert_dtypes()
)


In [3]:
def get_config():
    """Load the notebook configuration from ``./config.yaml``.

    The file is parsed with PyYAML's safe loader, so only plain YAML data
    (no arbitrary Python objects) is constructed.

    Returns:
        The parsed YAML document, or ``None`` when the file cannot be opened,
        decoded or parsed. In the failure case a message is printed instead of
        raising, so the caller must be prepared for ``None``. An empty YAML
        file also parses to ``None``.
    """
    try:
        # The context manager closes the handle even if parsing fails.
        # YAML documents are expected to be UTF-8, so do not depend on the
        # platform's default encoding.
        with open("./config.yaml", encoding="utf-8") as config_file:
            return yaml.safe_load(config_file)
    except (OSError, UnicodeDecodeError):
        # Missing/unreadable file or bytes that are not valid UTF-8.
        print("Não foi possível ler o arquivo config.yaml")
    except yaml.YAMLError:
        # The file was read but is not well-formed YAML.
        print("Não foi possível ler o arquivo config.yaml")
    return None


# Loaded once at cell execution; None if the configuration could not be read.
CONFIG = get_config()

# Scimago table restricted to the journal title and its 3-year citation count.
# The `op.*` helpers live in the project's `operations` module; judging by their
# names they select, rename, derive and cast columns -- TODO confirm.
scimago_df = (
    SCIMAGO_DF
    .pipe(op.map, ['Title', 'Total Cites (3years)'])
    .pipe(op.rename_cols, columns={'Total Cites (3years)': 'Total Cites Scimago'})
    # Upper-cased, stripped title: the key used to join with the JCS table.
    .pipe(
        op.apply_to_every_row,
        'Upper Title',
        lambda row: row["Title"].upper().strip(),
    )
    .pipe(op.convert_type, 'Total Cites Scimago', 'float64')
)


# JCS table restricted to the journal title and its total citation count,
# with column names aligned to the Scimago frame.
jcs_df = (
    JCS_DF
    .pipe(op.map, ["Full Journal Title", "Total Cites"])
    .pipe(
        op.rename_cols,
        columns={
            "Full Journal Title": "Title",
            "Total Cites": "Total Cites JCS",
        },
    )
    # Upper-cased, stripped title: the key used to join with the Scimago table.
    .pipe(
        op.apply_to_every_row,
        'Upper Title',
        lambda row: row["Title"].upper().strip(),
    )
    # Cast to text, drop the commas from the counts, then cast to float.
    .pipe(op.convert_type, "Total Cites JCS", 'str')
    .pipe(
        op.apply_to_every_row,
        "Total Cites JCS",
        lambda row: row["Total Cites JCS"].replace(',', ''),
    )
    .pipe(op.convert_type, "Total Cites JCS", 'float64')
)
          

# Combined citation metrics: outer join of Scimago and JCS on the normalised
# title, so journals present in only one source are kept.
cites_df = (
    scimago_df
    .pipe(
        op.join,
        df_right=jcs_df,
        left_on=["Upper Title"],
        right_on=["Upper Title"],
        how='outer',
    )
    # Prefer the JCS spelling of the title (Title_y); fall back to Scimago's
    # (Title_x) when the journal has no JCS row.
    .pipe(
        op.apply_to_every_row,
        'Title',
        lambda row: row['Title_x'] if pd.isnull(row['Title_y']) else row['Title_y'],
    )
    .pipe(op.map, ["Title", 'Upper Title', 'Total Cites JCS', 'Total Cites Scimago'])
    .pipe(op.distinct)
    # Disabled step -- TODO: check with the professor whether this is correct.
    #.pipe(op.create_column, "year", "int", 2020)
    .pipe(
        op.rename_cols,
        {
            "Title": "title",
            "Upper Title": "upper_title",
            "Total Cites JCS": "jcs_value",
            "Total Cites Scimago": "scimago_value",
        },
    )
)

# Publication records from the three sources, enriched with the citation
# metrics, then filtered and exported by the project-specific helpers.
#
# NOTE(review): the join key below is built from each record's own `title`,
# whereas `cites_df.upper_title` is derived from journal titles (Scimago
# "Title" / JCS "Full Journal Title"). If `title` holds the article title,
# matches will be rare -- the rendered output shows NaN metrics on every
# visible row. Confirm whether a journal/venue column should be the key.
bibtexs_df = (
    pd.concat([SCIENCE_DIRECT_DF, ACM_DF, IEE_DF])
    .pipe(op.distinct)
    .pipe(
        op.apply_to_every_row,
        'upper_title',
        lambda row: row["title"].upper().strip(),
    )
    # Left join: every publication is kept, with or without metrics.
    .pipe(
        op.join,
        df_right=cites_df,
        left_on=["upper_title"],
        right_on=["upper_title"],
        how="left",
    )
    # Disabled filter: returns only the records that matched in the join
    # between the bibtex and csv files.
    #.pipe(op.query, "not title_y.isnull()")
    .pipe(
        op.map,
        [
            "author",
            "title_x",
            "keywords",
            "abstract",
            "year",
            "type_publication",
            "doi",
            "jcs_value",
            "scimago_value",
        ],
    )
    .pipe(op.rename_cols, {"title_x": "title"})
    .pipe(sop.filter_column_by_regex, CONFIG)
    .pipe(sop.export_dataframe, CONFIG, OUTPUT_FOLDER)
)


# Last expression of the cell: render the resulting frame.
bibtexs_df


Unnamed: 0,author,title,keywords,abstract,year,type_publication,doi,jcs_value,scimago_value
0,Jorge Merino and Ismael Caballero and Bibiano ...,A Data Quality in Use model for Big Data,"Data Quality, Big Data, Measurement, Quality-i...","Beyond the hype of Big Data, something within ...",2016,article,https://doi.org/10.1016/j.future.2015.11.024,,
1,Simon Vydra and Bram Klievink,Techno-optimism and policy-pessimism in the pu...,"Big data, Analytics, Government, Public admini...","Despite great potential, high hopes and big pr...",2019,article,https://doi.org/10.1016/j.giq.2019.05.010,,
2,Yuri A.W. Shardt and Xu Yang and Kevin Brooks ...,Data Quality Assessment for System Identificat...,"data quality assessment, system identification...",As the amount of data stored from industrial p...,2020,article,https://doi.org/10.1016/j.ifacol.2020.12.103,,
3,Jia Liu and Tianrui Li and Peng Xie and Shengd...,Urban big data fusion based on deep learning: ...,"Urban computing, Big data, Data fusion, Deep l...",Urban big data fusion creates huge values for ...,2020,article,https://doi.org/10.1016/j.inffus.2019.06.016,,
4,Gianluca Elia and Gloria Polimeno and Gianluca...,A multi-dimension framework for value creation...,"Big Data analytics, Cognitive computing, Frame...",Big Data represents a promising area for value...,2020,article,https://doi.org/10.1016/j.indmarman.2020.03.015,,
...,...,...,...,...,...,...,...,...,...
245,"Lincy, S.S. Blessy Trencia and Kumar, N. Suresh",An enhanced pre-processing model for big data ...,Big Data;Data models;Algorithm design and anal...,With the ever growing trends and technologies ...,2017,inproceedings,10.1109/IGEHT.2017.8094109,,
246,"Wang, Wenjing and Yang, Shengquan",Research on Air Quality Forecasting Based on B...,Air quality;Predictive models;Atmospheric mode...,Aiming at the problem that existing air qualit...,2020,inproceedings,10.1109/ICCNEA50255.2020.00045,,
247,"Wang, Songyun and Yuan, Jiabin and Li, Xin and...",Active Data Replica Recovery for Quality-Assur...,Nonvolatile memory;Quality of service;Data ana...,QoS-aware big data analysis is critical in Inf...,2019,article,10.1109/ACCESS.2019.2932259,,
248,"Huimin, Li and Guomin, Song",Research on the Teaching Reform of Finance and...,Training;Technological innovation;Cloud comput...,With the development of information and intell...,2020,inproceedings,10.1109/ICBASE51474.2020.00023,,
