In [2]:
import pandas as pd 
import operations as op
import specific_operations as sop
import yaml

# --- Paths -------------------------------------------------------------------
# Folder passed to the export step at the end of the notebook.
OUTPUT_FOLDER = './output'

# Journal citation tables, read below as ';'-separated CSV files.
JCS_FILEPATH = "./../data/jcs_2020.csv"
SCIMAGO_FILEPATH = "./../data/scimagojr 2020.csv"

# Bibliographic records, read below as JSON (one file per source library).
ACM_JSON_FILEPATH = "./../data/acm.json"
IEE_JSON_FILEPATH = "./../data/iee.json"
SCIENCE_DIRECT_JSON_FILEPATH = "./../data/science_direct.json"


# --- Loaders -----------------------------------------------------------------
def _load_semicolon_csv(filepath):
      """Read a ';'-separated CSV file and return it after ``convert_dtypes()``."""
      raw_table = pd.read_csv(filepath, sep=";", low_memory=False)
      return raw_table.convert_dtypes()


def _load_json_table(filepath) -> pd.DataFrame:
      """Read a JSON file into a DataFrame and return it after ``convert_dtypes()``."""
      raw_table = pd.read_json(filepath)
      return raw_table.convert_dtypes()


# --- Raw inputs (never modified in place further down) -------------------------
JCS_DF = _load_semicolon_csv(JCS_FILEPATH)
SCIMAGO_DF = _load_semicolon_csv(SCIMAGO_FILEPATH)
ACM_DF: pd.DataFrame = _load_json_table(ACM_JSON_FILEPATH)
IEE_DF: pd.DataFrame = _load_json_table(IEE_JSON_FILEPATH)
SCIENCE_DIRECT_DF: pd.DataFrame = _load_json_table(SCIENCE_DIRECT_JSON_FILEPATH)


In [10]:
"""TODO
      - Confirm with the professor whether it is valid to join on the "year" field when it is hard-coded to 2020 in cites_df
"""

def get_config():
      """Load and parse ``./config.yaml``.

      Returns:
            The parsed YAML document, or ``None`` when the file cannot be
            opened, decoded, or parsed. In that case a message is printed
            and no exception is raised, so callers must handle ``None``.
      """
      try:
            # The context manager closes the file handle even if parsing fails.
            with open("./config.yaml") as config_file:
                  # safe_load parses with SafeLoader, as the original call did.
                  return yaml.safe_load(config_file)
      except (OSError, UnicodeDecodeError, yaml.YAMLError):
            # Only expected read/parse failures are handled here. Anything else
            # (KeyboardInterrupt, programming errors) propagates to the caller.
            print("Não foi possível ler o arquivo config.yaml")
            return None

# Parsed settings from ./config.yaml. get_config() prints a message and returns
# None when the file cannot be read, so CONFIG may be None at this point.
CONFIG = get_config()

def _scimago_upper_title(row):
      """Return the row's "Title" upper-cased, then stripped of surrounding whitespace."""
      return row["Title"].upper().strip()


# Scimago citation table, reduced to what the join below needs.
# NOTE(review): the op.* helpers live in the project's `operations` module and
# are not visible here; the step descriptions are inferred from their names and
# arguments and should be confirmed.
scimago_df = (
      SCIMAGO_DF
      # Presumably keeps only the title and the 3-year citation count.
      .pipe(op.map, ['Title', 'Total Cites (3years)'])
      .pipe(op.rename_cols, columns={'Total Cites (3years)': 'Total Cites Scimago'})
      # 'Upper Title' is the key used later to join with the JCS table.
      .pipe(op.apply_to_every_row, 'Upper Title', _scimago_upper_title)
      .pipe(op.convert_type, 'Total Cites Scimago', 'float64')
)


def _jcs_upper_title(row):
      """Return the row's "Title" upper-cased, then stripped of surrounding whitespace."""
      return row["Title"].upper().strip()


def _jcs_cites_without_commas(row):
      """Return the row's "Total Cites JCS" text with every ',' removed."""
      return row["Total Cites JCS"].replace(',', '')


# JCS citation table, reduced to what the join below needs.
# NOTE(review): the op.* helpers live in the project's `operations` module and
# are not visible here; the step descriptions are inferred from their names and
# arguments and should be confirmed.
jcs_df = (
      JCS_DF
      .pipe(op.map, ["Full Journal Title", "Total Cites"])
      .pipe(op.rename_cols, columns={
            "Full Journal Title": "Title",
            "Total Cites": "Total Cites JCS",
      })
      # 'Upper Title' is the key used later to join with the Scimago table.
      .pipe(op.apply_to_every_row, 'Upper Title', _jcs_upper_title)
      # The count goes through 'str' first so the commas can be removed
      # before the final cast to float64.
      .pipe(op.convert_type, "Total Cites JCS", 'str')
      .pipe(op.apply_to_every_row, "Total Cites JCS", _jcs_cites_without_commas)
      .pipe(op.convert_type, "Total Cites JCS", 'float64')
)
          

def _preferred_title(row):
      """Return 'Title_y' unless it is null, in which case return 'Title_x'.

      NOTE(review): presumably 'Title_x' comes from the Scimago side and
      'Title_y' from the JCS side of the join — confirm against op.join.
      """
      right_title = row['Title_y']
      if pd.isnull(right_title):
            return row['Title_x']
      return right_title


# Combined citation table: outer join of Scimago and JCS on the upper-cased title.
# NOTE(review): the op.* helpers live in the project's `operations` module and
# are not visible here; the step descriptions are inferred from their names and
# arguments and should be confirmed.
cites_df = (
      scimago_df
      .pipe(
            op.join,
            df_right=jcs_df,
            left_on=["Upper Title"],
            right_on=["Upper Title"],
            how='outer',
      )
      .pipe(op.apply_to_every_row, 'Title', _preferred_title)
      .pipe(op.map, ["Title", 'Upper Title', 'Total Cites JCS', 'Total Cites Scimago'])
      .pipe(op.distinct)
      # TODO: confirm with the professor that hard-coding the year to 2020 is correct.
      .pipe(op.create_column, "year", "int", 2020)
      .pipe(op.rename_cols, {
            "Title": "title",
            "Upper Title": "upper_title",
            "Total Cites JCS": "jcs_value",
            "Total Cites Scimago": "scimago_value",
      })
)

def _bibtex_upper_title(row):
      """Return the row's "title" upper-cased, then stripped of surrounding whitespace."""
      return row["title"].upper().strip()


# Columns kept after the join. "title_x" is renamed back to "title" afterwards.
_BIBTEX_OUTPUT_COLUMNS = [
      "author",
      "title_x",
      "keywords",
      "abstract",
      "year",
      "type_publication",
      "doi",
      "jcs_value",
      "scimago_value",
]

# Publications from the three sources, left-joined with the citation values on
# (upper_title, year), then filtered and exported by the `sop` helpers.
# NOTE(review): the op.* / sop.* helpers are project modules not visible here;
# the step descriptions are inferred from their names and arguments.
bibtexs_df = (
      pd.concat([SCIENCE_DIRECT_DF, ACM_DF, IEE_DF])
      .pipe(op.distinct)
      .pipe(op.apply_to_every_row, 'upper_title', _bibtex_upper_title)
      .pipe(
            op.join,
            df_right=cites_df,
            left_on=["upper_title", "year"],
            right_on=["upper_title", "year"],
            how="left",
      )
      # Disabled step: .pipe(op.query, "not title_y.isnull()") would return only the
      # records that matched in the join between the bibtex files and the CSV data.
      .pipe(op.map, _BIBTEX_OUTPUT_COLUMNS)
      .pipe(op.rename_cols, {"title_x": "title"})
      .pipe(sop.filter_column_by_regex, CONFIG)
      .pipe(sop.export_dataframe, CONFIG, OUTPUT_FOLDER)
)


bibtexs_df


Unnamed: 0,author,title,keywords,abstract,year,type_publication,doi,jcs_value,scimago_value
74,Mark Birkin,Big Data,"Administrative data, Crowdsourcing, Data ethic...",Big Data are deeply impactful for research in ...,2020,incollection,https://doi.org/10.1016/B978-0-08-102295-5.106...,606.0,555.0
