In [None]:
#COMMIT DA AULA 3

import pandas as pd 
from modules.operations import Operations as op
from modules.specific_operations import SpecificOperations as sop
from modules.config import Config as config
from modules.bibtex_reader import BibtexReader 
from modules.folder_reader import FolderReader

# Pipeline settings file and the folder handed to the export step.
CONFIG_FILEPATH = './config.yaml'
OUTPUT_FOLDER = './../output'

# Journal-metric tables, read with pd.read_csv further down.
JCS_INPUT_FILEPATH = './../../data/jcs_2020.csv'
SCIMAGO_INPUT_FILEPATH = './../../data/scimagojr 2020.csv'

# One input folder per publication source, read through BibtexReader.
ACM_INPUT_FOLDER = './../../data/acm'
IEEE_INPUT_FOLDER = './../../data/ieee'
SCIENCE_INPUT_FOLDER = './../../data/science_direct'

# Loaded once here; passed to the filter and export steps at the end.
CONFIG = config.get_config(CONFIG_FILEPATH)

# Extração

In [None]:
def _read_bibtex_folder(folder):
    """Read the files FolderReader lists for `folder` through BibtexReader."""
    filepaths = FolderReader.get_filepaths_from_folder(folder)
    return BibtexReader.read_files_to_dataframe(filepaths)


# Raw frames, one per source; kept untouched so later cells can be re-run.
DF_ACM = _read_bibtex_folder(ACM_INPUT_FOLDER)
DF_IEEE = _read_bibtex_folder(IEEE_INPUT_FOLDER)
DF_SD = _read_bibtex_folder(SCIENCE_INPUT_FOLDER)

# Both journal tables are semicolon-separated.
DF_JCS = pd.read_csv(JCS_INPUT_FILEPATH, sep=";")
DF_SCIMAGO = pd.read_csv(SCIMAGO_INPUT_FILEPATH, sep=";", low_memory=False)

# Transformação - Tratamento dos bibtex

In [None]:
def _strip_issn_hyphens(row):
    """Return the row's ISSN with hyphens removed, or pd.NA when it is missing."""
    if pd.isnull(row.issn):
        return pd.NA
    return row.issn.replace("-", "")


# ACM: same clean-up as the other two sources. The hyphen stripping was
# previously applied to IEEE and Science Direct only, so a hyphenated ACM ISSN
# could never equal the ISSN keys used in the journal joins further down.
df_acm = (DF_ACM
          .convert_dtypes()
          .pipe(op.map, ["author", "title", "keywords", "abstract", "year", "ENTRYTYPE", "doi", "issn", "isbn", "journal"])
          .pipe(op.create_column, "source", str, "acm")
          .pipe(op.rename_cols, {
                "ENTRYTYPE": "type_publication"
          })
          .pipe(op.apply_to_every_row, function = _strip_issn_hyphens,
                                       output_col = 'issn'))

# IEEE: "isbn" is not selected from the input; it is created as an all-NA
# column so the three frames share the same set of columns.
df_ieee = (DF_IEEE
          .convert_dtypes()
          .pipe(op.map, ["author", "title", "keywords", "abstract", "year", "ENTRYTYPE", "doi", "issn", "journal"])
          .pipe(op.create_column, "isbn", str, pd.NA)
          .pipe(op.create_column, "source", str, "ieee")
          .pipe(op.rename_cols, {
                "ENTRYTYPE": "type_publication"
          })
          .pipe(op.apply_to_every_row, function = _strip_issn_hyphens,
                                       output_col = 'issn'))

# Science Direct.
df_sd = (DF_SD
          .convert_dtypes()
          .pipe(op.map, ["author", "title", "keywords", "abstract", "year", "ENTRYTYPE", "doi", "issn", "isbn", "journal"])
          .pipe(op.create_column, "source", str, "science direct")
          .pipe(op.rename_cols, {
                "ENTRYTYPE": "type_publication"})
          .pipe(op.apply_to_every_row, function = _strip_issn_hyphens,
                                       output_col = 'issn'))

# Transformação - JCS e SCIMAGO

In [None]:
def _bibtex_upper_journal(row):
    """Upper-cased, trimmed journal name used as a join key; pd.NA when absent."""
    if pd.isnull(row.journal):
        return pd.NA
    return row.journal.upper().strip()


# Union of all BibTeX frames into a single DataFrame, without exact duplicates.
bibtex_frames = [df_acm, df_ieee, df_sd]
df_bibtex = op.apply_to_every_row(
    pd.concat(bibtex_frames).convert_dtypes().drop_duplicates(),
    function = _bibtex_upper_journal,
    output_col = 'upper_journal',
)



# Union of the Scimago and JCS tables into a single DataFrame (built below).

def _scimago_upper_title(row):
    """Upper-cased, trimmed journal title used as the join key."""
    return row.title.upper().strip()


def _scimago_decimal_point(row):
    """Swap the decimal comma of the metric for a decimal point."""
    return row.scimago_value.replace(",", ".")


scimago_selected = op.rename_cols(
    op.map(DF_SCIMAGO.convert_dtypes(), cols = ['Issn', 'Title', 'SJR']),
    columns = {
        'SJR': 'scimago_value',
        'Issn': 'issn',
        'Title': 'title'
    },
)

# NOTE(review): the metric is cast to str before the comma is replaced. If
# op.convert_type turns missing values into text, the isnull() filters used in
# the later joins would treat them as present -- confirm against op.convert_type.
df_scimago = (scimago_selected
                  .pipe(op.apply_to_every_row, function = _scimago_upper_title,
                                               output_col = 'upper_title')
                  .pipe(op.convert_type, col = 'scimago_value', type = str)
                  .pipe(op.apply_to_every_row, function = _scimago_decimal_point,
                                               output_col = 'scimago_value'))

def _jcs_upper_title(row):
    """Upper-cased, trimmed journal title used as the join key."""
    return row["title"].upper().strip()


# Keep only the title and the impact factor, under the names the joins expect.
jcs_selected = op.rename_cols(
    op.map(DF_JCS.convert_dtypes(), cols = ["Full Journal Title", "Journal Impact Factor"]),
    columns = {
        "Full Journal Title": "title",
        "Journal Impact Factor": "jcs_value"
    },
)

df_jcs = op.apply_to_every_row(jcs_selected,
                               function = _jcs_upper_title,
                               output_col = 'upper_title')

def _blank_missing_issn(row):
    """Return '' for a missing or '-' ISSN list; otherwise the value unchanged."""
    issn_list = row.issn_journal
    if pd.isnull(issn_list) or str(issn_list) == '-':
        return ''
    return issn_list


# Outer join on the normalized title keeps journals present in only one table.
journal_joined = op.join(df_scimago,
                         df_right = df_jcs,
                         left_on = ["upper_title"],
                         right_on = ["upper_title"],
                         how = "outer")

df_journal = (journal_joined
                  .pipe(op.map, cols = ['issn', 'upper_title', 'scimago_value', 'jcs_value'])
                  .pipe(op.rename_cols, columns = {
                        'issn': 'issn_journal'
                  })
                  .pipe(op.apply_to_every_row, function = _blank_missing_issn,
                                               output_col = 'issn_journal'))

def _clean_issn_part(part):
    """Trim one split ISSN column and turn blank or missing entries into pd.NA.

    Trimming matters because the list is split on "," alone: a list written
    as "a, b" would otherwise yield " b", which can never equal a BibTeX ISSN.
    """
    trimmed = part.astype("string").str.strip()
    has_value = (trimmed.fillna("") != "").astype(bool)
    return trimmed.where(has_value, pd.NA)


# Split the comma-separated ISSN list. n=3 keeps the third piece clean (any
# remainder goes to a fourth piece that is not used). reindex guarantees that
# pieces 0..2 exist even when no journal lists that many ISSNs, where the
# previous .loc[:, 1] / .loc[:, 2] lookups raised KeyError.
df_regex_groups = (df_journal['issn_journal']
                       .str.split(pat=",", n=3, expand=True)
                       .reindex(columns=range(3)))

# assign() returns a new frame instead of adding columns to df_journal in place.
df_journal = df_journal.assign(
    issn_1 = _clean_issn_part(df_regex_groups[0]),
    issn_2 = _clean_issn_part(df_regex_groups[1]),
    issn_3 = _clean_issn_part(df_regex_groups[2]),
)
# Joins between the BibTeX records and the journal metrics (Scimago / JCS).
#
# Matching is attempted in four passes: issn_1, issn_2, issn_3 and finally the
# normalized journal title. Each pass after the first only retries the records
# that still have neither metric.

# Publication columns carried through every pass.
_PUBLICATION_COLS = ['author', 'title', 'keywords', 'abstract', 'year', 'type_publication', 'doi', 'issn',
                     'journal', 'source']
_UNRESOLVED = "scimago_value.isnull() & jcs_value.isnull()"
_RESOLVED = "~(scimago_value.isnull() & jcs_value.isnull())"


def _journals_with(journal_key):
    """Rows of df_journal whose `journal_key` is not null, so null keys never take part in a join."""
    return df_journal[~pd.isnull(df_journal[journal_key])]


def _retry_unresolved(df_previous, left_key, journal_key):
    """Re-join the still-unmatched rows of `df_previous` on a different journal key.

    The left side already carries (all-null) scimago_value / jcs_value columns,
    so the freshly joined metrics are read from the `_y` columns and renamed
    back. NOTE(review): this relies on op.join producing `_x` / `_y` suffixes
    for clashing column names -- confirm against op.join.
    """
    return (df_previous
                .query(_UNRESOLVED)
                .pipe(op.join, df_right = _journals_with(journal_key),
                               left_on  = [left_key],
                               right_on = [journal_key],
                               how      = 'left')
                .pipe(op.map, cols = _PUBLICATION_COLS + ['scimago_value_y', 'jcs_value_y', 'upper_journal'])
                .pipe(op.rename_cols, columns = {
                      'scimago_value_y': 'scimago_value',
                      'jcs_value_y': 'jcs_value'
                }))


# Pass 1: every BibTeX record against the first ISSN of each journal.
df_subset_1 = (df_bibtex
                  .pipe(op.join, df_right = _journals_with('issn_1'),
                                 left_on  = ['issn'],
                                 right_on = ['issn_1'],
                                 how      = 'left')
                  .pipe(op.map, cols = _PUBLICATION_COLS + ['scimago_value', 'jcs_value', 'upper_journal']))

# Passes 2-4: retry only what is still unmatched.
df_subset_2 = _retry_unresolved(df_subset_1, left_key = 'issn', journal_key = 'issn_2')
df_subset_3 = _retry_unresolved(df_subset_2, left_key = 'issn', journal_key = 'issn_3')
df_subset_4 = _retry_unresolved(df_subset_3, left_key = 'upper_journal', journal_key = 'upper_title')

# Matched rows from passes 1-3 plus everything from the last pass, which also
# keeps the records that never matched.
df_final = (pd.concat([df_subset_1.query(_RESOLVED),
                       df_subset_2.query(_RESOLVED),
                       df_subset_3.query(_RESOLVED),
                       df_subset_4])
            .drop_duplicates()
            .reset_index(drop = True)
            .pipe(op.map, cols = _PUBLICATION_COLS + ['scimago_value', 'jcs_value']))


# Data load: filter the final frame according to CONFIG, then export it.
df_filtered = sop.filter_column_by_regex(df_final, CONFIG)
# Left as a bare expression so the cell still renders whatever the export returns.
sop.export_dataframe(df_filtered, CONFIG, OUTPUT_FOLDER)

In [None]:
# Only the last expression of a cell is rendered, so the matched subset has to
# be shown explicitly; as a bare statement its value was computed and dropped.
# `display` is provided by the IPython kernel without an import.
display(df_final.query("~(scimago_value.isnull() & jcs_value.isnull())"))

# Full result, including records that matched no journal metric.
df_final