In [1]:
#COMMIT DA AULA 3

import pandas as pd 
from modules.operations import Operations as op
from modules.specific_operations import SpecificOperations as sop
from modules.config import Config as config
from modules.bibtex_reader import BibtexReader 
from modules.folder_reader import FolderReader
from modules.sqlite3_loader import Sqlite3Handler
from modules.ieee_api_handler import IeeeApiHandler
from modules.science_direct_api_handler import ScienceDirectApiHandler

OUTPUT_FOLDER = './output'
CONFIG_FILEPATH = './config.yaml'

JCS_INPUT_FILEPATH = "./data/jcs_2020.csv"
SCIMAGO_INPUT_FILEPATH = "./data/scimagojr 2020.csv"

ACM_INPUT_FOLDER = "./data/acm"
IEEE_INPUT_FOLDER = "./data/ieee"
SCIENCE_INPUT_FOLDER = "./data/science_direct"

DB_FILEPATH = ':memory:'
OUTPUT_DB_FILEPATH = './output/db.sqlite3'

CONFIG = config.get_config(CONFIG_FILEPATH)

# Extração

In [8]:
DF_ACM = BibtexReader.read_files_to_dataframe(
    FolderReader.get_filepaths_from_folder(ACM_INPUT_FOLDER)
)

DF_IEEE = BibtexReader.read_files_to_dataframe(
    FolderReader.get_filepaths_from_folder(IEEE_INPUT_FOLDER)
)

DF_SD = BibtexReader.read_files_to_dataframe(
    FolderReader.get_filepaths_from_folder(SCIENCE_INPUT_FOLDER)
)

DF_JCS = pd.read_csv(JCS_INPUT_FILEPATH, sep=";")

DF_SCIMAGO = pd.read_csv(SCIMAGO_INPUT_FILEPATH, sep=";", low_memory=False)

# Transformação - Tratamento dos bibtex

In [11]:

################################ Tratamento dos bibtex ACM ################################

df_acm = DF_ACM.convert_dtypes()
df_acm = df_acm[["author", "title", "keywords", "abstract", "year", "ENTRYTYPE", "doi", "issn", "isbn", "journal"]]
df_acm = df_acm.rename(columns={"ENTRYTYPE": "type_publication"})
df_acm["issn"] = df_acm["issn"].mask(~df_acm["issn"].isnull(), df_acm["issn"].str.replace("-", ""))
df_acm["source"] = "acm"



################################ Tratamento dos bibtex IEEE ################################
df_ieee = DF_IEEE.convert_dtypes()
df_ieee = df_ieee[["author", "title", "keywords", "abstract", "year", "ENTRYTYPE", "doi", "issn", "journal"]]
df_ieee = df_ieee.rename(columns={"ENTRYTYPE": "type_publication"})
df_ieee["isbn"] = pd.NA
df_ieee["issn"] = df_ieee["issn"].mask(~df_ieee["issn"].isnull(), df_ieee["issn"].str.replace("-", ""))
df_ieee["source"] = "ieee"


################################ Tratamento dos bibtex Science Direct ################################
df_sd = DF_SD.convert_dtypes()
df_sd = df_sd[["author", "title", "keywords", "abstract", "year", "ENTRYTYPE", "doi", "issn", "isbn", "journal"]]
df_sd = df_sd.rename(columns={"ENTRYTYPE": "type_publication"})
df_sd["issn"] = df_sd["issn"].mask(~df_sd["issn"].isnull(), df_sd["issn"].str.replace("-", ""))
df_sd["source"] = "science direct"


# Transformação - JCS e SCIMAGO

In [14]:
################################ Tratamento de todos os bibtex ################################
df_bibtex = pd.concat([df_acm, df_ieee, df_sd])
df_bibtex = df_bibtex.convert_dtypes()
df_bibtex = df_bibtex.drop_duplicates()
df_bibtex["journal_title_key"] = df_bibtex["journal"].pipe(sop.generate_key_with_journal_name)  # type: ignore



################################ SCIMAGO ################################
df_scimago = DF_SCIMAGO.convert_dtypes()
df_scimago = df_scimago[['Issn', 'Title', 'SJR']]
df_scimago = df_scimago.rename(columns={'SJR': 'scimago_value', 'Issn': 'issn', 'Title': 'title'})
df_scimago["journal_title_key"] = df_scimago["title"].pipe(sop.generate_key_with_journal_name)  # type: ignore

df_scimago["scimago_value"] = df_scimago.scimago_value.astype("string")\
                                                      .str.replace(",", ".")


################################ JCS ################################
df_jcs = DF_JCS.convert_dtypes()
df_jcs = df_jcs[["Full Journal Title", "Journal Impact Factor"]]
df_jcs = df_jcs.rename(columns={"Full Journal Title": "title", "Journal Impact Factor": "jcs_value"})
df_jcs["journal_title_key"] = df_jcs["title"].pipe(sop.generate_key_with_journal_name)  # type: ignore


################################ JOIN ENTRE SCIMAGO E JCS -> df_journal ################################

df_journal = pd.merge(left=df_scimago, right=df_jcs, left_on=["journal_title_key"], right_on=["journal_title_key"], how="outer") 

df_journal["title"] = df_journal["title_x"].mask(pd.isnull(df_journal["title_x"]), df_journal["title_y"])

df_journal = df_journal[['title', "issn", "journal_title_key", "scimago_value", "jcs_value"]]
df_journal = df_journal.rename(columns={"issn": "issn_journal"})

df_journal["issn_journal"] = df_journal["issn_journal"].mask(pd.isnull(df_journal["issn_journal"]), '-')

df_journal["upper_title"] = df_journal["title"].str.upper().str.strip()

df_journal = df_journal.drop_duplicates()

df_regex_groups = df_journal['issn_journal'].str.split(pat=",", n=3, expand=True)
df_journal['issn_1'] = df_regex_groups.loc[:, 0]
df_journal['issn_2'] = df_regex_groups.loc[:, 1]
df_journal['issn_3'] = df_regex_groups.loc[:, 2]

df_journal['issn_1'] = df_journal['issn_1'].mask(df_journal["issn_1"] == '-')
df_journal['issn_2'] = df_journal['issn_2'].mask(df_journal["issn_2"] == '-')
df_journal['issn_3'] = df_journal['issn_3'].mask(df_journal["issn_3"] == '-')

df_journal = df_journal.drop_duplicates()

################################ JOIN ENTRE df_journal E df_bibtex no Banco de dados -> df_final ################################
CONNECTION = Sqlite3Handler.get_connection(DB_FILEPATH)
Sqlite3Handler.load_dataframe_to_db(df_bibtex, connection = CONNECTION, table_name = 'df_bibtex', if_exists = 'replace')
Sqlite3Handler.load_dataframe_to_db(df_journal, connection = CONNECTION, table_name = 'df_journal', if_exists = 'replace')
Sqlite3Handler.create_index(CONNECTION, "idx_df_bibtex_1", "df_bibtex", ["issn"])
Sqlite3Handler.create_index(CONNECTION, "idx_df_bibtex_2", "df_bibtex", ["journal_title_key"])
Sqlite3Handler.create_index(CONNECTION, "idx_df_journal_1", "df_journal", ["issn_1"])
Sqlite3Handler.create_index(CONNECTION, "idx_df_journal_2", "df_journal", ["issn_2"])
Sqlite3Handler.create_index(CONNECTION, "idx_df_journal_3", "df_journal", ["issn_3"])
Sqlite3Handler.create_index(CONNECTION, "idx_df_journal_4", "df_journal", ["journal_title_key"])

df_final = Sqlite3Handler.read_df_from_sql(CONNECTION, """--sql
      SELECT DISTINCT
            bib.author
            ,bib.title
            ,bib.keywords
            ,bib.abstract
            ,bib.year
            ,bib.type_publication
            ,bib.doi
            ,bib.issn
            ,COALESCE(bib.journal, jor.title) journal
            ,bib.source
            ,jor.scimago_value
            ,jor.jcs_value
      FROM 
            df_bibtex bib
            LEFT JOIN df_journal jor
                  ON (bib.issn = jor.issn_1
                  OR bib.issn = jor.issn_2
                  OR bib.issn = jor.issn_3
                  OR bib.journal_title_key = jor.journal_title_key)
                  AND NOT (bib.issn is null and bib.journal is null);
                  
""")

CONNECTION.close()

# Carga dos dados
OUTPUT_CONNECTION = Sqlite3Handler.get_connection(OUTPUT_DB_FILEPATH)

df_final = df_final.drop_duplicates(subset=["title", "year"])
df_final = df_final.pipe(sop.filter_column_by_regex, CONFIG)
df_final = df_final.pipe(Sqlite3Handler.load_dataframe_to_db, 
                              connection = OUTPUT_CONNECTION, 
                              table_name = 't_bibtex_extraidos_manualmente', 
                              if_exists = 'append')

OUTPUT_CONNECTION.close()

  .str.replace(r"([^A-Za-z0-9]+)", "")\
  .str.replace(r"([^A-Za-z0-9]+)", "")\
  .str.replace(r"([^A-Za-z0-9]+)", "")\


# Transformação - API's IEEE e Science Direct

In [None]:

science_direct_responses = ScienceDirectApiHandler.get_data(CONFIG.get("science_direct_api_config", {}))
sd_api_df = ScienceDirectApiHandler.parse_response_to_dataframe(science_direct_responses)

ieee_responses = IeeeApiHandler.get_data(CONFIG.get("iee_api_config", {}))
ieee_api_df = IeeeApiHandler.parse_response_to_dataframe(ieee_responses).drop_duplicates()

OUTPUT_CONNECTION = Sqlite3Handler.get_connection(OUTPUT_DB_FILEPATH)

Sqlite3Handler.load_dataframe_to_db(sd_api_df, OUTPUT_CONNECTION, "t_api_science_direct", "append")
Sqlite3Handler.load_dataframe_to_db(ieee_api_df, OUTPUT_CONNECTION, "t_api_ieee", "append")


Unnamed: 0,authors,title,keywords,abstract,year,type_publication,doi,issn,journal,source
0,"Ikbal Taleb, Mohamed Adel Serhani, Rachida Dss...",Big Data Quality Assessment Model for Unstruct...,"Big Data, Data integrity, Data mining, Feature...",Big Data has gained an enormous momentum the p...,2018,Conferences,10.1109/INNOVATIONS.2018.8605945,2325-5498,IEEE,ieee API
1,"Sakda Loetpipatwanich, Preecha Vichitthamaros",Sakdas: A Python Package for Data Profiling an...,"Data integrity, Pipelines, Data visualization,...",Data Profiling and data quality management bec...,2020,Conferences,10.1109/IBDAP50342.2020.9245455,,IEEE,ieee API
2,"Li Jin, Li Haosong, Xu Zhongping, Wang Ting, W...",Research on Wide-area Distributed Power Qualit...,"Power quality, Data integration, Distributed d...","With the advancement of the ""big operation"" sy...",2019,Conferences,10.1109/ICCCBDA.2019.8725668,,IEEE,ieee API
3,"Xing Pan, Manli Zhang, Xi Chen",A Method of Quality Improvement Based on Big Q...,"Warranties, Data mining, Product design, Quali...",Quality warranty data includes big data of pro...,2018,Conferences,10.1109/QRS-C.2018.00115,,IEEE,ieee API
4,"Andrew Burkhardt, Sheila Berryman, Ashley Brio...",Measuring Manufacturing Test Data Analysis Qua...,"Data integrity, Manufacturing, Measurement, De...",Manufacturing test data volumes are constantly...,2018,Conferences,10.1109/AUTEST.2018.8532518,1088-7725,IEEE,ieee API
...,...,...,...,...,...,...,...,...,...,...
395,"A.H. Moon, Ummer Iqbal Khan, Ashaq Hussain Dar...",Practical Implementation of WSN Based Data Acq...,"Wireless sensor networks, Data acquisition, Se...",WSN technology holds lot of promise in develop...,2013,Conferences,10.1109/ICMIRA.2013.21,,IEEE,ieee API
396,"Tom´ Knap, Jan Michelfeit, Martin Necaský",Linked Open Data Aggregation: Conflict Resolut...,"Aggregates, Databases, Resource description fr...",The paradigm of publishing governmental data i...,2012,Conferences,10.1109/COMPSACW.2012.29,,IEEE,ieee API
397,"Xuhui Chen, Jun Lu, Zhongyuan Liu",Assistance ontology of quality control for ent...,"Ontologies, Quality control, Data mining, Delt...",There are many quality domains in which ideas ...,2007,Conferences,10.1109/IEEM.2007.4419260,2157-362X,IEEE,ieee API
398,"Chaang-Yung Kung, Pei-Yi Yang, Tzung-Ming Yan",Applying Grey Relational Method to Analyze the...,"Quality function deployment, Medical services,...",In the context of economic level sustaining it...,2006,Conferences,10.1109/ICSMC.2006.384482,1062-922X,IEEE,ieee API
