<a href="https://colab.research.google.com/github/Guisilcol/spark-for-data-enginners/blob/main/Spark_For_Data_Enginners_Execicio_Aula_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalação de módulos 



In [None]:
!pip install pyspark
!pip install bibtexparser


# Importação de módulos e inicialização de constantes

In [None]:
import pyspark
import pyspark.sql as spark_sql
import pyspark.sql.functions as F
import typing as python_types
import pyspark.sql.types as spark_types

# Root folder of the exercise on the mounted Google Drive, and its data folder.
_EXERCISE_DIR = '/content/drive/MyDrive/Impacta/Spark for Data Enginners/Exercicio Aula 3'
_DATA_DIR = _EXERCISE_DIR + '/data'

# Glob patterns matching the BibTeX files of each source.
ACM_BIBTEXS_INPUT_FOLDER = _DATA_DIR + '/acm/*'
IEEE_BIBTEXS_INPUT_FOLDER = _DATA_DIR + '/ieee/*'
SCIENCE_DIRECT_BIBTEXS_INPUT_FOLDER = _DATA_DIR + '/science_direct/*'

# Journal ranking CSV files.
JCR_FILEPATH = _DATA_DIR + '/jcs_2020.csv'
SCIMAGO_FILEPATH = _DATA_DIR + '/scimagojr 2020.csv'

# Output location (sibling of the data folder).
JSON_OUTPUT_FILE = _EXERCISE_DIR + '/output3'


# Shared Spark session for the whole notebook: reuse the active session if
# there is one, otherwise start one with master "local[*]".
SPARK = (
    spark_sql.SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# UDFs e Python Functions

In [None]:
from glob import glob as get_filenames_in_folder
from bibtexparser import load as load_bibtex_database

class BibtexHandler:
  """Helpers that load BibTeX files into plain Python structures."""

  @staticmethod
  def parse_bibtex_folder_in_dict_list(folder_path: str) -> list:
    """Parse every file matched by a glob pattern and collect its entries.

    Args:
      folder_path: glob pattern (for example ``"/data/acm/*"``) selecting
        the BibTeX files to read. Files are decoded as UTF-8.

    Returns:
      A single flat list holding the ``entries`` of every parsed file, in
      the order the files are returned by the glob. An empty list is
      returned when the pattern matches nothing.
    """
    entries = []
    for path in get_filenames_in_folder(folder_path):
      # Parse inside a ``with`` block so each handle is closed as soon as
      # its file has been read, instead of leaking one open file per path.
      with open(path, "r", encoding="utf-8") as bibtex_file:
        entries.extend(load_bibtex_database(bibtex_file).entries)
    return entries

class Operations:
  """Reusable DataFrame transformations."""

  @staticmethod
  def generate_key_from_journal_name(df: spark_sql.DataFrame, journal_title_column: str, output_column: str):
    """Return *df* with *output_column* set to a normalised journal-title key.

    The key is derived from *journal_title_column* by replacing "&" with
    "AND", deleting every run of characters other than ASCII letters and
    digits, upper-casing the result and trimming it.
    """
    journal_key = F.regexp_replace(journal_title_column, "&", "AND")
    journal_key = F.regexp_replace(journal_key, r"([^A-Za-z0-9]+)", "")
    journal_key = F.trim(F.upper(journal_key))
    return df.withColumn(output_column, journal_key)





# Extração de dados (Python side)

In [None]:
# One list of parsed BibTeX entries per source, loaded on the Python side
# before anything is handed to Spark.
ACM_INPUT_DATA: python_types.List[dict]
IEEE_INPUT_DATA: python_types.List[dict]
SCIENCE_DIRECT_INPUT_DATA: python_types.List[dict]

ACM_INPUT_DATA, IEEE_INPUT_DATA, SCIENCE_DIRECT_INPUT_DATA = (
    BibtexHandler.parse_bibtex_folder_in_dict_list(input_folder)
    for input_folder in (
        ACM_BIBTEXS_INPUT_FOLDER,
        IEEE_BIBTEXS_INPUT_FOLDER,
        SCIENCE_DIRECT_BIBTEXS_INPUT_FOLDER,
    )
)

# Extração e Transformação de dados

---



In [None]:
########################################## DataFrame creation ##########################################
# BibTeX sources: build one DataFrame per in-memory entry list and mark it
# for caching.
ACM_DF, IEEE_DF, SCIENCE_DIRECT_DF = (
    SPARK.createDataFrame(entries).cache()  # type: ignore
    for entries in (ACM_INPUT_DATA, IEEE_INPUT_DATA, SCIENCE_DIRECT_INPUT_DATA)
)

# Journal ranking CSVs: both are read with ";" as separator and the first
# line as header.
JCS_DF = SPARK.read.options(sep=";", header=True).csv(JCR_FILEPATH)
SCIMAGO_DF = SPARK.read.options(sep=";", header=True).csv(SCIMAGO_FILEPATH)

In [None]:



########################################## Initial BibTeX clean-up ##########################################
# Every source goes through the same steps: keep the shared columns, rename
# ENTRYTYPE to type_publication, tag the rows with their origin, strip the
# dashes from the ISSN and drop exact duplicate rows.
acm_df = ACM_DF.select(["author", "title", "keywords", "abstract", "year", "ENTRYTYPE", "doi", "issn", "isbn", "journal"])\
        .withColumnRenamed("ENTRYTYPE", "type_publication")\
        .withColumn("source", F.lit("acm"))\
        .withColumn("issn", F.regexp_replace("issn", "-", ""))\
        .dropDuplicates()

# "isbn" is not selected from the IEEE data; a NULL string column stands in
# for it so the three frames expose the same set of columns. Note that it is
# appended last, so ieee_df's column ORDER differs from the other two.
ieee_df = IEEE_DF.select(["author", "title", "keywords", "abstract", "year", "ENTRYTYPE", "doi", "issn", "journal"])\
          .withColumnRenamed("ENTRYTYPE", "type_publication")\
          .withColumn("source", F.lit("ieee"))\
          .withColumn("issn", F.regexp_replace("issn", "-", ""))\
          .withColumn("isbn", F.lit(None).cast(spark_types.StringType()))\
          .dropDuplicates()

science_direct_df = SCIENCE_DIRECT_DF.select(["author", "title", "keywords", "abstract", "year", "ENTRYTYPE", "doi", "issn", "isbn", "journal"])\
                    .withColumnRenamed("ENTRYTYPE", "type_publication")\
                    .withColumn("source", F.lit("science direct"))\
                    .withColumn("issn", F.regexp_replace("issn", "-", ""))\
                    .dropDuplicates()

########################################## Union of the BibTeX sources ##########################################

# Merge by column NAME, not by position. union() resolves columns
# positionally, and ieee_df ends with (journal, source, isbn) while the other
# frames end with (isbn, journal, source); a positional union would file the
# IEEE journal under "isbn", the literal "ieee" under "journal" and NULL
# under "source". The result keeps acm_df's column order.
bibtex_df = acm_df.unionByName(ieee_df).unionByName(science_direct_df)

In [None]:
########################################## BibTeX transformations ##########################################
# Add the normalised journal key derived from the "journal" column.
bibtex_df = Operations.generate_key_from_journal_name(
    bibtex_df,
    journal_title_column='journal',
    output_column='journal_name_key',
)

# Remove the resolver URL prefix from the DOI values.
bibtex_df = bibtex_df.withColumn('doi', F.regexp_replace('doi', 'https://doi.org/', ''))

########################################## CSV transformations ##########################################
# Scimago: keep the ISSN, title and SJR columns under the names used by the
# rest of the notebook, then add the normalised journal key.
scimago_df = Operations.generate_key_from_journal_name(
    SCIMAGO_DF.select(
        F.col('Issn').alias('issn'),
        F.col('Title').alias('scimago_title'),
        F.col('SJR').alias('scimago_value'),
    ),
    journal_title_column='scimago_title',
    output_column='journal_name_key',
)

# JCS: keep the journal title and impact factor, then add the same key.
jcs_df = Operations.generate_key_from_journal_name(
    JCS_DF.select(
        F.col("Full Journal Title").alias("jcs_title"),
        F.col("Journal Impact Factor").alias("jcs_value"),
    ),
    journal_title_column='jcs_title',
    output_column='journal_name_key',
)

########################################## JOIN of the CSV files ##########################################
# Full outer join of both rankings on the normalised journal key, so a
# journal present in only one of the files is still kept. The title comes
# from Scimago when available, otherwise from JCS. Spaces are removed from
# the ISSN list and a single row is kept per journal key.
journal_df = scimago_df.alias("scimago_df")\
            .join(jcs_df, 'journal_name_key', 'outer')\
            .withColumn('title', F.coalesce("scimago_title", "jcs_title"))\
            .withColumn("issn", F.regexp_replace("issn", " ",""))\
            .select(['title', 'journal_name_key', 'issn', 'scimago_value', 'jcs_value'])\
            .dropDuplicates(["journal_name_key"])

# A lone "-" in the ISSN column is turned into NULL.
journal_df = journal_df.withColumn("issn", F.when(journal_df["issn"] == '-', None).otherwise(journal_df["issn"]))

# The ISSN column may hold several comma-separated values; expose the first
# three as separate columns so each one can be compared individually.
splited_issn = F.split(journal_df["issn"], ",")

# Array positions are 0-based: the third ISSN is item 2 (item 3 would be a
# fourth value and left issn_3 pointing past the intended element).
journal_df = journal_df.withColumn('issn_1', splited_issn.getItem(0))\
                        .withColumn('issn_2', splited_issn.getItem(1))\
                        .withColumn('issn_3', splited_issn.getItem(2))\
                        .select(['title', 'journal_name_key', 'issn_1', 'issn_2', 'issn_3', 'scimago_value', 'jcs_value'])


In [None]:
########################################## JOIN between the BibTeX entries and the journal CSVs ##########################################
# Register both DataFrames as temporary views so the SQL statement below can
# refer to them by name.
journal_df.createOrReplaceTempView("journal_df")
bibtex_df.createOrReplaceTempView("bibtex_df")


# Enrich each publication with the journal metrics. The LEFT JOIN keeps every
# bibtex row; a journal row is attached when the publication ISSN equals one
# of issn_1/issn_2/issn_3 or when the normalised journal keys are equal.
# SELECT DISTINCT removes identical output rows. "isbn" and
# "journal_name_key" are not part of the projection.
# NOTE(review): the two "is not null" predicates are ANDed onto the whole OR
# group, so a publication missing either its ISSN or its journal never gets a
# match, even through the other key — confirm this is intended.
df_final = SPARK.sql(
"""
      SELECT DISTINCT
            bib.author
            ,bib.title
            ,bib.keywords
            ,bib.abstract
            ,bib.year
            ,bib.type_publication
            ,bib.doi
            ,bib.issn
            ,COALESCE(bib.journal, jor.title) journal
            ,bib.source
            ,jor.scimago_value  
            ,jor.jcs_value
      FROM 
            bibtex_df bib
            LEFT JOIN journal_df jor
                  ON (bib.issn = jor.issn_1
                        OR bib.issn = jor.issn_2
                        OR bib.issn = jor.issn_3
                        OR bib.journal_name_key = jor.journal_name_key)
                  AND bib.issn is not null 
                  AND bib.journal is not null;""")

# NOTE(review): no cache()/persist() call on journal_df or bibtex_df is
# visible in this file (only ACM_DF, IEEE_DF and SCIENCE_DIRECT_DF are
# cached), so these two calls look like no-ops while the cached source
# frames are never released — confirm which frames should be unpersisted.
journal_df.unpersist()
bibtex_df.unpersist()




DataFrame[author: string, title: string, keywords: string, abstract: string, year: string, type_publication: string, doi: string, issn: string, isbn: string, journal: string, source: string, journal_name_key: string]