In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, IntegerType, DateType, LongType, DoubleType

from pyspark.sql import functions as F
from pyspark.sql.functions import lit, when, col
import pandas as pd

import os

In [4]:
# Create the Spark session used by every cell below, or reuse one that already exists.
spark = SparkSession.builder.appName("Preprocessing SDM").config("spark.driver.memory", "2g").getOrCreate()

In [20]:
# Directory holding the DBLP CSV exports. Set the SDM_DATA_DIR environment variable to
# run the notebook on another machine; the original local path stays as the default.
root_directory = os.environ.get("SDM_DATA_DIR", "/mnt/c/MDS/Q2/SDM/data/data")

### Read data

#### Helper functions and table loading

In [22]:
def _create_schema(headers):
    """Build a Spark ``StructType`` from parsed ``name:type`` CSV headers.

    Args:
        headers: Iterable of ``[name, type]`` sequences, as produced by
            ``header.split(":")``. A one-element sequence (a header without
            a ``:type`` suffix) is accepted and treated as a string column.

    Returns:
        A ``StructType`` with one nullable field per header. The types
        ``int`` and ``ID`` map to ``LongType``; every other type is read as
        ``StringType``. A header with an empty name (e.g. ``":ID"``) is named
        after its type.

    Raises:
        ValueError: If a header is empty or has more than two parts.
    """
    long_types = ("int", "ID")
    fields = []
    for parts in headers:
        parts = list(parts)
        if len(parts) == 1:
            # No ":type" suffix in the header: fall back to a plain string column
            # instead of failing on the unpacking below.
            parts.append("string")
        columna, tipo = parts

        # Everything that is not an integer-like type is kept as raw text.
        spark_dtype = LongType() if tipo in long_types else StringType()

        # Anonymous columns such as ":ID" take their type as the column name.
        if columna == '':
            columna = tipo

        fields.append(StructField(columna, spark_dtype, True))
    return StructType(fields)


def read_data_w_sep_headers(name_data: str = "dblp_www"):
    """Load a ';'-delimited CSV whose header is stored in a separate file.

    The column definitions are read from ``<name_data>_header.csv`` and the
    rows from ``<name_data>.csv``, both located under ``root_directory``.

    Args:
        name_data: Base name of the dataset, without the ``.csv`` extension.

    Returns:
        A Spark DataFrame read with the schema derived from the header file.
    """
    # pandas is only used to parse the header line into column labels.
    header_columns = pd.read_csv(f"{root_directory}/{name_data}_header.csv", delimiter=";").columns
    schema = _create_schema(str(raw_header).split(":") for raw_header in header_columns)

    reader = spark.read.schema(schema).options(delimiter=";")
    return reader.csv(f"{root_directory}/{name_data}.csv")

def read_data_w_embedded_headers(name_data: str = "dblp_www"):
    """Load a ';'-delimited CSV whose first line is the header.

    Args:
        name_data: Base name of the dataset, without the ``.csv`` extension;
            the file is looked up under ``root_directory``.

    Returns:
        A Spark DataFrame read with the schema derived from the header line.
    """
    # Parse a single row with pandas just to obtain the column labels.
    header_columns = pd.read_csv(f"{root_directory}/{name_data}.csv", delimiter=";", nrows=1).columns
    schema = _create_schema(str(raw_header).split(":") for raw_header in header_columns)

    # header="true" tells Spark that the first line of the file is not data.
    reader = spark.read.schema(schema).options(delimiter=";", header="true")
    return reader.csv(f"{root_directory}/{name_data}.csv")
    

In [23]:
# Tables whose column definitions live in a separate `<name>_header.csv` file.
(
    article,
    book,
    data,
    incollection,
    inproceedings,
    mastersthesis,
    phdthesis,
    proceedings,
    www,
) = (
    read_data_w_sep_headers(name_data=dataset)
    for dataset in (
        "dblp_article",
        "dblp_book",
        "dblp_data",
        "dblp_incollection",
        "dblp_inproceedings",
        "dblp_mastersthesis",
        "dblp_phdthesis",
        "dblp_proceedings",
        "dblp_www",
    )
)


In [24]:
# Tables whose first line is the header.
author, authored_by, journal, journal_published_in = (
    read_data_w_embedded_headers(name_data=dataset)
    for dataset in (
        "dblp_author",
        "dblp_author_authored_by",
        "dblp_journal",
        "dblp_journal_published_in",
    )
)


In [25]:
# Each name is declared next to its DataFrame so the two lists cannot drift apart;
# zip transposes the pairs into (all names, all tables).
name_tables, tables = (
    list(column)
    for column in zip(
        ("article", article),
        ("book", book),
        ("data", data),
        ("incollection", incollection),
        ("inproceedings", inproceedings),
        ("mastersthesis", mastersthesis),
        ("phdthesis", phdthesis),
        ("proceedings", proceedings),
        ("www", www),
    )
)

In [27]:
# Keep only the title of each table and add a constant "X" column named after the
# table the row came from.
check1, check2 = (
    table.select("title", lit("X").alias(marker))
    for table, marker in ((proceedings, "proceedings"), (article, "article"))
)



In [None]:
# Full outer join on title, then count rows per combination of the two marker columns.
# A null marker means the title has no match on that side of the join.
(
    check1
    .join(check2, how="full", on="title")
    .groupBy("article", "proceedings")
    .agg(F.count("*"))
    .show()
)

[Stage 1:>                                                        (0 + 12) / 12]