In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, IntegerType, DateType, LongType, DoubleType

import pyspark
from pyspark.sql import functions as F
from pyspark.sql.functions import lit, when, col
import pandas as pd
from pyspark.sql import Window

import os

In [2]:
# Build (or reuse) the Spark session used by every cell below.
# The driver memory setting is requested as "1g".
_session_builder = SparkSession.builder.appName("Preprocessing SDM")
_session_builder = _session_builder.config("spark.driver.memory", "1g")
spark = _session_builder.getOrCreate()

your 131072x1 screen size is bogus. expect trouble
24/04/09 19:40:04 WARN Utils: Your hostname, MyLaptop resolves to a loopback address: 127.0.1.1; using 172.28.44.164 instead (on interface eth0)
24/04/09 19:40:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/09 19:40:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Directory that holds the DBLP CSV exports read by the loader functions.
# The original hard-coded location stays as the default so existing runs are
# unaffected; set the SDM_DATA_DIR environment variable to point the notebook
# at another location without editing this cell.
root_directory = os.environ.get("SDM_DATA_DIR", "/mnt/c/MDS/Q2/SDM/data/data")

### Read data

#### Helper functions and raw tables

In [4]:
def _create_schema(headers):
    """Build a Spark ``StructType`` from ``(column, type)`` header pairs.

    Each element of ``headers`` must unpack into exactly two items: the column
    name and its type tag. The tags ``'int'`` and ``'ID'`` map to ``LongType``;
    every other tag maps to ``StringType``. When the column name is the empty
    string, the type tag is used as the column name instead. Every field is
    declared nullable.
    """
    long_tags = ('int', 'ID')
    fields = []
    for pair in headers:
        column_name, type_tag = pair
        spark_dtype = LongType() if type_tag in long_tags else StringType()
        field_name = column_name if column_name != '' else type_tag
        fields.append(StructField(field_name, spark_dtype, True))
    return StructType(fields)


def read_data_w_sep_headers(name_data: str = "dblp_www"):
    """Load ``<name_data>.csv`` with the schema described by ``<name_data>_header.csv``.

    The column labels of the header file are read with pandas, each label is
    split on ``:`` and the resulting pieces are turned into a Spark schema by
    ``_create_schema``. The data file is then read with that schema. Both
    files are ``;``-delimited and are looked up under ``root_directory``.

    Returns the resulting Spark DataFrame.
    """
    header_path = f"{root_directory}/{name_data}_header.csv"
    data_path = f"{root_directory}/{name_data}.csv"

    raw_labels = pd.read_csv(header_path, delimiter=";").columns
    schema = _create_schema(str(label).split(":") for label in raw_labels)

    reader = spark.read.schema(schema).options(delimiter=";")
    return reader.csv(data_path)

def read_data_w_embedded_headers(name_data: str = "dblp_www"):
    """Load ``<name_data>.csv`` whose first line carries the column definitions.

    pandas reads the column labels (plus at most one data row) from the file,
    each label is split on ``:`` and passed to ``_create_schema``. Spark then
    reads the same ``;``-delimited file with that schema, with the
    ``header`` option set to ``"true"``. The file is looked up under
    ``root_directory``.

    Returns the resulting Spark DataFrame.
    """
    data_path = f"{root_directory}/{name_data}.csv"

    raw_labels = pd.read_csv(data_path, delimiter=";", nrows=1).columns
    schema = _create_schema(str(label).split(":") for label in raw_labels)

    reader = spark.read.schema(schema).options(delimiter=";", header="true")
    return reader.csv(data_path)


def create_fake_email(df: pyspark.sql.DataFrame):
    """Add a ``cleaned_name`` and a synthetic ``email`` column derived from ``name``.

    ``cleaned_name`` is the lower-cased ``name`` with every character that is
    not an ASCII letter, digit or whitespace removed. ``email`` is built from
    ``cleaned_name`` by stripping leading/trailing whitespace, turning each run
    of whitespace into a single ``.`` and appending ``@gmail.com``.

    Args:
        df: DataFrame that contains a string column called ``name``.

    Returns:
        ``df`` with the two extra columns ``cleaned_name`` and ``email``.
    """
    # Raw strings keep "\s" intact for the regex engine; in a normal string
    # literal it is an invalid Python escape sequence.
    cleaned_name = F.regexp_replace(F.lower("name"), r"[^a-zA-Z0-9\s]", "")

    # Removing punctuation can leave repeated or edge whitespace behind
    # (e.g. "a - b" -> "a  b"); collapse it so the address never contains
    # consecutive, leading or trailing dots.
    trimmed_name = F.regexp_replace("cleaned_name", r"^\s+|\s+$", "")
    local_part = F.regexp_replace(trimmed_name, r"\s+", ".")

    df = (
        df
        .withColumn("cleaned_name", cleaned_name)
        .withColumn("email", F.concat(local_part, lit("@gmail.com")))
    )

    return df


    

In [5]:
# Tables whose column definitions live in a separate "<name>_header.csv" file.
# The targets on the left are assigned in the same order as the names below.
(
    article,
    book,
    data,
    incollection,
    inproceedings,
    mastersthesis,
    phdthesis,
    proceedings,
    www,
) = (
    read_data_w_sep_headers(name_data=table_name)
    for table_name in (
        "dblp_article",
        "dblp_book",
        "dblp_data",
        "dblp_incollection",
        "dblp_inproceedings",
        "dblp_mastersthesis",
        "dblp_phdthesis",
        "dblp_proceedings",
        "dblp_www",
    )
)


In [6]:
# Tables whose first line already contains the column definitions.
# The targets on the left are assigned in the same order as the names below.
(
    author,
    authored_by,
    journal,
    journal_published_in,
) = (
    read_data_w_embedded_headers(name_data=table_name)
    for table_name in (
        "dblp_author",
        "dblp_author_authored_by",
        "dblp_journal",
        "dblp_journal_published_in",
    )
)


In [7]:
# Keep each publication table next to its label, then expose the two parallel
# lists (`tables` and `name_tables`) in matching order.
_named_tables = [
    ("article", article),
    ("book", book),
    ("data", data),
    ("incollection", incollection),
    ("inproceedings", inproceedings),
    ("mastersthesis", mastersthesis),
    ("phdthesis", phdthesis),
    ("proceedings", proceedings),
    ("www", www),
]
tables = [table for _, table in _named_tables]
name_tables = [label for label, _ in _named_tables]

#### Creation of authors data

In [8]:

# One row per distinct author name. The "author" column is split on "|" and
# exploded so every piece becomes its own row.
# The pattern is a raw string: "\|" in a normal string literal is an invalid
# Python escape sequence that only works because Python currently tolerates it.
authors = (
    article
    .withColumn("name", F.explode(F.split("author", r"\|")))
    .select("name")
    .distinct()
)
authors_data = create_fake_email(df=authors)

#### Creation of article data

In [9]:
# Project the article table onto the columns kept downstream, renaming the
# key column to "id" and the author column to "author_name".
_article_columns = [
    F.col("article").alias("id"),
    F.col("author").alias("author_name"),
    F.col("journal"),
    F.col("title"),
    F.col("url"),
    F.col("volume"),
    F.col("year"),
]
article_data = article.select(*_article_columns)

#### Creation of journal data

In [10]:
# Distinct journal values that appear in the article table.
journal_data = article.select("journal").distinct()

In [11]:
# Rank the 200 journals with the most distinct articles. Rank 1 is the journal
# with the highest article count: the window must sort descending, otherwise
# the smallest of the 200 selected journals would receive rank 1.
# The journal name is a secondary sort key so that ties are broken the same
# way on every run, both when picking the 200 journals and when numbering them.
w = Window.orderBy(col("n_articles").desc(), col("journal"))
journal_rank = (
    article
    .groupBy("journal")
    .agg(F.countDistinct("article").alias("n_articles"))
    .orderBy(col("n_articles").desc(), col("journal"))
    .limit(200)
    .withColumn("rank", F.row_number().over(w))
    .select("rank", "journal")
)

In [12]:
# Left join keeps every journal; journals outside the ranked set get a null rank.
journal_data_w_rank = journal_data.join(
    journal_rank,
    on="journal",
    how="left",
)

#### Creation of time data

In [13]:
# Distinct publication years found in the article table.
time = article.select("year").distinct()

### Creation of conferences information

In [14]:
# Derive conference information from the proceedings "url" column.
# The url is split on "/"; element 1 is used as the record type and element 2
# as the conference name. Only rows whose type is "conf" are kept.
# "/" needs no escaping in a regex, and the previous "\/" literal was an
# invalid Python escape sequence, so the plain separator is used here.
_proceedings_url_parts = F.split("url", "/")
proceedings_ = (
    proceedings
    .withColumn("type", _proceedings_url_parts[1])
    .withColumn("conference_name", _proceedings_url_parts[2])
    # An edition is identified as "<conference_name>-<year>".
    .withColumn("edition", F.concat_ws("-", "conference_name", "year"))
    .filter(col("type") == 'conf')
    .select("type", "conference_name", "edition", "editor", "year")
)

In [15]:
# Derive conference information from the inproceedings "url" column.
# The url is split on "/"; element 1 is used as the record type and element 2
# as the conference name. Only rows whose type is "conf" are kept.
# "/" needs no escaping in a regex, and the previous "\/" literal was an
# invalid Python escape sequence, so the plain separator is used here.
_inproceedings_url_parts = F.split("url", "/")
inproceedings_ = (
    inproceedings
    .withColumn("type", _inproceedings_url_parts[1])
    .withColumn("conference_name", _inproceedings_url_parts[2])
    # An edition is identified as "<conference_name>-<year>".
    .withColumn("edition", F.concat_ws("-", "conference_name", "year"))
    .filter(col("type") == 'conf')
    .select("type", "conference_name", "edition", "editor", "year")
)

In [16]:
# Stack the conference rows from both sources. Columns are matched by name
# rather than by position, so a future change in either select list cannot
# silently put values under the wrong column.
conference_information = proceedings_.unionByName(inproceedings_)

In [17]:
# Pool of host cities used to assign a random location to each conference
# edition. The list has 50 entries and intentionally keeps its repeated
# entries: the number of entries determines the range of city numbers.
cities = [
    "New York City, USA",
    "London, UK",
    "Tokyo, Japan",
    "Paris, France",
    "Los Angeles, USA",
    "Beijing, China",
    "Moscow, Russia",
    "Istanbul, Turkey",
    "Sao Paulo, Brazil",
    "Cairo, Egypt",
    "Mumbai, India",
    "Mexico City, Mexico",
    "Seoul, South Korea",
    "Jakarta, Indonesia",
    "Karachi, Pakistan",
    "Buenos Aires, Argentina",
    "Delhi, India",
    "Shanghai, China",
    "Manila, Philippines",
    "Dhaka, Bangladesh",
    "Moscow, Russia",
    "Istanbul, Turkey",
    "Tianjin, China",
    "Rio de Janeiro, Brazil",
    "Lagos, Nigeria",
    "Lima, Peru",
    "Bangkok, Thailand",
    "Jakarta, Indonesia",
    "Cairo, Egypt",
    "Bogota, Colombia",
    "Kinshasa, Democratic Republic of the Congo",
    "Seoul, South Korea",
    "Dhaka, Bangladesh",
    "Karachi, Pakistan",
    "Tokyo, Japan",
    "Manila, Philippines",
    "Guangzhou, China",
    "Mumbai, India",
    "Istanbul, Turkey",
    "Moscow, Russia",
    "Sao Paulo, Brazil",
    "Beijing, China",
    "Lahore, Pakistan",
    "Shenzhen, China",
    "Chongqing, China",
    "Chengdu, China",
    "Lahore, Pakistan",
    "Kinshasa, Democratic Republic of the Congo",
    "Bangalore, India",
    "Taipei, Taiwan",
]


In [18]:
# Turn the city list into a one-column DataFrame and number the rows by
# alphabetical city order. Note that `cities` is rebound from the Python list
# to this DataFrame, so this cell cannot be re-run without re-running the
# cell that defines the list.
w = Window.orderBy("city")
_city_rows = [(city_name,) for city_name in cities]
_city_frame = spark.createDataFrame(_city_rows, ["city"])
cities = _city_frame.withColumn("number", F.row_number().over(w))

In [19]:
# Assign a random city to every conference row, then keep one row per edition.
# The city numbers come from row_number() and therefore start at 1, so the
# random key must cover 1..N. The previous floor(rand() * 50) produced 0..49,
# which left key 0 without a city and never selected the last city.
# N is taken from the city table itself instead of a hard-coded 50.
_n_cities = cities.count()
conference_information_w_random_city = (
    conference_information
    .withColumn("number", F.floor(F.rand() * _n_cities) + 1)
    .join(F.broadcast(cities), on="number", how="left")
    .dropDuplicates(['edition'])
)

#### Create conference data

In [20]:
# Distinct conference names.
conference_data = (
    conference_information_w_random_city
    .select("conference_name")
    .distinct()
)

#### Create edition data

In [21]:
# One row per edition: owning conference, edition identifier and host city.
_edition_columns = [
    F.col("conference_name"),
    F.col("edition").alias("conference_edition"),
    F.col("city"),
]
edition_data = conference_information_w_random_city.select(*_edition_columns)

#### Create city data

In [22]:
# City table to persist. The source city list contains repeated entries, so
# duplicates are removed after the helper "number" column is dropped.
city_data = cities.drop("number").dropDuplicates()

#### Create random link between article data and conference edition

In [23]:
# Randomly link each article to a conference edition of the same year.
# Both sides receive a random key rounded to 3 decimals and are joined on
# (key, year); articles without a matching edition keep a null edition.
conf_info_random = (
    conference_information_w_random_city
    .withColumn("number", F.round(F.rand(), 3))
    .select("edition", "year", "number")
)
article_data_w_conf_info = (
    article_data
    .withColumn("number", F.round(F.rand(), 3))
    .join(conf_info_random, on=["number", "year"], how="left")
    .drop("number")
    # Several editions of one year can share the same random key, in which
    # case the join repeats the article once per edition. Keep a single row
    # per article so the persisted article table has no duplicated ids.
    # NOTE(review): assumes one edition per article is intended - confirm.
    .dropDuplicates(["id"])
)

In [24]:
# Reviewers are derived from the editors of each edition: the "editor" column
# is split on "|" and exploded, giving distinct (reviewer, edition) pairs.
# The pattern is a raw string because "\|" in a normal string literal is an
# invalid Python escape sequence.
reviewers_data = (
    conference_information_w_random_city
    .select(F.explode(F.split("editor", r"\|")).alias("reviewer"), "edition")
    .dropDuplicates()
)

#### Create random link between reviewers and articles

In [25]:
# Give articles and reviewers a coarse random key (rand()/2 rounded to one
# decimal) and pair them when they share both the edition and the key.
# The left join keeps articles that found no reviewer.
_articles_keyed = (
    article_data_w_conf_info
    .withColumn("number", F.round(F.rand() / 2, 1))
    .select("ID", "edition", "number")
)
_reviewers_keyed = reviewers_data.withColumn("number", F.round(F.rand() / 2, 1))
link_reviewer_article = _articles_keyed.join(
    _reviewers_keyed,
    on=["edition", "number"],
    how="left",
)

In [26]:
# Keep only the article/reviewer pair; the join keys are no longer needed.
_link_columns = ["ID", "reviewer"]
link_reviewer_article = link_reviewer_article.select(*_link_columns)

### Persist to disk

In [27]:
# Persist every derived table as parquet, overwriting any previous export.
# Each entry pairs a DataFrame with its destination; writes run in list order.
_parquet_exports = [
    (authors_data, "../temporal_zone/authors_data"),
    (article_data_w_conf_info, "../temporal_zone/article_data"),
    (journal_data_w_rank, "../temporal_zone/journal_data_w_rank"),
    (time, "../temporal_zone/time"),
    (conference_data, "../temporal_zone/conference_data"),
    (edition_data, "../temporal_zone/edition_data"),
    (city_data, "../temporal_zone/city_data"),
    (reviewers_data, "../temporal_zone/reviewers_data"),
    (link_reviewer_article, "../temporal_zone/link_reviewer_article"),
]
for _frame, _target in _parquet_exports:
    _frame.write.mode("overwrite").parquet(_target)


24/04/09 19:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/09 19:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/09 19:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/09 19:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/09 19:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/09 19:32:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/09 19:32:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/09 19:32:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/09 19:32:29 WARN RowBasedKeyValueBatch: Calling spill() on

In [None]:
import shutil
import os

# Parquet datasets written by the previous cell; each one is re-exported as a
# single CSV file named "<path>.csv" next to it.
file_paths = [
    "../temporal_zone/authors_data",
    "../temporal_zone/article_data",
    "../temporal_zone/journal_data_w_rank",
    "../temporal_zone/time",
    "../temporal_zone/conference_data",
    "../temporal_zone/edition_data",
    "../temporal_zone/city_data",
    "../temporal_zone/reviewers_data",
    "../temporal_zone/link_reviewer_article"
]

for file_path in file_paths:
    tmp_dir = f"{file_path}_tmp"

    # Spark writes a directory of part files; coalesce(1) reduces the output
    # to a single partition so there is one CSV part file to pick up.
    spark.read.parquet(file_path).coalesce(1).write.mode("overwrite").csv(tmp_dir)

    # Locate the CSV part file inside the temporary directory.
    part_files = sorted(name for name in os.listdir(tmp_dir) if name.endswith(".csv"))
    if not part_files:
        # Fail with a clear message instead of passing a non-existent glob
        # pattern to the rename call. The temporary directory is left in
        # place so it can be inspected.
        raise FileNotFoundError(f"No CSV part file was written to {tmp_dir!r}")
    csv_file_path = os.path.join(tmp_dir, part_files[0])

    # os.replace overwrites an existing destination file on every platform,
    # so re-running the cell does not fail where os.rename would.
    os.replace(csv_file_path, f"{file_path}.csv")

    # Delete the temporary folder
    shutil.rmtree(tmp_dir)