In [None]:
import os
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.functions import array
from pyspark.sql.types import FloatType
from pyspark.sql.functions import expr

from utilities import save_dataset

In [None]:
# Directory (relative to the notebook's working directory) that holds the
# input file and receives the transformed output.
WORK_DIRECTORY = "data"
# Name of the input file inside WORK_DIRECTORY; it is read with
# spark.read.csv using ";" as separator and a header row.
DATA_FILE_NAME = "leukemia_met"
# Name inside WORK_DIRECTORY under which the transformed dataset is saved.
DATA_FILE_NAME_NEW = "leukemia_met_trans"

In [None]:
# Create (or reuse) the local Spark session used by the whole notebook.
# Memory settings are generous because the pivot below produces a very wide
# table; adjust them to the machine the notebook runs on.
spark = (
    SparkSession.builder.master("local")
    .appName("Colab")
    .config("spark.driver.memory", "32g")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "1")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "100g")
    .getOrCreate()
)

# The log level is not a Spark configuration key: passing
# "spark.sparkContext.setLogLevel" to .config() is silently ignored.
# It has to be set on the SparkContext of the running session instead.
spark.sparkContext.setLogLevel("ERROR")

# Raise the limit on distinct pivot values so that pivot() without an explicit
# value list accepts up to 500000 output columns.
spark.conf.set("spark.sql.pivotMaxValues", 500000)

In [None]:
# Load the raw input file into a Spark DataFrame.
# The file is ";"-separated, its first row holds the column names, and the
# column types are inferred by Spark from the data.
df = spark.read.options(
    header=True,
    inferSchema=True,
    sep=";",
).csv(os.path.join(WORK_DIRECTORY, DATA_FILE_NAME))

In [None]:
class DataTranspose:
    """
    Transpose a wide Spark DataFrame in two steps (unpivot, then pivot).

    The names of the selected value columns become the rows of the result
    (column ``s``) and the distinct values of the ``sample_id`` column become
    its columns.

    Intended use, per the original description: turn data shaped
    {index: methylation_site, cols: sample_id} into data shaped
    {index: sample_id, cols: methylation_site}.

    Attributes:
        data: the current DataFrame; replaced by each transformation step.
        columns: names of the value columns that get unpivoted.

    Methods:
    1. unpivot_data: transform the 2D (wide) dataset into a 1D (long) dataset.
    2. pivot_data: transform the 1D (long) dataset into a 2D (wide) dataset.

    """

    def __init__(self, data):
        """Store ``data`` and select the value columns to transpose.

        Args:
            data: object exposing ``columns`` (a list of column names); the
                transformation methods additionally need a Spark DataFrame
                that contains a ``sample_id`` column.
        """
        self.data = data
        # Every column except the first and the last one is a value column.
        # NOTE(review): presumably the first column is the "sample_id" key and
        # the last one is a trailing column that must not be transposed --
        # confirm against the input file.
        self.columns = data.columns[1:-1]

    @staticmethod
    def _stack_expression(columns):
        """Build the Spark SQL ``stack`` expression that unpivots ``columns``.

        Each column contributes a (name literal, column reference) pair. The
        reference is backtick-quoted so that names containing characters such
        as ``-``, ``.`` or spaces are read as a single column instead of being
        parsed as an expression.

        Args:
            columns: iterable of column names.

        Returns:
            str: expression of the form
            ``stack(N, 'c1', `c1`, ..., 'cN', `cN`) as (s,v)``.

        Raises:
            ValueError: if ``columns`` is empty (``stack`` needs at least one
                column pair).
        """
        names = list(columns)
        if not names:
            raise ValueError("no value columns to unpivot")

        pairs = []
        for name in names:
            # Escape for a single-quoted SQL string literal.
            literal = name.replace("\\", "\\\\").replace("'", "\\'")
            # A literal backtick inside a quoted identifier is doubled.
            identifier = name.replace("`", "``")
            pairs.append("'{}', `{}`".format(literal, identifier))

        return "stack({}, {}) as (s,v)".format(len(names), ", ".join(pairs))

    def unpivot_data(self):
        """Replace ``self.data`` by its long form (``sample_id``, ``s``, ``v``).

        ``s`` holds the original column name and ``v`` the cell value cast to
        double; rows whose value is null are dropped before the cast.

        NOTE(review): Spark's ``stack`` presumably requires all value columns
        to share one type; with ``inferSchema`` a mix of integer and double
        columns may be rejected -- confirm on the real data.
        """
        unpivot_expr = self._stack_expression(self.columns)
        long_data = self.data.select("sample_id", expr(unpivot_expr)).where(
            "v is not null"
        )
        self.data = long_data.withColumn("v", long_data["v"].cast("double"))

    def pivot_data(self):
        """Replace ``self.data`` by its wide form.

        Groups the long data by ``s`` and creates one column per distinct
        ``sample_id`` value, filled with the maximum ``v`` of that group.
        Expects ``unpivot_data`` to have been called first.
        """
        self.data = self.data.groupBy("s").pivot("sample_id").max("v")

In [None]:
# Transform the loaded frame in two steps: unpivot the value columns into
# (s, v) rows, then pivot so that each distinct "sample_id" value becomes a
# column. Both calls replace data_tr.data and return nothing.
data_tr = DataTranspose(df)
data_tr.unpivot_data()
data_tr.pivot_data()

In [None]:
# Get the transformed dataset.
# NOTE: this rebinds `df`, so the originally loaded frame is no longer
# reachable under that name; re-run the load cell before re-running the
# transform cell.
df = data_tr.data

In [None]:
# Save the transformed dataset under WORK_DIRECTORY/DATA_FILE_NAME_NEW.
# NOTE(review): presumably written as CSV by utilities.save_dataset -- the
# helper is not visible here, confirm the output format there.
save_dataset(df, os.path.join(WORK_DIRECTORY, DATA_FILE_NAME_NEW))