In [43]:
# Release cached data and shut down a session left over from an earlier run.
# Guarded so that "Restart Kernel -> Run All" does not fail with NameError:
# on a fresh kernel this cell executes before `spark` has been created.
if "spark" in globals():
    spark.catalog.clearCache()
    spark.stop()

In [44]:
import findspark
findspark.init()

# Spark and Python library imports
import pandas
import pyarrow
from pyspark.sql.functions import pandas_udf, udf
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType
import pyspark.sql.functions as F
from pyspark.sql.functions import when
from pyspark import Row
from pyspark.sql.window import Window
from pyspark import StorageLevel

import re
import subprocess

# Build (or reuse) the session for the gVCF merge job on the standalone master.
spark = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("gVCF_combine")
    .config("spark.executor.memory", "24G")
    # The key is "spark.executor.cores" (plural). The previous spelling,
    # "spark.executor.core", is not a Spark setting and was silently ignored.
    .config("spark.executor.cores", "3")
    .config("spark.sql.shuffle.partitions", 20)
    .config("spark.driver.memory", "8G")
    .config("spark.driver.maxResultSize", "8G")
    .config("spark.sql.execution.arrow.enabled", "true")
    .getOrCreate()
)

In [45]:
def preVCF(hdfs, flag, spark):
    """Load one VCF/gVCF text file into a Spark DataFrame.

    Parameters
    ----------
    hdfs : str
        Path/URI of the file, passed to ``sparkContext.textFile``.
    flag : int
        When ``flag >= 1``, every one of the first nine columns except
        ``#CHROM``, ``POS`` and ``REF`` is renamed with a ``"_temp"`` suffix.
        The caller uses this so the frame can be joined to another one on
        those three columns without name clashes.
    spark : SparkSession
        Session used to read the file.

    Returns
    -------
    DataFrame
        One row per data line, one column per tab-separated field named after
        the ``#CHROM`` header line, with ``POS`` cast to integer and all other
        columns left as strings.

    Notes
    -----
    ``first()`` fails if the file contains no line starting with ``#CHROM``.
    """
    # Columns that keep their name because the caller joins on them.
    join_keys = ("#CHROM", "POS", "REF")

    vcf = spark.sparkContext.textFile(hdfs)

    # The single "#CHROM ..." header line carries the column names.
    col_name = vcf.filter(lambda x : x.startswith("#CHROM")).first().split("\t")

    # Data lines: the regex keeps lines whose first two characters are both
    # not "#", which excludes the "##" meta lines and the "#CHROM" header.
    vcf_data = vcf.filter(lambda x : re.match("[^#][^#]", x))\
                       .map(lambda x : x.split("\t"))\
                       .toDF(col_name)\
                       .withColumn("POS", F.col("POS").cast(IntegerType()))

    if flag >= 1:
        # Iterate over a snapshot of the original names; renaming by name
        # leaves the remaining columns (index 9 onwards) untouched.
        for original in vcf_data.columns[:9]:
            if original not in join_keys:
                vcf_data = vcf_data.withColumnRenamed(original, original + "_temp")

    return vcf_data

def hadoop_list(length, hdfs):
    """List the entries of an HDFS directory via ``hdfs dfs -ls``.

    Parameters
    ----------
    length : int
        Maximum number of paths to return (applied as a slice).
    hdfs : str
        HDFS path to list.

    Returns
    -------
    list of bytes
        Up to ``length`` paths, in the order printed by ``hdfs dfs -ls``.
        Entries are ``bytes`` (callers decode them).

    Raises
    ------
    RuntimeError
        If the ``hdfs`` command exits with a non-zero status.
    """
    # Pass an argument list instead of a shell string: no quoting/injection
    # issues with the path, and the exit status is that of `hdfs` itself
    # rather than of a trailing `awk` in a pipeline.
    proc = subprocess.Popen(
        ["hdfs", "dfs", "-ls", hdfs],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    s_output, s_err = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(
            "hdfs dfs -ls {} failed: {}".format(
                hdfs, s_err.decode("UTF-8", "replace").strip()
            )
        )

    all_dart_dirs = []
    for line in s_output.splitlines():
        # The path is the 8th whitespace-separated field. Limiting the split
        # keeps paths that contain spaces intact. Shorter lines (such as the
        # "Found N items" banner) carry no path and are skipped.
        fields = line.split(None, 7)
        if len(fields) == 8:
            all_dart_dirs.append(fields[7].rstrip())

    return all_dart_dirs[:length]

def selectNotNull(left, right):
    """Return ``left`` unless it is ``None``, in which case return ``right``.

    Only ``None`` triggers the fallback; other falsy values such as ``""``
    are returned as-is.
    """
    # Identity check: `== None` can be fooled by objects overriding __eq__.
    if left is None:
        return right
    return left
    
def qual_filter(none = None):
    """Return the constant string "."; the optional argument is ignored."""
    placeholder = "."
    return placeholder

# Spark SQL wrappers around the two plain-Python helpers; both yield strings.
selectNotNull_u = udf(selectNotNull, StringType())
qual_filter_u = udf(qual_filter, StringType())

In [46]:
# main
# Merge the listed gVCF files one at a time: each step full-joins the next
# file onto the running result, which is cached in gvcf_combine_result.
hdfs = "hdfs://master:9000"
hdfs_list = hadoop_list(2, "/raw_data/gvcf")
gvcf_list = []
sample_name = []
# Not used in this cell; kept in case other cells reference them.
sample_name_update =[]
sample_list = []
gvcf_combine_result = []

# Load every file. The first keeps its column names (flag 0); later ones get
# the "_temp" suffix on their non-key fixed columns (flag 1) so they can be
# joined onto the running result without name clashes.
for index, raw_path in enumerate(hdfs_list):
    gvcf = preVCF(hdfs + raw_path.decode("UTF-8"), 0 if index == 0 else 1, spark)
    # The last column of each file is recorded as that file's sample column.
    sample_name.append(gvcf.columns[-1])
    gvcf_list.append(gvcf)

# window
w = Window.partitionBy("#CHROM").orderBy("POS")
# Same ordering, framed from the start of the chromosome to the current row;
# used below to carry the last non-null sample value forward.
sample_w = w.rangeBetween(Window.unboundedPreceding, Window.currentRow)

join_keys = ["#CHROM", "POS", "REF"]
temp_columns = ["INFO_temp", "ID_temp", "ALT_temp", "FORMAT_temp", "QUAL_temp", "FILTER_temp"]

for index in range(1, len(hdfs_list)):
    # The first merge starts from the first file; later merges start from the
    # previous merge result.
    merged_so_far = gvcf_list[0] if index == 1 else gvcf_combine_result[index - 2]

    temp = (
        merged_so_far.join(gvcf_list[index], join_keys, "full")
        # Built-in coalesce/lit replace the Python UDFs: same values, but
        # evaluated inside the JVM without a Python round-trip per row.
        .withColumn("ID", F.coalesce(F.col("ID"), F.col("ID_temp")))
        .withColumn("ALT", F.coalesce(F.col("ALT"), F.col("ALT_temp")))
        .withColumn("FORMAT", F.coalesce(F.col("FORMAT"), F.col("FORMAT_temp")))
        .withColumn("QUAL", F.lit("."))
        .withColumn("FILTER", F.lit("."))
        # Keep whichever INFO does not start with "END"; when neither
        # qualifies, write (next POS on the chromosome - 1). The cast makes
        # explicit the int -> string coercion Spark otherwise applies here.
        # NOTE(review): the fallback writes the bare number, not "END=<n>";
        # confirm that this is what downstream consumers expect.
        .withColumn(
            "INFO",
            when(~F.col("INFO").startswith("END"), F.col("INFO"))
            .when(~F.col("INFO_temp").startswith("END"), F.col("INFO_temp"))
            .otherwise((F.lead("POS", 1).over(w) - 1).cast("string")),
        )
    )

    temp = temp.drop(*temp_columns)

    # Rows that came from only one side of the full join have nulls in the
    # other samples' columns; fill them with the last non-null value seen
    # earlier on the same chromosome.
    for name in range(index + 1):
        temp = temp.withColumn(sample_name[name], F.last(sample_name[name], ignorenulls=True).over(sample_w))

    gvcf_combine_result.append(temp.orderBy(F.col("#CHROM"), F.col("POS")).cache())
    # count() forces evaluation so the cache is populated before the previous
    # step's cache is released.
    gvcf_combine_result[index - 1].count()

    if index != 1:
        gvcf_combine_result[index - 2].unpersist()

In [58]:
# Preview the first merge result: position columns plus up to three sample columns.
first_merge = gvcf_combine_result[0]
preview_columns = ["#CHROM", "POS"] + sample_name[:3]
first_merge.select([column for column in first_merge.columns if column in preview_columns]).show()

+------+-----+--------------------+----------------+
|#CHROM|  POS|             ND02798|         ND02809|
+------+-----+--------------------+----------------+
|  chr1|    1|     0/0:0:0:0:0,0,0| 0/0:0:0:0:0,0,0|
|  chr1|10082|    0/0:2:5:2:0,6,49| 0/0:0:0:0:0,0,0|
|  chr1|10099|    0/0:2:5:2:0,6,49|0/0:2:5:2:0,6,49|
|  chr1|10107|    0/0:2:5:2:0,6,49| 0/0:0:0:0:0,0,0|
|  chr1|10109|     0/0:1:0:0:0,0,0| 0/0:0:0:0:0,0,0|
|  chr1|10159|   0/0:4:11:2:0,6,49| 0/0:0:0:0:0,0,0|
|  chr1|10178|     0/0:1:0:0:0,0,0| 0/0:0:0:0:0,0,0|
|  chr1|10385|   0/0:4:12:2:0,6,74| 0/0:0:0:0:0,0,0|
|  chr1|10398|     0/0:2:0:0:0,0,0| 0/0:0:0:0:0,0,0|
|  chr1|10462|    0/0:4:9:2:0,6,60| 0/0:0:0:0:0,0,0|
|  chr1|10501|   0/0:6:0:6:0,0,117| 0/0:0:0:0:0,0,0|
|  chr1|10502| 0/0:6:15:6:0,12,180| 0/0:0:0:0:0,0,0|
|  chr1|10511| 0/0:7:21:7:0,21,227| 0/0:0:0:0:0,0,0|
|  chr1|10515| 0/0:6:18:6:0,18,213| 0/0:0:0:0:0,0,0|
|  chr1|10516|0/0:15:36:8:0,21,315| 0/0:0:0:0:0,0,0|
|  chr1|10560| 0/0:13:0:13:0,0,333| 0/0:0:0:0: