In [13]:
import findspark
findspark.init()

# Spark & python function
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType
import pyspark.sql.functions as F
from pyspark import Row
from pyspark.sql.window import Window

import re

# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder.master("spark://master:7077")
    .appName("gVCF_combine")
    .config("spark.executor.memory", "18G")
    # The property name is "spark.executor.cores" (plural); the previous
    # "spark.executor.core" key is not a recognized setting, so the
    # per-executor core count was never applied.
    .config("spark.executor.cores", "3")
    .config("spark.sql.shuffle.partitions", 20)
    .config("spark.driver.memory", "13G")
    .config("spark.driver.maxResultSize", "10G")
    .getOrCreate()
)

In [12]:
# Clear the session's cache, then stop the Spark session.
# NOTE(review): this cell's execution count (12) precedes the session-creation
# cell (13); on a fresh top-to-bottom run it would stop the session before the
# cells below use it -- confirm whether it belongs at the end of the notebook.
spark.catalog.clearCache()
spark.stop()

In [14]:
def preVCF(hdfs, flag, spark):
    """Load a VCF/gVCF text file into a Spark DataFrame prepared for joining.

    Parameters
    ----------
    hdfs : str
        Path/URI of the VCF text file, passed to ``sparkContext.textFile``.
    flag : int
        When equal to 1, every fixed VCF column that is not a join key
        ("#CHROM", "POS", "REF") is renamed with a "_temp" suffix so it does
        not collide with the same column of the other join side. Any other
        value leaves the column names untouched.
    spark : SparkSession
        Active session used to read the file.

    Returns
    -------
    DataFrame
        One row per data record, columns named after the "#CHROM" header
        line, "POS" cast to integer, "QUAL" and "FILTER" dropped.
    """
    vcf = spark.sparkContext.textFile(hdfs)

    # The "#CHROM ..." header line supplies the column names.
    col_name = vcf.filter(lambda x: x.startswith("#CHROM")).first().split("\t")

    # Keep only lines whose first two characters are not '#', split them on
    # tabs, and drop the QUAL / FILTER columns. The chain is parenthesized:
    # the previous backslash continuation had trailing whitespace after one
    # of the backslashes, which is a syntax error.
    vcf_data = (
        vcf.filter(lambda x: re.match("[^#][^#]", x))
        .map(lambda x: x.split("\t"))
        .toDF(col_name)
        .withColumn("POS", F.col("POS").cast(IntegerType()))
        .drop(F.col("QUAL"))
        .drop(F.col("FILTER"))
    )

    # Rename columns depending on flag. Columns are selected by name rather
    # than by position: once QUAL/FILTER are dropped only seven fixed columns
    # remain, so a positional slice of nine would also rename sample columns.
    if flag == 1:
        fixed_cols = ("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT")
        join_keys = ("#CHROM", "POS", "REF")
        for name in vcf_data.columns:
            if name in fixed_cols and name not in join_keys:
                vcf_data = vcf_data.withColumnRenamed(name, name + "_temp")

    return vcf_data

def endRecalc(pos, left, right, flag=None):
    """Pick or recompute the INFO value for a merged row.

    Parameters
    ----------
    pos : int
        Position of the merged row.
    left, right : str or None
        INFO values of the two join sides. ``None`` is accepted (a full outer
        join leaves the missing side null) and is treated as "not an END=
        value".
    flag : {"left", "right", None}
        Which side to return when at least one side is not an "END=" value;
        anything other than "right" selects the left side.

    Returns
    -------
    str or None
        * the selected side's INFO unchanged when either side does not start
          with "END=",
        * "." when ``pos`` equals either END value,
        * otherwise "END=<smaller of the two END values>".
    """
    def _is_end(info):
        # None-safe replacement for info.startswith("END=").
        return info is not None and info.startswith("END=")

    if not (_is_end(left) and _is_end(right)):
        return right if flag == "right" else left

    left_end = int(left.replace("END=", ""))
    right_end = int(right.replace("END=", ""))
    if pos == left_end or pos == right_end:
        return "."
    return "END=" + str(min(left_end, right_end))
    
def selectCol(row, sample):
    """Collapse one joined row into a single output record.

    Parameters
    ----------
    row : Row
        A row of the joined frame. It is read by name ("ID", "ID_temp",
        "REF", "ALT", "ALT_temp", "POS", "INFO", "INFO_temp", "FORMAT",
        "FORMAT_temp", plus the sample columns) and by position: the first
        two fields are copied as-is and the last field is used as the final
        sample value.
    sample : list of str
        Names of the sample columns, in output order.

    Returns
    -------
    tuple
        (first two row fields, ID, REF, ALT, ".", ".", INFO, FORMAT,
        sample values...).
    """
    if row["ID_temp"] is None:
        # No right-hand ID: take the left side's fields.
        fixed = (row["ID"], row["REF"], row["ALT"], ".", ".",
                 endRecalc(row["POS"], row["INFO"], row["INFO_temp"], "left"),
                 row["FORMAT"])
    elif row["ID"] is None:
        # No left-hand ID: take the right side's ("_temp") fields.
        fixed = (row["ID_temp"], row["REF"], row["ALT_temp"], ".", ".",
                 endRecalc(row["POS"], row["INFO"], row["INFO_temp"], "right"),
                 row["FORMAT_temp"])
    else:
        fixed = (row["ID"], row["REF"], row["ALT"], ".", ".",
                 endRecalc(row["POS"], row["INFO"], row["INFO_temp"]),
                 row["FORMAT"])

    return_row = row[:2] + fixed

    if sample:
        # Every sample but the last is looked up by name...
        for sample_col in sample[:-1]:
            return_row += (row[sample_col],)
        # ...and the last one is taken positionally, exactly once. This append
        # used to sit inside the loop, which repeated the last sample after
        # every other sample as soon as more than two samples were passed.
        return_row += (row[-1],)
    return return_row

#def selectCol_UDF()

In [15]:
# main
# Load both inputs; flag=1 makes preVCF rename the right-hand columns that
# would otherwise collide with the left-hand ones in the join.
left = preVCF("hdfs://master:9000/raw_data/gvcf/ND02798_eg.raw.vcf", 0, spark)
right = preVCF("hdfs://master:9000/raw_data/gvcf/ND02809_eg.raw.vcf", 1, spark)

join_vcf = left.join(right, ["#CHROM", "POS", "REF"], "full")

# window: for each row, everything from the start of its #CHROM partition up
# to the current POS -- used below to carry the last non-null value forward.
lookup_window = Window.partitionBy("#CHROM").orderBy("POS").rangeBetween(Window.unboundedPreceding, 0)

# Identify sample columns by name instead of by position. preVCF drops
# QUAL/FILTER, so slicing at index 9 skipped the sample columns and produced
# a header narrower than the 9 + N values selectCol emits per row.
vcf_fixed_cols = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
renamed_fixed_cols = [name + "_temp" for name in vcf_fixed_cols]
sample_col = [c for c in left.columns if c not in vcf_fixed_cols] \
           + [c for c in right.columns if c not in vcf_fixed_cols and c not in renamed_fixed_cols]
# selectCol always emits the nine fixed fields (QUAL and FILTER as ".").
header = vcf_fixed_cols + sample_col

# Forward-fill INFO, INFO_temp and every sample column along the window.
join_vcf_update = join_vcf
for col_name in ["INFO", "INFO_temp"] + sample_col:
    join_vcf_update = join_vcf_update.withColumn(col_name, F.last(col_name, ignorenulls=True).over(lookup_window))

# Collapse each joined row into one record and cache the result.
join_vcf_update = join_vcf_update.orderBy(F.col("#CHROM"), F.col("POS")).rdd.map(lambda row: selectCol(row, sample_col)).toDF(header).cache()

# count() materializes the cache and displays the number of merged rows.
join_vcf_update.count()

81337377

In [16]:
# Show the merged rows of a single contig.
join_vcf_update.where(F.col("#CHROM") == "chrUn_gl000244").show()

+--------------+-----+---+---+---------+----+------+---------+------------------+-----------------+-----------------+
|        #CHROM|  POS| ID|REF|      ALT|QUAL|FILTER|     INFO|            FORMAT|          ND02798|          ND02809|
+--------------+-----+---+---+---------+----+------+---------+------------------+-----------------+-----------------+
|chrUn_gl000244|    1|  .|  G|<NON_REF>|   .|     .|END=15241|GT:DP:GQ:MIN_DP:PL|  0/0:0:0:0:0,0,0|  0/0:0:0:0:0,0,0|
|chrUn_gl000244|15242|  .|  A|<NON_REF>|   .|     .|END=15283|GT:DP:GQ:MIN_DP:PL| 0/0:3:9:2:0,6,49|  0/0:0:0:0:0,0,0|
|chrUn_gl000244|15284|  .|  G|<NON_REF>|   .|     .|END=16278|GT:DP:GQ:MIN_DP:PL|  0/0:0:0:0:0,0,0|  0/0:0:0:0:0,0,0|
|chrUn_gl000244|16279|  .|  G|<NON_REF>|   .|     .|END=16286|GT:DP:GQ:MIN_DP:PL| 0/0:2:5:2:0,6,49|  0/0:0:0:0:0,0,0|
|chrUn_gl000244|16287|  .|  G|<NON_REF>|   .|     .|END=16303|GT:DP:GQ:MIN_DP:PL| 0/0:2:3:1:0,3,32|  0/0:0:0:0:0,0,0|
|chrUn_gl000244|16304|  .|  T|<NON_REF>|   .|     .|END=

In [None]:
#join_vcf.unpersist()         
#pos_window= Window.partitionBy("#CHROM").orderBy("POS").rangeBetween(Window.unboundedPreceding, -1)
#join_vcf_update.withColumn("INFO_re", F.last("POS", ignorenulls = True).over(pos_window)).show(3)

In [18]:
# Print the first 14000 rows of the merged frame.
# NOTE(review): this writes a very large text table into the notebook output;
# consider a smaller row count or a filtered view once the inspection is done.
join_vcf_update.show(14000)

+------+------+-----------+-----+--------------------+----+------+--------------------+------------------+--------------------+--------------------+
|#CHROM|   POS|         ID|  REF|                 ALT|QUAL|FILTER|                INFO|            FORMAT|             ND02798|             ND02809|
+------+------+-----------+-----+--------------------+----+------+--------------------+------------------+--------------------+--------------------+
|  chr1|     1|          .|    N|           <NON_REF>|   .|     .|           END=10081|GT:DP:GQ:MIN_DP:PL|     0/0:0:0:0:0,0,0|     0/0:0:0:0:0,0,0|
|  chr1| 10082|          .|    C|           <NON_REF>|   .|     .|           END=10098|GT:DP:GQ:MIN_DP:PL|    0/0:2:5:2:0,6,49|     0/0:0:0:0:0,0,0|
|  chr1| 10099|          .|    A|           <NON_REF>|   .|     .|           END=10106|GT:DP:GQ:MIN_DP:PL|    0/0:2:5:2:0,6,49|    0/0:2:5:2:0,6,49|
|  chr1| 10107|          .|    C|           <NON_REF>|   .|     .|           END=10108|GT:DP:GQ:MIN_DP:PL|

In [31]:
# Release the cached data of the three frames held in gvcf_list.
for list_index in (0, 1):
    gvcf_list[list_index].unpersist()
# Kept as the final bare expression so the cell still displays its result.
gvcf_list[2].unpersist()

DataFrame[#CHROM: string, POS: int, ID: string, REF: string, ALT: string, QUAL: string, FILTER: string, INFO: string, FORMAT: string, ND03490: string]

In [32]:

# Full outer join of the three frames on the shared key columns, built in two steps.
three_way_keys = ["#CHROM", "POS", "REF"]
first_two_joined = gvcf_list[0].join(gvcf_list[1], three_way_keys, "full")
all_three_joined = first_two_joined.join(gvcf_list[2], three_way_keys, "full")
all_three_joined.show()

+------+-----+---+----------+-----------+------+------+--------------------+------------------+--------------------+----------+-----------+------+------+--------------------+------------------+--------------------+----------+-----------+-------+------+--------------------+------------------+--------------------+
|#CHROM|  POS|REF|        ID|        ALT|  QUAL|FILTER|                INFO|            FORMAT|             ND02798|        ID|        ALT|  QUAL|FILTER|                INFO|            FORMAT|             ND02809|        ID|        ALT|   QUAL|FILTER|                INFO|            FORMAT|             ND03490|
+------+-----+---+----------+-----------+------+------+--------------------+------------------+--------------------+----------+-----------+------+------+--------------------+------------------+--------------------+----------+-----------+-------+------+--------------------+------------------+--------------------+
|  chr1|10385|  C|         .|  <NON_REF>|     .|     .|   