In [141]:
import findspark
findspark.init()

# Spark & python function
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType
import pyspark.sql.functions as F
from pyspark.sql.functions import when
from pyspark import Row
from pyspark.sql.window import Window
from pyspark import StorageLevel

import re
import subprocess

spark = SparkSession.builder.master("spark://master:7077")\
                        .appName("gVCF_combine")\
                        .config("spark.executor.memory", "18G")\
                        .config("spark.executor.core", "3")\
                        .config("spark.sql.shuffle.partitions",20)\
                        .config("spark.driver.memory", "13G")\
                        .config("spark.driver.maxResultSize", "10G")\
                        .getOrCreate()

In [140]:
spark.catalog.clearCache()
spark.stop()

In [142]:
def preVCF(hdfs, flag, spark):
    vcf = spark.sparkContext.textFile(hdfs)
    # drop ---> QUAL FILTER column
    header_contig = vcf.filter(lambda x : re.match("^#", x))
    col_name = vcf.filter(lambda x : x.startswith("#CHROM")).first().split("\t")
    vcf_data = vcf.filter(lambda x : re.match("[^#][^#]", x))\
                       .map(lambda x : x.split("\t"))\
                       .toDF(col_name)\
                       .withColumn("POS", F.col("POS").cast(IntegerType()))\
                       .drop(F.col("QUAL")).drop(F.col("FILTER"))
    
    if flag >= 1:
        for index in range(len(vcf_data.columns[:7])):
            compared_arr = ["#CHROM", "POS", "REF"]
            if vcf_data.columns[index] in compared_arr:
                continue
            vcf_data = vcf_data.withColumnRenamed(vcf_data.columns[index], vcf_data.columns[index] + "_temp") 
    
    return vcf_data

def endRecalc(pos, left, right, flag=None):
    #endRecalc(left_pos, right_pos, left, right, flag):
    if left.startswith("END=") == False or right.startswith("END=") == False :
        if flag == "left":
            return left
        elif flag == "right":
            return right
        else:
            return left
    else :
        left_end, right_end = int(left.replace("END=", "")), int(right.replace("END=", ""))
        if pos == left_end or pos == right_end:
            return "."
        else :
            if left_end > right_end:
                end_position = right_end
            else:
                end_position = left_end
            end_position = "END=" + str(end_position)
            return end_position
    
def selectCol(row, sample):   
    if row["ID_temp"] == None:       
        return_row = row[:2] + (row["ID"], row["REF"], row["ALT"], ".", ".", endRecalc(row["POS"], row["INFO"], row["INFO_temp"], "left"), row["FORMAT"])     
    elif row["ID"] == None:
        return_row = row[:2] + (row["ID_temp"], row["REF"], row["ALT_temp"], ".", ".", endRecalc(row["POS"], row["INFO"], row["INFO_temp"], "right"), row["FORMAT_temp"]) 
    else:
        return_row = row[:2] + (row["ID"], row["REF"], row["ALT"], ".", ".", endRecalc(row["POS"], row["INFO"], row["INFO_temp"]), row["FORMAT"])
        
    for sample_col in sample[:-1]:
        return_row += (row[sample_col],)
        return_row += (row[-1],)
    return return_row

def hadoop_list(length, hdfs):
    args = "hdfs dfs -ls "+ hdfs +" | awk '{print $8}'"
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    s_output, s_err = proc.communicate()
    all_dart_dirs = s_output.split()
    
    return all_dart_dirs[:length]

In [143]:
# main
hdfs = "hdfs://master:9000"
hdfs_list = hadoop_list(2, "/raw_data/gvcf")
gvcf_list = []
gvcf_combine_result = []

for index in range(len(hdfs_list)):
    if index == 0:
        gvcf_list.append(preVCF(hdfs + hdfs_list[index].decode("UTF-8"), 0, spark))
    else:
        gvcf_list.append(preVCF(hdfs + hdfs_list[index].decode("UTF-8"), 1, spark))

for index in range(1, len(hdfs_list)):
    if index == 1:
        join_vcf = gvcf_list[0].join(gvcf_list[index], ["#CHROM", "POS", "REF"], "full")
    else :
        join_vcf = gvcf_combine_result[index - 2].join(gvcf_list[index], ["#CHROM", "POS", "REF"], "full")
        
    # window
    #lookup_window = Window.partitionBy("#CHROM").rangeBetween(Window.unboundedPreceding, 0)
    lookup_window = Window.partitionBy("#CHROM").orderBy("POS").rowsBetween(Window.unboundedPreceding, 0)
    
    # schema & header
    if index == 1:
        sample_col = gvcf_list[0].columns[7:] + gvcf_list[index].columns[7:]
        header = gvcf_list[0].columns[:5] + ["QUAL", "FILTER"] + gvcf_list[0].columns[5:] + gvcf_list[index].columns[7:] 
    else :
        sample_col = gvcf_combine_result[index - 2].columns[7:] + gvcf_list[index].columns[7:]
        header = gvcf_combine_result[index - 2].columns + gvcf_list[index].columns[7:] 
        
    # null value update
    join_vcf_update = join_vcf.withColumn("INFO", F.last("INFO", ignorenulls = True).over(lookup_window))\
                        .withColumn("INFO_temp", F.last("INFO_temp", ignorenulls = True).over(lookup_window))
    for col_name in sample_col:
        join_vcf_update = join_vcf_update\
            .withColumn(col_name, F.last(col_name, ignorenulls = True).over(lookup_window))
    
    # finally value append
    gvcf_combine_result.append(join_vcf_update.orderBy(F.col("#CHROM"), F.col("POS"))\
                               .rdd.map(lambda row : selectCol(row, sample_col))\
                               .toDF(header).cache())
    gvcf_combine_result[index - 1].count()
    
    join_vcf.unpersist()
    del(join_vcf)
    del(join_vcf_update)
    
    if index != 1:
        gvcf_combine_result[index - 2].unpersist()

In [126]:
gvcf_combine_result[0].filter(F.col("#CHROM") == "chrM").show(100)

+------+----+---+---+------------+----+------+--------------------+------------------+--------------------+--------------------+
|#CHROM| POS| ID|REF|         ALT|QUAL|FILTER|                INFO|            FORMAT|             ND02798|             ND02809|
+------+----+---+---+------------+----+------+--------------------+------------------+--------------------+--------------------+
|  chrM|   1|  .|  G|   <NON_REF>|   .|     .|                   .|GT:DP:GQ:MIN_DP:PL| 0/0:5:15:5:0,15,178|0/0:181:99:71:0,1...|
|  chrM|   2|  .|  A|   <NON_REF>|   .|     .|               END=4|GT:DP:GQ:MIN_DP:PL|0/0:13:39:9:0,27,293|0/0:181:99:71:0,1...|
|  chrM|   5|  .|  A|   <NON_REF>|   .|     .|              END=72|GT:DP:GQ:MIN_DP:PL|0/0:185:99:23:0,6...|0/0:181:99:71:0,1...|
|  chrM|  73|  .|  G| A,<NON_REF>|   .|     .|BaseQRankSum=-2.4...| GT:AD:DP:GQ:PL:SB|1/1:8,211,0:219:9...|0/0:181:99:71:0,1...|
|  chrM|  74|  .|  T|   <NON_REF>|   .|     .|             END=147|GT:DP:GQ:MIN_DP:PL|0/0:232:99:

In [145]:
!hdfs dfs -ls /raw_data/gvcf

Found 10 items
-rw-r--r--   3 root supergroup 5063298294 2020-01-07 20:31 /raw_data/gvcf/ND02798_eg.raw.vcf
-rw-r--r--   3 root supergroup 1944354206 2020-01-07 20:33 /raw_data/gvcf/ND02809_eg.raw.vcf
-rw-r--r--   3 root supergroup  714187321 2020-01-07 20:33 /raw_data/gvcf/ND03490_eg.raw.vcf
-rw-r--r--   3 root supergroup 1220011634 2020-01-07 20:33 /raw_data/gvcf/ND24897_eg.raw.vcf
-rw-r--r--   3 root supergroup 3973345233 2020-01-07 20:33 /raw_data/gvcf/ND25334_eg.raw.vcf
-rw-r--r--   3 root supergroup 1077473701 2020-01-07 20:34 /raw_data/gvcf/ND25335_eg.raw.vcf
-rw-r--r--   3 root supergroup 1326257190 2020-01-07 20:35 /raw_data/gvcf/ND25407_eg.raw.vcf
-rw-r--r--   3 root supergroup 1652938664 2020-01-07 20:36 /raw_data/gvcf/ND25509_eg.raw.vcf
-rw-r--r--   3 root supergroup 1150612426 2020-01-07 20:37 /raw_data/gvcf/ND25659_eg.raw.vcf
-rw-r--r--   3 root supergroup  902281519 2020-01-07 20:38 /raw_data/gvcf/ND25675_eg.raw.vcf


In [156]:
from pyspark.sql.functions import when, otherwise

ImportError: cannot import name 'otherwise' from 'pyspark.sql.functions' (/usr/local/spark/python/pyspark/sql/functions.py)

In [146]:
temp = preVCF(hdfs + "/raw_data/gvcf/ND03490_eg.raw.vcf", 1, spark)

In [182]:
### UDF화
w = Window.partitionBy("#CHROM").orderBy("POS")
sample_w = Window.partitionBy("#CHROM").orderBy("POS").rangeBetween(Window.unboundedPreceding, Window.currentRow)

join_vcf = gvcf_combine_result[index - 2].join(temp, ["#CHROM", "POS", "REF"], "full")\
            .withColumn("ID", when(F.col("ID").isNull(), F.col("ID_temp")).otherwise(F.col("ID")))\
            .withColumn("ALT", when(F.col("ALT").isNull(), F.col("ALT_temp")).otherwise(F.col("ALT")))\
            .withColumn("FORMAT", when(F.col("FORMAT").isNull(), F.col("FORMAT_temp")).otherwise(F.col("FORMAT")))\
            .withColumn("QUAL", when(F.col("QUAL").isNotNull(), "."))\
            .withColumn("FILTER", when(F.col("FILTER").isNotNull(), "."))\
            .withColumn("INFO", when(F.col("INFO").startswith("END") == False, F.col("INFO"))\
                        .when(F.col("INFO_temp").startswith("END") == False, F.col("INFO_temp"))\
                        .otherwise(F.lead("POS", 1).over(w) - 1))\
            .drop("INFO_temp", "ID_temp", "ALT_temp", "FORMAT_temp")\
            .withColumn("ND02798", F.last(F.col("ND02798"), ignorenulls=True).over(sample_w))\
            .withColumn("ND02809", F.last(F.col("ND02809"), ignorenulls=True).over(sample_w))\
            .withColumn("ND03490", F.last(F.col("ND03490"), ignorenulls=True).over(sample_w))\
            .orderBy(F.col("#CHROM"), F.col("POS")).cache()
join_vcf.count()

88660735

In [180]:
join_vcf.unpersist()

DataFrame[#CHROM: string, POS: bigint, REF: string, ID: string, ALT: string, QUAL: string, FILTER: string, INFO: string, FORMAT: string, ND02798: string, ND02809: string, ND03490: string]

In [183]:
join_vcf.filter(F.col("#CHROM") == "chrM").show(5000)

+------+-----+---+---+------------+----+------+--------------------+------------------+--------------------+--------------------+--------------------+
|#CHROM|  POS|REF| ID|         ALT|QUAL|FILTER|                INFO|            FORMAT|             ND02798|             ND02809|             ND03490|
+------+-----+---+---+------------+----+------+--------------------+------------------+--------------------+--------------------+--------------------+
|  chrM|    1|  G|  .|   <NON_REF>|   .|     .|                   .|GT:DP:GQ:MIN_DP:PL| 0/0:5:15:5:0,15,178|0/0:181:99:71:0,1...|    0/0:3:9:2:0,6,73|
|  chrM|    2|  A|  .|   <NON_REF>|   .|     .|                   4|GT:DP:GQ:MIN_DP:PL|0/0:13:39:9:0,27,293|0/0:181:99:71:0,1...|    0/0:3:9:2:0,6,73|
|  chrM|    5|  A|  .|   <NON_REF>|   .|     .|                   8|GT:DP:GQ:MIN_DP:PL|0/0:185:99:23:0,6...|0/0:181:99:71:0,1...|    0/0:3:9:2:0,6,73|
|  chrM|    9|  G|  .|   <NON_REF>|null|  null|                  18|GT:DP:GQ:MIN_DP:PL|0/0:185