In [27]:
import findspark
findspark.init()

# Spark & python function
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf, col, desc, asc, coalesce, broadcast
from pyspark import Row

spark = SparkSession.builder.master("spark://master:7077")\
                        .appName("gVCF_combine")\
                        .config("spark.executor.memory", "18G")\
                        .config("spark.executor.core", "3")\
                        .config("spark.sql.shuffle.partitions", 20)\
                        .config("spark.driver.maxResultSize", "10G")\
                        .getOrCreate()

# .config("spark.daemon.memory", "14G")\

In [26]:
spark.catalog.clearCache()
spark.stop()

In [4]:
!hdfs dfs -ls /gvcf

Found 18 items
-rw-r--r--   3 root supergroup 5063298294 2019-12-30 14:01 /gvcf/ND02798_eg.raw.vcf
-rw-r--r--   3 root supergroup     199953 2019-12-30 14:01 /gvcf/ND02798_eg.raw.vcf.idx
-rw-r--r--   3 root supergroup 1944354206 2019-12-30 14:01 /gvcf/ND02809_eg.raw.vcf
-rw-r--r--   3 root supergroup     199945 2019-12-30 14:01 /gvcf/ND02809_eg.raw.vcf.idx
-rw-r--r--   3 root supergroup  714187321 2019-12-30 14:01 /gvcf/ND03490_eg.raw.vcf
-rw-r--r--   3 root supergroup     199945 2019-12-30 14:01 /gvcf/ND03490_eg.raw.vcf.idx
-rw-r--r--   3 root supergroup 1220011634 2019-12-30 14:01 /gvcf/ND24897_eg.raw.vcf
-rw-r--r--   3 root supergroup     199953 2019-12-30 14:01 /gvcf/ND24897_eg.raw.vcf.idx
-rw-r--r--   3 root supergroup 1077473701 2019-12-30 14:01 /gvcf/ND25335_eg.raw.vcf
-rw-r--r--   3 root supergroup     199929 2019-12-30 14:01 /gvcf/ND25335_eg.raw.vcf.idx
-rw-r--r--   3 root supergroup 1326257190 2019-12-30 14:02 /gvcf/ND25407_eg.raw.vcf
-rw-r--r--   3 root supergrou

In [28]:
import re
### data1
hdfs = "hdfs://master:9000/gvcf/ND02798_eg.raw.vcf"
vcf_left = spark.sparkContext.textFile(hdfs)

header_contig_left = vcf_left.filter(lambda x : re.match("^#", x))
col_name_left = vcf_left.filter(lambda x : x.startswith("#CHROM")).first().split("\t")

vcf_data_left = vcf_left.filter(lambda x : re.match("[^#][^#]", x)).map(lambda x : x.split("\t")).toDF(col_name_left)\
                        .withColumn("POS", col("POS").cast(IntegerType()))

In [29]:
vcf_data_left.printSchema()

root
 |-- #CHROM: string (nullable = true)
 |-- POS: integer (nullable = true)
 |-- ID: string (nullable = true)
 |-- REF: string (nullable = true)
 |-- ALT: string (nullable = true)
 |-- QUAL: string (nullable = true)
 |-- FILTER: string (nullable = true)
 |-- INFO: string (nullable = true)
 |-- FORMAT: string (nullable = true)
 |-- ND02798: string (nullable = true)



In [30]:
hdfs = "hdfs://master:9000/gvcf/ND02809_eg.raw.vcf"
vcf_right = spark.sparkContext.textFile(hdfs)

header_contig_right = vcf_right.filter(lambda x : re.match("^#", x))
col_name_right = vcf_right.filter(lambda x : x.startswith("#CHROM")).first().split("\t")

vcf_data_right = vcf_right.filter(lambda x : re.match("[^#][^#]", x)).map(lambda x : x.split("\t")).toDF(col_name_right)\
                        .withColumn("POS", col("POS").cast(IntegerType()))

In [31]:
vcf_data_right.printSchema()

root
 |-- #CHROM: string (nullable = true)
 |-- POS: integer (nullable = true)
 |-- ID: string (nullable = true)
 |-- REF: string (nullable = true)
 |-- ALT: string (nullable = true)
 |-- QUAL: string (nullable = true)
 |-- FILTER: string (nullable = true)
 |-- INFO: string (nullable = true)
 |-- FORMAT: string (nullable = true)
 |-- ND02809: string (nullable = true)



In [32]:
temp = vcf_data_left.join(vcf_data_right, ["#CHROM", "POS", "REF"], "inner").cache()

In [33]:
temp.explain()

== Physical Plan ==
InMemoryTableScan [#CHROM#1230, POS#1250, REF#1233, ID#1232, ALT#1234, QUAL#1235, FILTER#1236, INFO#1237, FORMAT#1238, ND02798#1239, ID#1263, ALT#1265, QUAL#1266, FILTER#1267, INFO#1268, FORMAT#1269, ND02809#1270]
   +- InMemoryRelation [#CHROM#1230, POS#1250, REF#1233, ID#1232, ALT#1234, QUAL#1235, FILTER#1236, INFO#1237, FORMAT#1238, ND02798#1239, ID#1263, ALT#1265, QUAL#1266, FILTER#1267, INFO#1268, FORMAT#1269, ND02809#1270], StorageLevel(disk, memory, deserialized, 1 replicas)
         +- *(5) Project [#CHROM#1230, POS#1250, REF#1233, ID#1232, ALT#1234, QUAL#1235, FILTER#1236, INFO#1237, FORMAT#1238, ND02798#1239, ID#1263, ALT#1265, QUAL#1266, FILTER#1267, INFO#1268, FORMAT#1269, ND02809#1270]
            +- *(5) SortMergeJoin [#CHROM#1230, POS#1250, REF#1233], [#CHROM#1261, POS#1281, REF#1264], Inner
               :- *(2) Sort [#CHROM#1230 ASC NULLS FIRST, POS#1250 ASC NULLS FIRST, REF#1233 ASC NULLS FIRST], false, 0
               :  +- Exchange hashpartitio

In [34]:
temp.printSchema()

root
 |-- #CHROM: string (nullable = true)
 |-- POS: integer (nullable = true)
 |-- REF: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- ALT: string (nullable = true)
 |-- QUAL: string (nullable = true)
 |-- FILTER: string (nullable = true)
 |-- INFO: string (nullable = true)
 |-- FORMAT: string (nullable = true)
 |-- ND02798: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- ALT: string (nullable = true)
 |-- QUAL: string (nullable = true)
 |-- FILTER: string (nullable = true)
 |-- INFO: string (nullable = true)
 |-- FORMAT: string (nullable = true)
 |-- ND02809: string (nullable = true)



In [None]:
temp.count()

In [25]:
temp.show(100)

+------+-------+---+----------+-----------+-------+------+--------------------+------------------+--------------------+----------+-----------+-------+------+--------------------+------------------+--------------------+
|#CHROM|    POS|REF|        ID|        ALT|   QUAL|FILTER|                INFO|            FORMAT|             ND02798|        ID|        ALT|   QUAL|FILTER|                INFO|            FORMAT|             ND02809|
+------+-------+---+----------+-----------+-------+------+--------------------+------------------+--------------------+----------+-----------+-------+------+--------------------+------------------+--------------------+
|  chr1|  14930|  A|rs75454623|G,<NON_REF>| 509.77|     .|BaseQRankSum=-2.3...| GT:AD:DP:GQ:PL:SB|0/1:25,25,0:50:99...|rs75454623|G,<NON_REF>| 183.77|     .|BaseQRankSum=-3.2...| GT:AD:DP:GQ:PL:SB|0/1:69,18,0:87:99...|
|  chr1|  15211|  T|rs78601809|G,<NON_REF>|  23.55|     .|DB;DP=2;MLEAC=2,0...| GT:AD:DP:GQ:PL:SB|1/1:0,2,0:2:6:55,...|     

In [105]:
### indel 개수
temp.rdd.filter(lambda row : len(row.REF) >= 2).count()

51135

In [83]:
vcf_data.toDF(col_name).count()

59996355

In [81]:
vcf_data.toDF(col_name).filter(col("#CHROM") == "chr1").count()

5018020

In [82]:
vcf_data.toDF(col_name).filter(col("#CHROM") == "chr2").count()

4918094

In [84]:
vcf_data.toDF(col_name).filter(col("#CHROM") == "chr3").count()

3866509

In [85]:
vcf_data.toDF(col_name).filter(col("#CHROM") == "chr4").count()

3329637

In [86]:
vcf_data.toDF(col_name).filter(col("#CHROM") == "chr5").count()

3438877

In [87]:
vcf_data.toDF(col_name).filter(col("#CHROM") == "chr6").count()

3348499

In [None]:
def preVCF(hdfs, flag, spark): # hdfs://, flag 0 == lhs, 1 == rhs
    vcf = spark.sparkContext.textFile(hdfs).map(lambda x : x.split("\t"))
    header = vcf.first()
    step1 = vcf.filter(lambda row : row != header).map(alt_filter).toDF(header)
    return_vcf = step1.select(chr_remove_udf(step1["#CHROM"]).cast("Integer").alias("CHROM"), "*")\
                      .drop(col("#CHROM")).filter(col("FILTER") == "PASS")
    if flag == 1:
        for index in range(len(return_vcf.columns[:9])):
            return_vcf = return_vcf.withColumnRenamed(return_vcf.columns[index], return_vcf.columns[index] + "_temp") 
    return return_vcf.coalesce(20)