In [None]:
%spark.pyspark

import sys

from pprint import pprint

# Each JSON is small, so there is no need for iterative processing.
import json
import sys
import os
import xml
import time

import pyspark
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType, FloatType, ArrayType
import pyspark.sql.functions as sparkf
from pyspark.sql.functions import pandas_udf, PandasUDFType

import copy
import uuid

# Reuse the interpreter's running Spark session, or start one if none exists.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Parquet locations of the citation random-walk-with-restart (RWR) tables.
# The variable names count iterations from 1 while the directory suffixes
# count from 0, i.e. ``citation_rwr_iterN_dir`` points at ``iter-(N-1)``.
citation_rwr_iter1_dir = "gs://clpub/data_lake/arnet/tables/citation_rwr/iter-0"
citation_rwr_iter2_dir = "gs://clpub/data_lake/arnet/tables/citation_rwr/iter-1"
citation_rwr_iter5_dir = "gs://clpub/data_lake/arnet/tables/citation_rwr/iter-4"
citation_rwr_iter10_dir = "gs://clpub/data_lake/arnet/tables/citation_rwr/iter-9"
citation_rwr_iter11_dir = "gs://clpub/data_lake/arnet/tables/citation_rwr/iter-10"
citation_rwr_iter12_dir = "gs://clpub/data_lake/arnet/tables/citation_rwr/iter-11"

from typing import Iterator, Tuple

# Load one DataFrame per RWR iteration of interest; the reads happen in the
# same order as the names on the left-hand side.
rwr_iter1, rwr_iter2, rwr_iter5, rwr_iter10, rwr_iter11, rwr_iter12 = (
    spark.read.parquet(_parquet_dir)
    for _parquet_dir in (
        citation_rwr_iter1_dir,
        citation_rwr_iter2_dir,
        citation_rwr_iter5_dir,
        citation_rwr_iter10_dir,
        citation_rwr_iter11_dir,
        citation_rwr_iter12_dir,
    )
)

# Expose every iteration to Spark SQL under the view name "rwr_iterN_df".
for _view_name, _frame in (
    ("rwr_iter1_df", rwr_iter1),
    ("rwr_iter2_df", rwr_iter2),
    ("rwr_iter5_df", rwr_iter5),
    ("rwr_iter10_df", rwr_iter10),
    ("rwr_iter11_df", rwr_iter11),
    ("rwr_iter12_df", rwr_iter12),
):
    _frame.createOrReplaceTempView(_view_name)

# Pick a small random subset of the first iteration (fraction 0.01, capped at
# 50 rows) and tag every row with a "datapoint" id.
# NOTE: no seed is passed to sample(), so the chosen rows can differ per run.
# The frame is collapsed into a single partition before the ids are assigned;
# with the current Spark implementation this yields consecutive ids.
sample = (
    rwr_iter1
    .sample(0.01)
    .limit(50)
    .repartition(1)
    .withColumn("datapoint", sparkf.monotonically_increasing_id())
)
sample.createOrReplaceTempView("rwr_sample")

# For every sampled row, collect node._1 (aliased as rankN) from each tracked
# iteration. rank1 is taken from the sample itself; the remaining ranks come
# from the per-iteration views matched on `id`. The joins are inner joins, so
# a sampled id that is missing from any iteration is dropped from the result.
_rank_per_iteration_query = """
    select rs.datapoint, 
        rs.node._1 as rank1, 
        r2d.node._1 as rank2, 
        r5d.node._1 as rank5,
        r10d.node._1 as rank10,
        r11d.node._1 as rank11,
        r12d.node._1 as rank12
    from rwr_sample as rs
        inner join rwr_iter2_df as r2d on rs.id = r2d.id
        inner join rwr_iter5_df as r5d on rs.id = r5d.id
        inner join rwr_iter10_df as r10d on rs.id = r10d.id
        inner join rwr_iter11_df as r11d on rs.id = r11d.id
        inner join rwr_iter12_df as r12d on rs.id = r12d.id
"""
sample_deviation = spark.sql(_rank_per_iteration_query)

# Export from a single partition, replacing any previous output at the target.
(
    sample_deviation
    .repartition(1)
    .write
    .mode("overwrite")
    .csv("gs://clpub/diagram/rwr_distribution_changes")
)


In [None]:
%spark.pyspark

import pandas as pd
import seaborn as sns

# Explicit schema for the exported rank CSV: one long "datapoint" id followed
# by one float rank column per tracked iteration. The read below sets no
# header option, so the field order here has to match the column order of the
# files.
# NOTE(review): ranks are parsed as FloatType (single precision) -- confirm
# that this is precise enough for the comparison.
schema = StructType(
    [StructField("datapoint", LongType(), False)]
    + [
        StructField(rank_column, FloatType(), False)
        for rank_column in ("rank1", "rank2", "rank5", "rank10", "rank11", "rank12")
    ]
)

# Read the export back and make it queryable from SQL as "data".
df = (
    spark.read
    .schema(schema)
    .csv("gs://clpub/diagram/rwr_distribution_changes")
)
df.createOrReplaceTempView("data")