In [None]:
%spark.pyspark

import sys

from pprint import pprint

# each JSON is small, so there is no need for iterative processing
import json
import sys
import os
import xml
import time

import pyspark
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType, FloatType, ArrayType
import pyspark.sql.functions as sparkf
from pyspark.sql.functions import pandas_udf, PandasUDFType

import copy
import uuid

# Reuse the session that is already running, or start one if there is none.
spark = SparkSession.builder.getOrCreate()

# Storage locations (gs:// URIs) of the arnet tables referenced by this script.
coauthor_dir = "gs://clpub/data_lake/arnet/tables/coauthor/merge-0"
author_org_rank_dir = "gs://clpub/data_lake/arnet/tables/author_org_rank/merge-0"
org_rank_dir = "gs://clpub/data_lake/arnet/tables/org_rank_algo/iter-14"

from typing import Iterator, Tuple
import pandas as pd

# Struct with five positional, non-nullable fields (_1 .. _5):
# float, int, int, long, float.
org_node_schema = (
    StructType()
    .add("_1", FloatType(), False)
    .add("_2", IntegerType(), False)
    .add("_3", IntegerType(), False)
    .add("_4", LongType(), False)
    .add("_5", FloatType(), False)
)

# One row per id: a long `id` plus the nested `node` struct declared above.
org_rank_schema = (
    StructType()
    .add("id", LongType(), False)
    .add("node", org_node_schema, False)
)



# Explicit schema for the author/org ranking table; every column is declared
# non-nullable.
author_org_ranking_schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("author_id", StringType()),
        ("author_org", StringType()),
        ("org_rank", FloatType()),
        ("computed", IntegerType()),
    )
])

# Load the parquet table with that schema.
author_org_rank_df = (
    spark.read
    .schema(author_org_ranking_schema)
    .parquet(author_org_rank_dir)
)

# Expose it to SQL. Note the view is named `author_org_ranking_df`, which
# differs from the Python variable name.
author_org_rank_df.createOrReplaceTempView("author_org_ranking_df")

# The query below reads the `coauthor_df` temp view, which nothing in this
# script registers. When the session does not already provide it (for example
# from another notebook paragraph), load it from `coauthor_dir` so the script
# can run on its own.
# NOTE(review): assumes the parquet data under `coauthor_dir` is the table the
# view is meant to expose (columns author1_id, author2_id, _id) -- confirm.
_registered_tables = {table.name.lower() for table in spark.catalog.listTables()}
if "coauthor_df" not in _registered_tables:
    spark.read.parquet(coauthor_dir).createOrReplaceTempView("coauthor_df")

# Count the rows for every (author1_id, author2_id) pair.
group_coauthor = spark.sql("""
    select author1_id, author2_id, count(_id) as collab
    from coauthor_df
    group by author1_id, author2_id
""")

# Take a 1% sample without replacement, with a fixed seed so reruns draw the
# same sample, and expose it to SQL as `coauthor_sample`.
sample = group_coauthor.sample(fraction=0.01, seed=999)
sample.createOrReplaceTempView("coauthor_sample")

# Attach an org rank to both authors of each sampled pair.
# `author_org_ranking_df` is joined twice: `aord1` matches author1_id and
# `aord2` matches author2_id. A pair is kept only when both ranking rows have
# computed = 1, both author_org values are non-empty, and the two author_org
# values differ. The result holds at most 2000 rows with the columns
# collab, author1_ranking and author2_ranking.
# NOTE(review): `limit` is applied without an `order by`, so which 2000 rows
# are kept is presumably not guaranteed to be stable between runs -- confirm.
sample_ranking = spark.sql("""
    select cs.collab, aord1.org_rank as author1_ranking, aord2.org_rank as author2_ranking
    from coauthor_sample as cs
        inner join author_org_ranking_df as aord1 on aord1.author_id = cs.author1_id
        inner join author_org_ranking_df as aord2 on aord2.author_id = cs.author2_id
    where aord1.computed = 1 and aord2.computed = 1 and aord1.author_org != aord2.author_org
        and aord1.author_org != "" and aord2.author_org != ""
    limit 2000
""")

from scipy.spatial import distance
import numpy as np

@pandas_udf("float", PandasUDFType.SCALAR)
def node_proximity(v1, v2):
    """Return the relative distance ``|v1 - v2| / |v1 + v2|`` for each row.

    Args:
        v1: pandas Series with the first ranking of every row in the batch.
        v2: pandas Series with the second ranking, same length as ``v1``.

    Returns:
        A pandas Series of floats, one per input row. A row whose two
        rankings sum to zero yields NaN, because the ratio is undefined
        there. The previous per-row Python division raised
        ``ZeroDivisionError`` in that case and aborted the whole task.
    """
    # Work on float64 arrays, positionally, like the original list-based loop.
    rank1 = np.asarray(v1, dtype="float64")
    rank2 = np.asarray(v2, dtype="float64")

    gap = np.abs(rank1 - rank2)
    total = np.abs(rank1 + rank2)

    # Pre-fill with NaN and divide only where the denominator is non-zero,
    # so zero sums stay NaN instead of raising or producing inf.
    proximity = np.full(gap.shape, np.nan, dtype="float64")
    np.divide(gap, total, out=proximity, where=total != 0)

    return pd.Series(proximity)

# Put all rows in one partition before numbering them, then compute the
# proximity of the two rankings for every row.
# NOTE(review): with a single partition the ids produced by
# monotonically_increasing_id() are presumably consecutive from 0 -- confirm.
sample_proximity = (
    sample_ranking
    .repartition(1)
    .withColumn("datapoint", sparkf.monotonically_increasing_id())
    .select(
        sparkf.col("datapoint"),
        sparkf.col("collab"),
        sparkf.col("author1_ranking"),
        sparkf.col("author2_ranking"),
        node_proximity(
            sparkf.col("author1_ranking"),
            sparkf.col("author2_ranking"),
        ).alias("node_proximity"),
    )
)

# Write the result as CSV from a single partition, replacing any previous output.
(
    sample_proximity
    .repartition(1)
    .write
    .mode("overwrite")
    .csv("gs://clpub/diagram/org_rank_vs_coauthor_L1_distance")
)
