In [None]:
%spark.pyspark

import sys

from pprint import pprint

# each JSON is small, so there is no need for iterative processing
import json
import sys
import os
import xml
import time

import pyspark
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType, FloatType, ArrayType
import pyspark.sql.functions as sparkf
from pyspark.sql.functions import pandas_udf, PandasUDFType

import copy
import uuid

# Obtain the Spark session (an existing one is reused if present).
spark = SparkSession.builder.getOrCreate()

# Locations of the two input tables read below.
coauthor_dir = "gs://clpub/data_lake/arnet/tables/coauthor/merge-0"
author_rwr_dir = "gs://clpub/data_lake/arnet/tables/author_rwr_bias/merge-0"

from typing import Iterator, Tuple
import pandas as pd

# Column layout applied when reading the co-author table; every field is
# declared non-nullable.
coauthor_schema = StructType(
    [
        StructField(column, dtype, False)
        for column, dtype in (
            ("_id", StringType()),
            ("_status", IntegerType()),
            ("_order", IntegerType()),
            ("paper_id", StringType()),
            ("paper_title", StringType()),
            ("author1_id", StringType()),
            ("author1_name", StringType()),
            ("author1_org", StringType()),
            ("author2_id", StringType()),
            ("author2_name", StringType()),
            ("author2_org", StringType()),
            ("year", FloatType()),
        )
    ]
)

# Load the table and expose it to Spark SQL under the name "coauthor_df".
coauthor_df = spark.read.schema(coauthor_schema).parquet(coauthor_dir)
coauthor_df.createOrReplaceTempView("coauthor_df")

# Runs a count over the table; the result is not assigned to anything.
coauthor_df.count()

# Column layout applied when reading the author ranking table; every field
# is declared non-nullable.
author_rwr_schema = StructType(
    [
        StructField(column, dtype, False)
        for column, dtype in (
            ("author_id", StringType()),
            ("ranking", FloatType()),
            ("computed", IntegerType()),
        )
    ]
)

# Load the table and expose it to Spark SQL under the name "author_rwr_df".
author_rwr_df = spark.read.schema(author_rwr_schema).parquet(author_rwr_dir)
author_rwr_df.createOrReplaceTempView("author_rwr_df")

# Count the rows of each (author1_id, author2_id) pair as `collab`.
group_coauthor = spark.sql("""
    select author1_id, author2_id, count(_id) as collab
    from coauthor_df
    group by author1_id, author2_id
""")

# Seeded 1% sample capped at 3000 pairs, moved into a single partition and
# tagged with a generated `datapoint` id per row.
sample = (
    group_coauthor
    .sample(0.01, 999)
    .limit(3000)
    .repartition(1)
    .withColumn("datapoint", sparkf.monotonically_increasing_id())
)
sample.createOrReplaceTempView("coauthor_sample")

# Join each sampled pair with the ranking of both authors, keep only pairs
# where both rankings have computed = 1, and take the first 2000 rows in
# `datapoint` order.
sample_ranking = spark.sql("""
    select cs.datapoint, cs.collab, ard1.ranking as author1_ranking, ard2.ranking as author2_ranking
    from coauthor_sample as cs
        inner join author_rwr_df as ard1 on ard1.author_id = cs.author1_id
        inner join author_rwr_df as ard2 on ard2.author_id = cs.author2_id
    where ard1.computed = 1 and ard2.computed = 1
    order by cs.datapoint
    limit 2000
""")

from scipy.spatial import distance
import numpy as np

@pandas_udf("float", PandasUDFType.SCALAR)
def node_proximity(v1, v2):
    """Return the row-wise relative difference between two ranking columns.

    For every row the result is ``|r1 - r2| / max(|r1|, |r2|)``.

    Args:
        v1: pandas Series holding the first ranking of each row.
        v2: pandas Series holding the second ranking of each row, aligned
            position-by-position with ``v1``.

    Returns:
        A pandas Series with one value per input row. A row whose two
        rankings are both zero yields 0.0 (the values are identical) rather
        than raising ``ZeroDivisionError``; NaN inputs propagate as NaN.
    """
    # Compute in double precision regardless of the incoming column dtype.
    rank1 = v1.values.astype("float64")
    rank2 = v2.values.astype("float64")

    gap = np.abs(rank1 - rank2)
    scale = np.maximum(np.abs(rank1), np.abs(rank2))

    # Start from zeros and divide only where the denominator is non-zero, so
    # a pair of zero rankings maps to 0.0 instead of aborting the whole batch.
    # `scale != 0` (rather than `scale > 0`) keeps NaN rows in the division so
    # they remain NaN in the output.
    proximity = np.zeros_like(gap)
    np.divide(gap, scale, out=proximity, where=scale != 0)

    return pd.Series(proximity)

# Keep the row id and collaboration count, and derive the proximity column
# from the two author rankings.
sample_proximity = sample_ranking.select(
    sparkf.col("datapoint"),
    sparkf.col("collab"),
    node_proximity(
        sparkf.col("author1_ranking"),
        sparkf.col("author2_ranking"),
    ).alias("node_proximity"),
)

# Write the result as CSV from a single partition, overwriting prior output.
(
    sample_proximity
    .repartition(1)
    .write
    .mode("overwrite")
    .csv("gs://clpub/diagram/rwr_bias_vs_coauthor")
)


In [None]:
%spark.pyspark

# Schema for the CSV written by the previous cell. `datapoint` is generated
# there with monotonically_increasing_id(), which yields 64-bit integers, so
# it is read back as LongType rather than IntegerType.
df_schema = StructType([
    StructField("datapoint", LongType(), False),
    StructField("collab", LongType(), False),
    StructField("node_proximity", FloatType(), False),
])

# Load the CSV and expose it to Spark SQL under the name "data".
df = spark.read.schema(df_schema).csv("gs://clpub/diagram/rwr_bias_vs_coauthor")
df.createOrReplaceTempView("data")