In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
import pyspark.sql.functions as sparkf

# Reuse the already-running Spark session if there is one, else create it.
spark = SparkSession.builder.getOrCreate()

# Parquet locations of the two input tables.
coauthor_dir = "gs://clpub/data_lake/arnet/tables/coauthor/merge-0"
author_activ_dir = "gs://clpub/data_lake/arnet/tables/author_activity/merge-Wnd-2"

# (column name, Spark type) pairs for the co-author table, in column order.
_coauthor_columns = [
    ('_id', StringType()),
    ('_status', IntegerType()),
    ('_order', IntegerType()),
    ('paper_id', StringType()),
    ('paper_title', StringType()),
    ('author1_id', StringType()),
    ('author1_name', StringType()),
    ('author1_org', StringType()),
    ('author2_id', StringType()),
    ('author2_name', StringType()),
    ('author2_org', StringType()),
    ('year', FloatType()),
]

# (column name, Spark type) pairs for the author-activity table.
_author_activ_columns = [
    ('author_id', StringType()),
    ('freq', FloatType()),
]

# Every column is declared non-nullable.
# NOTE(review): Spark's Parquet reader is documented to relax columns to
# nullable on read, so these flags may not actually be enforced — confirm.
coauthor_schema = StructType(
    [StructField(col_name, col_type, False) for col_name, col_type in _coauthor_columns]
)

author_activ_schema = StructType(
    [StructField(col_name, col_type, False) for col_name, col_type in _author_activ_columns]
)

def _load_table(path, schema):
    """Read the Parquet data at *path* using the explicit *schema*."""
    return spark.read.schema(schema).parquet(path)


# Load both input tables with their declared schemas.
coauthor_df = _load_table(coauthor_dir, coauthor_schema)
author_activ_df = _load_table(author_activ_dir, author_activ_schema)

# Draw a 10% sample of the co-author rows with a fixed seed.
sample_coauthor = coauthor_df.sample(fraction=0.1, seed=999)

# Expose both frames to Spark SQL under the names the query below uses.
author_activ_df.createOrReplaceTempView("author_activ_df")
sample_coauthor.createOrReplaceTempView("sample_coauth")

# Attach each author's activity frequency to both sides of a sampled
# co-author pair. The inner joins drop pairs where either author has no
# activity row.
# NOTE(review): there is no ORDER BY before the LIMIT, so which 5000 rows
# survive is presumably arbitrary — confirm that is acceptable.
_join_freq_sql = """
    select sc.author1_id, sc.author2_id, aad1.freq as freq1, aad2.freq as freq2
    from sample_coauth as sc
        inner join author_activ_df as aad1 on aad1.author_id = sc.author1_id
        inner join author_activ_df as aad2 on aad2.author_id = sc.author2_id
    limit 5000
"""

join_freq_df = spark.sql(_join_freq_sql)

import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType
import numpy as np

def avg_freq_func(v1: pd.Series, v2: pd.Series) -> pd.Series:
    """Return the relative gap between two log-damped frequency series.

    Each input is shifted by 10 and passed through the natural logarithm.
    The result is, element by element::

        |ln(v1 + 10) - ln(v2 + 10)| / max(ln(v1 + 10), ln(v2 + 10))

    The value is 0 where both inputs are equal, and the function is
    symmetric in its two arguments.

    Args:
        v1: Frequencies for the first member of each pair.
        v2: Frequencies for the second member of each pair.

    Returns:
        A series holding one ratio per input row.
    """
    # The +10 shift keeps the logarithm defined (and positive) at frequency 0.
    log_first, log_second = (np.log(freq + 10.) for freq in (v1, v2))

    gap = (log_first - log_second).abs()
    # Row-wise maximum of the two logged series, used as the normaliser.
    larger = pd.concat([log_first, log_second], axis=1).max(axis=1)
    return gap / larger

# Wrap the pandas function as a Spark pandas UDF that yields a float column.
avg_freq = pandas_udf(avg_freq_func, returnType=FloatType())

# Proximity score computed from the two joined frequency columns.
_proximity_col = avg_freq(sparkf.col("freq1"), sparkf.col("freq2")).alias("node_proximity")

# Collapse to a single partition before generating the row ids.
# NOTE(review): presumably this makes the ids run 0..n-1, since
# monotonically_increasing_id depends on the partition layout — confirm.
sample_avg = (
    join_freq_df
    .repartition(1)
    .withColumn("datapoint", sparkf.monotonically_increasing_id())
    .select(sparkf.col("datapoint"), _proximity_col)
)

# Write the (datapoint, node_proximity) rows as CSV from one partition,
# replacing whatever is already at the destination.
_output_dir = "gs://clpub/diagram/activity_vs_coauthor_node_proximity/window_2"
(
    sample_avg
    .repartition(1)
    .write
    .mode("overwrite")
    .csv(_output_dir)
)
