In [1]:
import findspark
# $ cd /opt
# /opt$ sudo ln -s ~/apps/spark-3.4.0-bin-hadoop3 spark
findspark.init("/opt/spark") 

In [2]:
# import pyspark
# sc = pyspark.SparkContext(appName="globalContent")
from pathlib import Path
pg_jar = str(Path('~/apps/postgresql-42.5.4.jar').expanduser().resolve())
gs_jar = str(Path('~/apps/gcs-connector-hadoop2-2.2.13-shaded.jar').expanduser().resolve())
# bq_jar = str(Path('~/apps/spark-bigquery-with-dependencies_2.12-0.30.0.jar').expanduser().resolve())
spark_jars = f"{pg_jar},{gs_jar}"

from pyspark.sql import SparkSession
spark = SparkSession.builder \
            .appName("LensFeatures") \
            .config("spark.jars", spark_jars) \
            .getOrCreate()

23/05/24 23:00:01 WARN Utils: Your hostname, VIJAYs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.87.20 instead (on interface en0)
23/05/24 23:00:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/05/24 23:00:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# This sections is not required if you are running on Google Cloud Dataproc Serverless
spark.conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
spark.conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
spark.conf.set("google.cloud.auth.service.account.enable", "true")
spark.conf.set("google.cloud.auth.service.account.json.keyfile","../.eigen1-vijay-gcp.credentials.json")
spark.conf.set('fs.gs.auth.type','SERVICE_ACCOUNT_JSON_KEYFILE')

In [4]:
df = spark.read.parquet(f"gs://vijay-lens-ml/predictions/20230522053757_xgbcl/")

                                                                                

In [5]:
df = df.where(f"recommend != 'NO'")

In [6]:
print(f"total number of records ${df.count()}")



total number of records $391650


                                                                                

In [7]:
df.printSchema()

root
 |-- region: string (nullable = true)
 |-- collects: long (nullable = true)
 |-- followship_score: double (nullable = true)
 |-- custom_filters_gardener_flagged: string (nullable = true)
 |-- upvotes: long (nullable = true)
 |-- mirrors: long (nullable = true)
 |-- is_original: string (nullable = true)
 |-- age: long (nullable = true)
 |-- followship_rank: long (nullable = true)
 |-- downvotes: long (nullable = true)
 |-- main_content_focus: string (nullable = true)
 |-- comments: long (nullable = true)
 |-- language: string (nullable = true)
 |-- max_age: long (nullable = true)
 |-- max_mirrors: long (nullable = true)
 |-- max_collects: long (nullable = true)
 |-- max_comments: long (nullable = true)
 |-- post_score: double (nullable = true)
 |-- post_id: string (nullable = true)
 |-- dtime: long (nullable = true)
 |-- recommend: string (nullable = true)



In [8]:
df = df.select("post_id", "recommend")

In [9]:
total_yes = df.select('post_id').where(df.recommend == 'YES').count()
total_maybe = df.select('post_id').where(df.recommend == 'YES').count()

                                                                                

In [10]:
# we need 100 rows but sampling sometimes returns less than 100; start with 120
num_yes = 0.8*120
num_maybe = 0.2*120

yes_fraction = round(num_yes / total_yes, 10)
print(f"total_yes:{total_yes} yes_fraction:{yes_fraction}")

maybe_fraction = round(num_maybe / total_maybe, 10)
print(f"total_maybe:{total_maybe} maybe_fraction:{maybe_fraction}")

total_yes:193237 yes_fraction:0.0004967993
total_maybe:193237 maybe_fraction:0.0001241998


In [11]:
sample_df = df.sampleBy("recommend", fractions={'YES': yes_fraction, 'MAYBE': maybe_fraction}, seed=0)

In [12]:
print(f"total number of sampled records ${sample_df.count()}")



total number of sampled records $119




In [13]:
from pyspark.sql.functions import lit, monotonically_increasing_id
sample_df = sample_df.select(
                        lit("ml-xgb-followship").alias("strategy_name"), # "EigenTrust + ML"
                        "post_id", 
                        monotonically_increasing_id().alias('v'))

In [14]:
sample_df.printSchema()

root
 |-- strategy_name: string (nullable = false)
 |-- post_id: string (nullable = true)
 |-- v: long (nullable = false)



In [15]:
sample_df.head(5)

                                                                                

[Row(strategy_name='ml-xgb-followship', post_id='0x51fe-0x0599', v=0),
 Row(strategy_name='ml-xgb-followship', post_id='0xd523-0x0287', v=1),
 Row(strategy_name='ml-xgb-followship', post_id='0x0100c0-0x031c', v=2),
 Row(strategy_name='ml-xgb-followship', post_id='0x012751-0xbd', v=3),
 Row(strategy_name='ml-xgb-followship', post_id='0x01adad-0xfa', v=4)]

In [16]:
import getpass
# 'jdbc:postgresql://dbhost:dbport/dbname?user=username&password=secret'
jdbc_url = getpass.getpass(prompt='JDBC URL: ')

JDBC URL: ········


In [18]:
sample_df.limit(100).write.format("jdbc")\
    .option("url", jdbc_url) \
    .option("driver", "org.postgresql.Driver") \
    .option("dbtable", "feed") \
    .mode("overwrite") \
    .save()

                                                                                