In [1]:
import findspark
# $ cd /opt
# /opt$ sudo ln -s ~/apps/spark-3.4.0-bin-hadoop3 spark
findspark.init("/opt/spark") 

In [2]:
# import pyspark
# sc = pyspark.SparkContext(appName="globalContent")
from pathlib import Path
gs_jar = str(Path('~/apps/gcs-connector-hadoop2-2.2.13-shaded.jar').expanduser().resolve())
spark_jars = f"{gs_jar}"

from pyspark.sql import SparkSession
spark = SparkSession.builder \
            .appName("LensFeatures") \
            .config("spark.jars", spark_jars) \
            .getOrCreate()

23/05/31 07:15:40 WARN Utils: Your hostname, VIJAYs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.87.20 instead (on interface en0)
23/05/31 07:15:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/05/31 07:15:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# This sections is not required if you are running on Google Cloud Dataproc Serverless
spark.conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
spark.conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
spark.conf.set("google.cloud.auth.service.account.enable", "true")
spark.conf.set("google.cloud.auth.service.account.json.keyfile","../.eigen1-vijay-gcp.credentials.json")
spark.conf.set('fs.gs.auth.type','SERVICE_ACCOUNT_JSON_KEYFILE')

In [4]:
# to reduce memory pressure when converting pyspark to pandas
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [5]:
df = spark.read.parquet(f"gs://vijay-lens-ml/predictions/20230522053757_xgbcl/")

                                                                                

In [6]:
df = df.where(f"recommend != 'NO'")

In [7]:
df.printSchema()

root
 |-- region: string (nullable = true)
 |-- collects: long (nullable = true)
 |-- followship_score: double (nullable = true)
 |-- custom_filters_gardener_flagged: string (nullable = true)
 |-- upvotes: long (nullable = true)
 |-- mirrors: long (nullable = true)
 |-- is_original: string (nullable = true)
 |-- age: long (nullable = true)
 |-- followship_rank: long (nullable = true)
 |-- downvotes: long (nullable = true)
 |-- main_content_focus: string (nullable = true)
 |-- comments: long (nullable = true)
 |-- language: string (nullable = true)
 |-- max_age: long (nullable = true)
 |-- max_mirrors: long (nullable = true)
 |-- max_collects: long (nullable = true)
 |-- max_comments: long (nullable = true)
 |-- post_score: double (nullable = true)
 |-- post_id: string (nullable = true)
 |-- dtime: long (nullable = true)
 |-- recommend: string (nullable = true)



In [8]:
df = df.select("post_id", "recommend", "dtime")

In [9]:
from datetime import datetime, timezone
dtime_now = int(datetime.utcnow().replace(microsecond=0).strftime("%Y%m%d%H%M%S"))
dtime_now

20230531141644

In [10]:
from pyspark.sql.functions import col

condn_1day = dtime_now - col("dtime") < 86400
condn_7day = dtime_now - col("dtime") < 604800
condn_30day = dtime_now - col("dtime") < 2592000

In [11]:
from pyspark.sql.functions import when

df = df.withColumn("time_ago",
                   when(condn_1day , "1d") \
                   .when(condn_7day, "7d") \
                   .when(condn_30day, "30d") \
                   .otherwise("99d"))

In [12]:
df.printSchema()

root
 |-- post_id: string (nullable = true)
 |-- recommend: string (nullable = true)
 |-- dtime: long (nullable = true)
 |-- time_ago: string (nullable = false)



In [13]:
pd_df = df.toPandas()

                                                                                

In [84]:
pd_df['time_ago'].value_counts()

time_ago
99d    395028
30d      1600
1d        436
7d        174
Name: count, dtype: int64


In [85]:
print('YES', pd_df[pd_df['recommend'] == 'YES']['time_ago'].value_counts())

YES time_ago
99d    194534
30d      1582
1d        436
7d        151
Name: count, dtype: int64


In [86]:
print('MAYBE', pd_df[pd_df['recommend'] == 'MAYBE']['time_ago'].value_counts())

MAYBE time_ago
99d    200494
7d         23
30d        18
Name: count, dtype: int64


In [68]:
import numpy as np

rng = np.random.default_rng()

samples = []
counts = pd_df['time_ago'].value_counts()

samples.append(pd_df.loc[pd_df['time_ago'] == '1d'].sample(n=min(counts['1d'],100), random_state=rng))
samples.append(pd_df.loc[pd_df['time_ago'] == '7d'].sample(n=min(counts['7d'],50), random_state=rng))
samples.append(pd_df.loc[pd_df['time_ago'] == '30d'].sample(n=min(counts['30d'],50), random_state=rng))
samples.append(pd_df.loc[pd_df['time_ago'] == '99d'].sample(n=min(counts['99d'],50), random_state=rng))

In [69]:
import pandas as pd
sample_df = pd.concat(samples)

In [70]:
sample_df['time_ago'].value_counts()

time_ago
1d     100
7d      50
30d     50
99d     50
Name: count, dtype: int64

In [76]:
sample_df['weights'] = np.where(sample_df['recommend'] == 'YES', .8, .2)

In [77]:
sample_df['weights'].value_counts()

weights
0.8    215
0.2     35
Name: count, dtype: int64

In [78]:
sample_df = sample_df.sample(n=100, weights='weights', random_state=rng)

In [79]:
sample_df['recommend'].value_counts()

recommend
YES      98
MAYBE     2
Name: count, dtype: int64

In [80]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 396569 to 394200
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   post_id    100 non-null    object 
 1   recommend  100 non-null    object 
 2   dtime      100 non-null    int64  
 3   time_ago   100 non-null    object 
 4   weights    100 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.7+ KB


In [81]:
sample_df = sample_df.sort_values(['dtime'], ascending=[False], ignore_index=True)
sample_df = sample_df[['post_id']]
sample_df['strategy_name'] = "ml-xgb-followship"
sample_df['v'] = sample_df.index

In [82]:
sample_df.head()

Unnamed: 0,post_id,strategy_name,v
0,0x01cbef-0x01,ml-xgb-followship,0
1,0x73b1-0x2448-DA-2f8b24b0,ml-xgb-followship,1
2,0x02a6-0x0153,ml-xgb-followship,2
3,0x28a2-0x0693,ml-xgb-followship,3
4,0x01cbd4-0x16,ml-xgb-followship,4


In [83]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   post_id        100 non-null    object
 1   strategy_name  100 non-null    object
 2   v              100 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


In [None]:
import getpass
# 'postgresql://username:password@dbhost:dbport/dbname'
connect_url = getpass.getpass(prompt='Connection URL: ')

In [None]:
# Suppress Warnings
# we do not want to use SQLAlchemy 2 because of incompatibility issues with Pandas
SQLALCHEMY_SILENCE_UBER_WARNING=1

In [None]:
from sqlalchemy import create_engine
db = create_engine(connect_url)

In [None]:
db.execute("DELETE FROM feed WHERE strategy_name = 'ml-xgb-followship'")

In [None]:
sample_df.to_sql('feed', con=db, if_exists='append', index=False)