In [1]:
import findspark
# $ cd /opt
# /opt$ sudo ln -s ~/apps/spark-3.4.0-bin-hadoop3 spark
findspark.init("/opt/spark") 

In [2]:
# import pyspark
# sc = pyspark.SparkContext(appName="globalContent")
from pathlib import Path
gs_jar = str(Path('~/apps/gcs-connector-hadoop2-2.2.13-shaded.jar').expanduser().resolve())
spark_jars = f"{gs_jar}"

from pyspark.sql import SparkSession
spark = SparkSession.builder \
            .appName("LensFeatures") \
            .config("spark.jars", spark_jars) \
            .getOrCreate()

23/06/01 15:25:39 WARN Utils: Your hostname, VIJAYs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.219.127.110 instead (on interface en0)
23/06/01 15:25:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/06/01 15:25:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# This sections is not required if you are running on Google Cloud Dataproc Serverless
spark.conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
spark.conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
spark.conf.set("google.cloud.auth.service.account.enable", "true")
spark.conf.set("google.cloud.auth.service.account.json.keyfile","../.eigen1-vijay-gcp.credentials.json")
spark.conf.set('fs.gs.auth.type','SERVICE_ACCOUNT_JSON_KEYFILE')

In [4]:
# to reduce memory pressure when converting pyspark to pandas
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [10]:
df = spark.read.parquet(f"gs://vijay-lens-ml/predictions/20230522053757_xgbcl/")



In [11]:
df = df.where(
            (df.is_original == 'True') 
            & (df.is_content_warning != 'True')
            & (df.recommend != 'NO')
            )



In [12]:
df.printSchema()

root
 |-- region: string (nullable = true)
 |-- collects: long (nullable = true)
 |-- followship_score: double (nullable = true)
 |-- custom_filters_gardener_flagged: string (nullable = true)
 |-- upvotes: long (nullable = true)
 |-- mirrors: long (nullable = true)
 |-- is_original: string (nullable = true)
 |-- age: long (nullable = true)
 |-- followship_rank: long (nullable = true)
 |-- downvotes: long (nullable = true)
 |-- main_content_focus: string (nullable = true)
 |-- comments: long (nullable = true)
 |-- language: string (nullable = true)
 |-- max_age: long (nullable = true)
 |-- max_mirrors: long (nullable = true)
 |-- max_collects: long (nullable = true)
 |-- max_comments: long (nullable = true)
 |-- post_score: double (nullable = true)
 |-- post_id: string (nullable = true)
 |-- dtime: long (nullable = true)
 |-- recommend: string (nullable = true)



In [13]:
from pyspark.sql.functions import expr
df = df.withColumn("engagement_score", expr("(1 * upvotes) + (3 * mirrors) + (5 * comments)"))

In [14]:
df = df.select("post_id", "recommend", "dtime", "engagement_score" )

In [15]:
from datetime import datetime, timezone
dtime_now = int(datetime.utcnow().replace(microsecond=0).strftime("%Y%m%d%H%M%S"))
dtime_now

20230601224452

In [16]:
from pyspark.sql.functions import col

condn_1day = dtime_now - col("dtime") < 86400
condn_7day = dtime_now - col("dtime") < 604800
condn_30day = dtime_now - col("dtime") < 2592000

In [17]:
from pyspark.sql.functions import when

df = df.withColumn("time_ago",
                   when(condn_1day , "1d") \
                   .when(condn_7day, "7d") \
                   .when(condn_30day, "30d") \
                   .otherwise("99d"))

In [18]:
df.printSchema()

root
 |-- post_id: string (nullable = true)
 |-- recommend: string (nullable = true)
 |-- dtime: long (nullable = true)
 |-- engagement_score: long (nullable = true)
 |-- time_ago: string (nullable = false)



In [19]:
pd_df = df.toPandas()

                                                                                

In [21]:
print('YES', pd_df[pd_df['recommend'] == 'YES']['time_ago'].value_counts())

YES time_ago
99d    154801
7d        311
1d        260
Name: count, dtype: int64


In [22]:
print('MAYBE', pd_df[pd_df['recommend'] == 'MAYBE']['time_ago'].value_counts())

MAYBE time_ago
99d    103358
7d          4
Name: count, dtype: int64


In [25]:
import numpy as np

rng = np.random.default_rng()

samples = []
counts = pd_df['time_ago'].value_counts()

In [26]:
counts

time_ago
99d    258159
7d        315
1d        260
Name: count, dtype: int64

In [29]:
min(counts.get('30d', 0), 0)

0

In [30]:
samples.append(pd_df.loc[pd_df['time_ago'] == '1d'].sample(n=min(counts.get('1d', 0),100), random_state=rng))
samples.append(pd_df.loc[pd_df['time_ago'] == '7d'].sample(n=min(counts.get('7d', 0),50), random_state=rng))
samples.append(pd_df.loc[pd_df['time_ago'] == '30d'].sample(n=min(counts.get('30d', 0),50), random_state=rng))
samples.append(pd_df.loc[pd_df['time_ago'] == '99d'].sample(n=min(counts.get('99d', 0),50), random_state=rng))

In [31]:
import pandas as pd
sample_df = pd.concat(samples)

In [32]:
sample_df['time_ago'].value_counts()

time_ago
1d     100
7d      50
99d     50
Name: count, dtype: int64

In [35]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 256545 to 237821
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   post_id           200 non-null    object
 1   recommend         200 non-null    object
 2   dtime             200 non-null    int64 
 3   engagement_score  200 non-null    int64 
 4   time_ago          200 non-null    object
dtypes: int64(2), object(3)
memory usage: 9.4+ KB


In [36]:
# bin engagement_score with auto-selection of bin boundaries
sample_df['popularity'], bin_cuts = \
                pd.qcut(sample_df['engagement_score'], q = 3, labels = ['C', 'B', 'A'], retbins = True)

In [37]:
sample_df.sample(5)

Unnamed: 0,post_id,recommend,dtime,engagement_score,time_ago,popularity
245155,0x0a20-0x2509,MAYBE,20230524190556,1,99d,C
72417,0x0920-0x02a8,YES,20230524190556,61,99d,A
257992,0x0a20-0x43c0-DA-b2570cd4,YES,20230601020808,1,7d,C
258444,0xb728-0x011f,YES,20230601120820,4,7d,B
256401,0x0b24-0x5b-DA-89313ab7,YES,20230601161129,5,1d,B


In [38]:
bin_cuts

array([  0.,   1.,   7., 201.])

In [39]:
sample_df['popularity'].value_counts()

popularity
C    74
A    66
B    60
Name: count, dtype: int64

In [40]:
sample_df['weights'] = np.where(sample_df['recommend'] == 'YES' ,
                                np.where(sample_df['popularity'] == 'A',
                                        .4, # YES-A
                                        np.where(sample_df['popularity'] == 'B',
                                                 .3, # YES-B
                                                 .1 # YES-C
                                                )
                                        ),
                                .2 # treat all MAYBEs equally
                                )

In [41]:
sample_df['weights'].value_counts()

weights
0.1    64
0.4    57
0.3    57
0.2    22
Name: count, dtype: int64

In [42]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 256545 to 237821
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   post_id           200 non-null    object  
 1   recommend         200 non-null    object  
 2   dtime             200 non-null    int64   
 3   engagement_score  200 non-null    int64   
 4   time_ago          200 non-null    object  
 5   popularity        200 non-null    category
 6   weights           200 non-null    float64 
dtypes: category(1), float64(1), int64(2), object(3)
memory usage: 11.3+ KB


In [43]:
sample_df = sample_df.sample(n=100, weights='weights', random_state=rng)

In [44]:
sample_df['recommend'].value_counts()

recommend
YES      91
MAYBE     9
Name: count, dtype: int64

In [46]:
sample_df['popularity'].value_counts()

popularity
B    42
A    37
C    21
Name: count, dtype: int64

In [47]:
sample_df['time_ago'].value_counts()

time_ago
1d     50
7d     28
99d    22
Name: count, dtype: int64

In [45]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 255967 to 257863
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   post_id           100 non-null    object  
 1   recommend         100 non-null    object  
 2   dtime             100 non-null    int64   
 3   engagement_score  100 non-null    int64   
 4   time_ago          100 non-null    object  
 5   popularity        100 non-null    category
 6   weights           100 non-null    float64 
dtypes: category(1), float64(1), int64(2), object(3)
memory usage: 5.7+ KB


In [48]:
sample_df = sample_df.sort_values(['dtime', 'weights'], ascending=[False, False], ignore_index=True)
sample_df = sample_df[['post_id']]
sample_df['strategy_name'] = "ml-xgb-followship"
sample_df['v'] = sample_df.index

In [49]:
sample_df.head()

Unnamed: 0,post_id,strategy_name,v
0,0xf0ca-0x037b-DA-4ff6c8ed,ml-xgb-followship,0
1,0x2d0e-0x0245,ml-xgb-followship,1
2,0x01a4f5-0x0ce7,ml-xgb-followship,2
3,0x01a4f5-0x0ce6,ml-xgb-followship,3
4,0x2552-0x058a-DA-d079519c,ml-xgb-followship,4


In [50]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   post_id        100 non-null    object
 1   strategy_name  100 non-null    object
 2   v              100 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


In [None]:
import getpass
# 'postgresql://username:password@dbhost:dbport/dbname'
connect_url = getpass.getpass(prompt='Connection URL: ')

In [None]:
# Suppress Warnings
# we do not want to use SQLAlchemy 2 because of incompatibility issues with Pandas
SQLALCHEMY_SILENCE_UBER_WARNING=1

In [None]:
from sqlalchemy import create_engine
db = create_engine(connect_url)

In [None]:
db.execute("DELETE FROM feed WHERE strategy_name = 'ml-xgb-followship'")

In [None]:
sample_df.to_sql('feed', con=db, if_exists='append', index=False)