In [1]:
import findspark
findspark.init()
import os
import time
import datetime
import pyspark.sql.functions as sf
from uuid import *
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import when
from pyspark.sql.functions import col
from pyspark.sql.types import *
from pyspark.sql.functions import lit
from pyspark import SparkConf, SparkContext
from uuid import * 
from uuid import UUID
import time_uuid 
from pyspark.sql import Row
from pyspark.sql.functions import udf
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.window import Window as W

In [2]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
spark = SparkSession.builder.config("spark.jars.packages",'com.datastax.spark:spark-cassandra-connector_2.12:3.1.0').getOrCreate()

In [4]:
df = spark.read.format("org.apache.spark.sql.cassandra").options(table='tracking',keyspace='mydata').load()

In [5]:
df.show(5)
print(df.count())

+--------------------+----+----------+-----------+---+------------+-----+--------------------+---------------+--------------------+---+--------+----+------+----+------------+----+---------+--------------------+----+--------------------+-------------------+------------+-----------+----------+----------+--------+---+--------+
|         create_time| bid|        bn|campaign_id| cd|custom_track|   de|                  dl|             dt|                  ed| ev|group_id|  id|job_id|  md|publisher_id|  rl|       sr|                  ts|  tz|                  ua|                uid|utm_campaign|utm_content|utm_medium|utm_source|utm_term|  v|      vp|
+--------------------+----+----------+-----------+---+------------+-----+--------------------+---------------+--------------------+---+--------+----+------+----+------------+----+---------+--------------------+----+--------------------+-------------------+------------+-----------+----------+----------+--------+---+--------+
|b9735a90-fea0-11e...|

In [6]:
def process_timeuuid(df):
    spark_time = df.select('create_time').collect()
    normal_time = []
    for i in range(len(spark_time)):
        a = time_uuid.TimeUUID(bytes = UUID(spark_time[i][0]).bytes).get_datetime().strftime('%Y-%m-%d %H:%M:%S')
        normal_time.append(a)
    spark_timeuuid = []
    for i in range(len(spark_time)):
        spark_timeuuid.append(spark_time[i][0])
    time_data = spark.createDataFrame(zip(spark_timeuuid,normal_time),['create_time','ts'])
    result = df.join(time_data,['create_time'],'inner').drop(df.ts)
    result = result.select('create_time','ts','bid','job_id','campaign_id','custom_track','group_id','publisher_id')
    return result

In [7]:
process_df=process_timeuuid(df)

In [8]:
process_df.show()

+--------------------+-------------------+----+------+-----------+------------+--------+------------+
|         create_time|                 ts| bid|job_id|campaign_id|custom_track|group_id|publisher_id|
+--------------------+-------------------+----+------+-----------+------------+--------+------------+
|b70e2aa0-0986-11e...|2022-07-22 06:22:51|null|  null|       null|        null|    null|        null|
|cb3ade10-018e-11e...|2022-07-12 03:00:32|null|  null|       null|       click|    null|        null|
|34c02ee0-00ee-11e...|2022-07-11 07:51:00|null|  null|       null|        null|    null|        null|
|ba5be830-0d7a-11e...|2022-07-27 07:07:08|null|  null|       null|        null|    null|        null|
|b12df000-0d61-11e...|2022-07-27 04:07:55|   2|    98|          4|       click|    null|           1|
|8ce37380-0caf-11e...|2022-07-26 06:52:44|   2|    98|          4|       click|    null|           1|
|bc1f7ce0-fea2-11e...|2022-07-08 09:45:43|   2|    98|          4|        null|   

In [9]:
process_df.cache()

DataFrame[create_time: string, ts: string, bid: int, job_id: int, campaign_id: int, custom_track: string, group_id: int, publisher_id: int]

In [32]:
def calculating_clicks(process_df):
    click_data = process_df.filter(process_df.custom_track == 'click')
    click_data = click_data.na.fill({'bid':0})
    click_data = click_data.na.fill({'job_id':0})
    click_data = click_data.na.fill({'campaign_id':0})
    click_data = click_data.na.fill({'group_id':0})
    click_data = click_data.na.fill({'publisher_id':0})
    click_data.registerTempTable('clicks')
    clicks_output=spark.sql("""with cte1 as (select create_time , ts , date(ts) as Date , hour(ts) as hour,
    bid,job_id,campaign_id,group_id,publisher_id from clicks)
    select job_id,publisher_id,date,hour,campaign_id,group_id ,
    sum(bid) as spend_hour , count(create_time) as clicks , avg(bid) as bid_set 
    from cte1
    group by job_id,publisher_id,date,hour,campaign_id,group_id""")
    return clicks_output
