In [1]:
import findspark
findspark.init()
import os
import time
import datetime
import pyspark.sql.functions as sf
from uuid import *
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import when
from pyspark.sql.functions import col
from pyspark.sql.types import *
from pyspark.sql.functions import lit
from pyspark import SparkConf, SparkContext
from uuid import * 
from uuid import UUID
import time_uuid 
from pyspark.sql import Row
from pyspark.sql.functions import udf
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.window import Window as W

In [2]:
# Make Spark's driver and worker processes use the same Python interpreter
# that is running this kernel, so both sides see the same installed packages.
# `sys` was never imported in the imports cell, so this cell raised NameError
# on a fresh kernel; import it here so the cell is self-contained.
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
# Build (or reuse) the Spark session; the Cassandra connector is pulled in
# as a package dependency when the session is first created.
spark = (
    SparkSession.builder
    .config(
        "spark.jars.packages",
        'com.datastax.spark:spark-cassandra-connector_2.12:3.1.0',
    )
    .getOrCreate()
)

In [4]:
# Load the raw tracking events from Cassandra (keyspace `mydata`, table `tracking`).
cassandra_reader = spark.read.format("org.apache.spark.sql.cassandra")
df = cassandra_reader.options(
    table='tracking',
    keyspace='mydata',
).load()

In [5]:
# Quick sanity check: peek at the first rows, then print the total row count.
df.show(n=5)
total_rows = df.count()
print(total_rows)

+--------------------+----+----------+-----------+---+------------+-----+--------------------+---------------+--------------------+---+--------+----+------+----+------------+--------------------+---------+--------------------+----+--------------------+-------------------+------------+-----------+----------+----------+--------+---+--------+
|         create_time| bid|        bn|campaign_id| cd|custom_track|   de|                  dl|             dt|                  ed| ev|group_id|  id|job_id|  md|publisher_id|                  rl|       sr|                  ts|  tz|                  ua|                uid|utm_campaign|utm_content|utm_medium|utm_source|utm_term|  v|      vp|
+--------------------+----+----------+-----------+---+------------+-----+--------------------+---------------+--------------------+---+--------+----+------+----+------------+--------------------+---------+--------------------+----+--------------------+-------------------+------------+-----------+----------+--------

In [6]:
def _timeuuid_to_str(value):
    """Convert one time-based UUID string to a 'YYYY-MM-DD HH:MM:SS' string.

    Returns None for a null input so the UDF does not fail on missing keys.
    The datetime comes from `time_uuid.TimeUUID.get_datetime()`
    (presumably UTC -- confirm against the time_uuid library).
    """
    if value is None:
        return None
    as_time_uuid = time_uuid.TimeUUID(bytes=UUID(value).bytes)
    return as_time_uuid.get_datetime().strftime('%Y-%m-%d %H:%M:%S')


def process_timeuuid(df):
    """Replace `ts` with a timestamp string decoded from the `create_time` time-UUID.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Must contain a string column `create_time` holding time-based UUIDs,
        plus the columns selected below.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns: create_time, ts (string, '%Y-%m-%d %H:%M:%S'), bid, job_id,
        campaign_id, custom_track, group_id, publisher_id.

    Notes
    -----
    The previous implementation collected every `create_time` to the driver,
    converted the values in a Python loop, built a second DataFrame and
    inner-joined it back. That does not scale with table size, multiplies rows
    whenever a `create_time` value occurs more than once, fails on an empty
    input (schema inference has no rows to look at), and depends on the
    notebook-global `spark`. The conversion now runs as a column-wise UDF,
    so rows stay 1:1 with the input and nothing is pulled to the driver.
    The `time_uuid` package must be importable wherever the UDF runs
    (driver and executors).
    """
    to_ts_string = udf(_timeuuid_to_str, StringType())
    # withColumn on an existing name replaces that column, which matches the
    # original "drop df.ts, keep the decoded ts" behaviour.
    decoded = df.withColumn('ts', to_ts_string(col('create_time')))
    return decoded.select(
        'create_time', 'ts', 'bid', 'job_id', 'campaign_id',
        'custom_track', 'group_id', 'publisher_id',
    )

In [7]:
# Decode the time-UUIDs and keep only the columns needed for the click report.
process_df = process_timeuuid(df)

In [8]:
# Preview the processed events (first 20 rows, long cells truncated).
process_df.show(n=20, truncate=True)

+--------------------+-------------------+----+------+-----------+------------+--------+------------+
|         create_time|                 ts| bid|job_id|campaign_id|custom_track|group_id|publisher_id|
+--------------------+-------------------+----+------+-----------+------------+--------+------------+
|e73f4180-08a2-11e...|2022-07-21 03:12:07|null|  null|       null|        null|    null|        null|
|cd00c920-0c93-11e...|2022-07-26 03:34:05|null|  null|       null|       alive|    null|        null|
|853fd7e0-0890-11e...|2022-07-21 01:00:32|null|  null|       null|        null|    null|        null|
|8d5c0400-0d61-11e...|2022-07-27 04:06:55|   2|    98|          4|        null|    null|           1|
|2bd39d80-08cd-11e...|2022-07-21 08:14:41|null|  null|       null|        null|    null|        null|
|360f0f50-035a-11e...|2022-07-14 09:49:10|null|  null|       null|        null|    null|        null|
|83167df0-00ea-11e...|2022-07-11 07:24:33|null|  null|       null|       click|   

In [9]:
# Mark the processed frame for caching, since the cells below reuse it.
# The call returns the DataFrame itself, which is why the cell echoes its schema.
process_df.cache()

DataFrame[create_time: string, ts: string, bid: int, job_id: int, campaign_id: int, custom_track: string, group_id: int, publisher_id: int]

In [10]:
# Keep only the events whose custom_track marks them as clicks.
is_click = col('custom_track') == 'click'
click_data = process_df.where(is_click)

In [11]:
# Preview the click events (first 20 rows, long cells truncated).
click_data.show(n=20, truncate=True)

+--------------------+-------------------+----+------+-----------+------------+--------+------------+
|         create_time|                 ts| bid|job_id|campaign_id|custom_track|group_id|publisher_id|
+--------------------+-------------------+----+------+-----------+------------+--------+------------+
|4e51fdb0-089c-11e...|2022-07-21 02:24:54|null|  null|       null|       click|    null|        null|
|5a374880-0b38-11e...|2022-07-24 10:06:57|   1|   188|         48|       click|      34|           1|
|6140b730-0668-11e...|2022-07-18 07:08:09|null|  null|       null|       click|    null|        null|
|95b48750-0193-11e...|2022-07-12 03:34:50|null|  null|       null|       click|    null|        null|
|a49e3a10-0975-11e...|2022-07-22 04:20:39|null|  null|       null|       click|    null|        null|
|bbab1a50-018e-11e...|2022-07-12 03:00:06|null|  null|       null|       click|    null|        null|
|bc3af1b0-0b67-11e...|2022-07-24 15:46:08|null|  null|       null|       click|   

In [12]:
# Replace missing ids / bids with 0 so the grouped aggregation below does not
# produce null group keys. One fill call with a per-column mapping is
# equivalent to filling each column separately.
click_data = click_data.na.fill({
    'bid': 0,
    'job_id': 0,
    'campaign_id': 0,
    'group_id': 0,
    'publisher_id': 0,
})

In [23]:
# Expose the click events to Spark SQL under the name `clicks`.
# registerTempTable is deprecated since Spark 2.0; createOrReplaceTempView is
# its replacement and also makes this cell safe to re-run.
click_data.createOrReplaceTempView('clicks')



In [24]:
# Hourly click report: for each (job, publisher, date, hour, campaign, group)
# compute total spend (sum of bids), number of clicks, and the average bid.
hourly_clicks_sql = """with cte1 as (select create_time , ts , date(ts) as Date , hour(ts) as hour,
bid,job_id,campaign_id,group_id,publisher_id from clicks)
select job_id,publisher_id,date,hour,campaign_id,group_id ,
sum(bid) as spend_hour , count(create_time) as clicks , avg(bid) as bid_set 
from cte1
group by job_id,publisher_id,date,hour,campaign_id,group_id"""
hourly_clicks = spark.sql(hourly_clicks_sql)
hourly_clicks.show()

+------+------------+----------+----+-----------+--------+----------+------+------------------+
|job_id|publisher_id|      date|hour|campaign_id|group_id|spend_hour|clicks|           bid_set|
+------+------------+----------+----+-----------+--------+----------+------+------------------+
|     0|           0|2022-07-18|   7|          0|       0|         0|    21|               0.0|
|     0|           0|2022-07-12|   2|          0|       0|         0|   165|               0.0|
|     0|           0|2022-07-22|   4|          0|       0|         0|    10|               0.0|
|     0|           0|2022-07-24|  15|          0|       0|         0|    10|               0.0|
|     0|           0|2022-07-21|   2|          0|       0|         0|    26|               0.0|
|   258|           1|2022-07-26|   6|         93|       0|        12|    12|               1.0|
|   188|           1|2022-07-24|  10|         48|      34|        25|    25|               1.0|
|     0|           0|2022-07-12|   3|   