In [2]:
import sys
!{sys.executable} -m pip install -q --upgrade pip
!{sys.executable} -m pip install -q google-cloud-bigquery

## Merge the Kafka dataset for the matching individual with their available houses and write the result to a final BigQuery dataset

In [14]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Spark configuration: standalone master URL, application name, and
# driver/executor sizing.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("BigqueryExample")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Hadoop filesystem settings for the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for hadoop_key, hadoop_value in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(hadoop_key, hadoop_value)

# Load the cookie table from BigQuery.
cookie_id_df = spark.read \
      .format("bigquery") \
      .load(" degroup11.group11dataset.cookie_ID_houses")

# Read the mortgage threshold from the first row of the cookie table.
# DataFrame.first() returns None when the DataFrame is empty, so fail with an
# explicit message instead of an opaque "NoneType is not subscriptable" error.
first_cookie_row = cookie_id_df.first()
if first_cookie_row is None:
    raise ValueError(
        "degroup11.group11dataset.cookie_ID_houses is empty; "
        "cannot read possible_mortgage_amount"
    )
price_threshold = first_cookie_row["possible_mortgage_amount"]
# NOTE(review): price_threshold is only printed in the visible cells — confirm
# whether the combined result is supposed to be filtered by it.
print(price_threshold)

# Read the house_pricing_kafka table from BigQuery and drop its "window" column.
df_kafka = (
    spark.read
    .format("bigquery")
    .load(" degroup11.group11dataset.house_pricing_kafka")
    .drop("window")
)
df_kafka.printSchema()

# Read the available_houses_for_individual table from BigQuery.
df_batch = (
    spark.read
    .format("bigquery")
    .load(" degroup11.group11dataset.available_houses_for_individual")
)
df_batch.printSchema()

# Stack the batch rows and the Kafka rows into one DataFrame.
# unionByName matches columns by name, so the rows stay aligned even if the two
# tables list their columns in a different order (union() matches purely by
# position and would silently mix columns up in that case).
df_combined = df_batch.unionByName(df_kafka)

# Keep a single row per (Address, Price) pair.
df_combined = df_combined.dropDuplicates(["Address", "Price"])

# Sort by Price, highest first, and preview the result.
df_combined = df_combined.orderBy("Price", ascending=False)
df_combined.show()

# Overwrite the output table in BigQuery, staging through the temporary GCS bucket.
# NOTE(review): the target table is named top_10_houses but every row is
# written — confirm whether a limit(10) is intended.
(
    df_combined.write
    .format("bigquery")
    .option('table', "degroup11.group11dataset.top_10_houses")
    .option("temporaryGcsBucket", "temp_degroup11")
    .mode("overwrite")
    .save()
)

414389
root
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Price: long (nullable = true)
 |-- Availability: boolean (nullable = true)
 |-- event_time: timestamp (nullable = true)

root
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Price: long (nullable = true)
 |-- Availability: boolean (nullable = true)
 |-- event_time: timestamp (nullable = true)

+--------------------+---------------+------+------------+--------------------+
|             Address|           City| Price|Availability|          event_time|
+--------------------+---------------+------+------------+--------------------+
|         Papesteeg 6|           Tiel|410000|        true|2023-12-01 03:20:...|
|          Stedehof 3|          Assen|410000|        true|2023-12-01 03:18:...|
|    Waterkerslaan 16|Oosterhout (NB)|410000|        true|2023-11-30 22:06:...|
|Willem van Velsen...|      Heemskerk|410000|        true|2023-11-30 22:12:...|
|Johan Willem Fris...

In [12]:
from google.cloud import bigquery

# BigQuery client for the degroup11 project.
client = bigquery.Client(project="degroup11")

# Clean up, in order: the temporary Kafka table, the temporary batch table,
# and the cookie table (the session is over). not_found_ok=True makes each
# delete a no-op when the table is already gone.
for table_id in (
    "degroup11.group11dataset.house_pricing_kafka",
    "degroup11.group11dataset.available_houses_for_individual",
    "degroup11.group11dataset.cookie_ID_houses",
):
    client.delete_table(table_id, not_found_ok=True)