In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
import sys
!{sys.executable} -m pip install google-cloud-bigquery
from google.cloud import bigquery

# Assemble the Spark configuration as one fluent chain (every SparkConf setter returns the conf).
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("BigqueryExample")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)
# The Spark session is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# Register the Google Cloud Storage connector classes for the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for hadoop_key, hadoop_value in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(hadoop_key, hadoop_value)

Collecting google-cloud-bigquery
  Downloading google_cloud_bigquery-3.13.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting grpcio<2.0dev,>=1.47.0 (from google-cloud-bigquery)
  Downloading grpcio-1.59.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-bigquery)
  Downloading google_api_core-2.14.0-py3-none-any.whl.metadata (2.6 kB)
Collecting proto-plus<2.0.0dev,>=1.15.0 (from google-cloud-bigquery)
  Downloading proto_plus-1.22.3-py3-none-any.whl.metadata (2.2 kB)
Collecting google-cloud-core<3.0.0dev,>=1.6.0 (from google-cloud-bigquery)
  Downloading google_cloud_core-2.3.3-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting google-resumable-media<3.0dev,>=0.6.0 (from google-cloud-bigquery)
  Downloading google_resumable_media-2.6.0-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting googlea

# Load tables

In [3]:
from pyspark.sql.functions import col
# Read the mortgage table from BigQuery and cast the mortgage amount to an
# integer in a single chained expression, so re-running the cell is idempotent.
df_mortage = (
    spark.read.format("bigquery")
    .load(" degroup11.group11dataset.mortage_amount_batch")  # project_id.dataset.tablename; use your own project id
    .withColumn("possible_mortgage_amount", col("possible_mortgage_amount").cast("int"))
)

df_mortage.printSchema()

root
 |-- ID: long (nullable = true)
 |-- spouse_ID: long (nullable = true)
 |-- max_to_spend_month: double (nullable = true)
 |-- possible_mortgage_amount: integer (nullable = true)



In [4]:
# Read the housing table from BigQuery and cast the price to an integer in a
# single chained expression, so re-running the cell is idempotent.
df_housing = (
    spark.read.format("bigquery")
    .load(" degroup11.group11dataset.house_pricing")  # project_id.dataset.tablename; use your own project id
    .withColumn("Price", col("Price").cast("int"))
)
df_housing.printSchema()


root
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Lot_size: string (nullable = true)
 |-- Living_space_size: string (nullable = true)
 |-- Build_year: string (nullable = true)
 |-- Build_type: string (nullable = true)
 |-- House_type: string (nullable = true)
 |-- Roof: string (nullable = true)
 |-- Rooms: string (nullable = true)
 |-- Toilet: string (nullable = true)
 |-- Floors: string (nullable = true)
 |-- Energy_label: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Garden: string (nullable = true)
 |-- Estimated_neighbourhood_price_per: double (nullable = true)
 |-- Availability: boolean (nullable = true)
 |-- event_time: timestamp (nullable = true)



## Take a random person from the dataset

In [30]:
from pyspark.sql.functions import col, udf, window
from random import randint

def select_person_id():
    """Return the ID of one randomly chosen person that exists in ``df_mortage``.

    The ID is sampled from the rows actually present in the table rather than
    drawn from ``1..count()``, so the result is valid even when IDs are not a
    contiguous range starting at 1.

    Returns:
        The value of the ``ID`` column of one randomly picked row.

    Raises:
        ValueError: if ``df_mortage`` has no row with a non-null ``ID``.
    """
    # Local import keeps this cell runnable on its own.
    from pyspark.sql.functions import rand

    picked_row = (
        df_mortage
        .where(df_mortage["ID"].isNotNull())
        .select("ID")
        .orderBy(rand())  # random ordering; first() then yields a random existing ID
        .first()
    )
    if picked_row is None:
        raise ValueError("df_mortage contains no rows with an ID; cannot select a person")
    return picked_row["ID"]

# Draw the person ID once, up front, so the filter compares against a fixed value.
chosen_person_id = select_person_id()
selected_df = df_mortage.filter(col("ID") == chosen_person_id)
selected_df.show()

+---+---------+------------------+------------------------+
| ID|spouse_ID|max_to_spend_month|possible_mortgage_amount|
+---+---------+------------------+------------------------+
|855|      855|          2503.635|                  497615|
+---+---------+------------------+------------------------+



## Take all the houses that have a lower price than the possible mortgage amount and are available

In [31]:
# Fetch the selected person's row once on the driver. first() returns None for
# an empty DataFrame, so fail with a clear message instead of an AttributeError.
selected_person = selected_df.first()
if selected_person is None:
    raise ValueError("selected_df is empty; no person was selected, so no mortgage budget is available")
mortgage_budget = selected_person["possible_mortgage_amount"]

# Keep only houses strictly cheaper than the budget that are still available.
available_houses_chosen_individual = df_housing.filter(
    (col("Price") < mortgage_budget) & col("Availability")
)

# Most expensive affordable houses first.
available_houses = (
    available_houses_chosen_individual
    .select('Address', 'City', 'Price', 'Availability', "event_time")
    .orderBy(col("Price").desc())
)
available_houses.show(10)


+--------------------+----------+------+------------+--------------------+
|             Address|      City| Price|Availability|          event_time|
+--------------------+----------+------+------------+--------------------+
|Koningin Wilhelmi...|    Houten|497000|        true|2023-12-01 02:50:...|
|       Lazuursteen 9|    Houten|495000|        true|2023-12-01 00:00:...|
|     Monnickskamp 77|    Huizen|495000|        true|2023-12-01 02:22:...|
|Wethouder Roodenb...|   Haarlem|495000|        true|2023-12-01 03:04:...|
|     Weberstraat 144|Amersfoort|495000|        true|2023-12-01 01:16:...|
| Wieger Bruinlaan 34| Hoofddorp|495000|        true|2023-12-01 00:04:...|
|       Hanzestraat 7|Maasbommel|495000|        true|2023-12-01 03:31:...|
|   Stationsstraat 35|Roosendaal|495000|        true|2023-11-30 20:28:...|
|    Waltmanstraat 49|Zwaanshoek|495000|        true|2023-12-01 01:43:...|
|       Galjoen 19 12|  Lelystad|495000|        true|2023-12-01 02:16:...|
+--------------------+---

In [32]:
# Keep at most the 500 most expensive affordable houses. limit() stays inside
# Spark: no round-trip through the driver, the original column types are kept,
# and an empty result is handled (createDataFrame on an empty list cannot infer
# a schema and raises).
MAX_SELECTED_HOUSES = 500
selected_houses = available_houses.limit(MAX_SELECTED_HOUSES)
selected_houses.count()

500

In [33]:
# Initialize the BigQuery client
client = bigquery.Client(project="degroup11")

# Define the table schema. It mirrors the columns selected into
# `available_houses`; Price is an integer there (it was cast with .cast("int")
# when the housing table was loaded), so it is declared INTEGER, not STRING.
schema = [
    bigquery.SchemaField("Address", "STRING"),
    bigquery.SchemaField("City", "STRING"),
    bigquery.SchemaField("Price", "INTEGER"),
    bigquery.SchemaField("Availability", "BOOLEAN"),
    bigquery.SchemaField("event_time", "TIMESTAMP"),
]

# Specify the BigQuery dataset and table
dataset_id = "group11dataset"
table_id = "available_houses_for_individual"

# Build the table reference explicitly; Client.dataset() is deprecated in
# google-cloud-bigquery in favour of constructing a DatasetReference.
table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)
table = bigquery.Table(table_ref, schema=schema)

# Create the table if it doesn't exist (an already existing table is left as is)
client.create_table(table, exists_ok=True)

Table(TableReference(DatasetReference('degroup11', 'group11dataset'), 'available_houses_for_individual'))

In [34]:
# Write the selected houses to BigQuery, replacing any previous contents of the
# target table. The write is staged through the given temporary GCS bucket.
(
    selected_houses.write
    .format("bigquery")
    .option('table', "degroup11.group11dataset.available_houses_for_individual")
    .option("temporaryGcsBucket", "tomporay_bucket")
    .mode("overwrite")
    .save()
)