In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Spark settings for this notebook: standalone master, application name, and
# the driver/executor resource options.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("BigqueryExample")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; getOrCreate()
# reuses an already-running session if one exists.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the Google Cloud Storage connector classes on the Hadoop
# configuration so that paths using the gs:// scheme can be resolved.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
# Load data from BigQuery.




# Load tables

In [73]:
# Load the mortgage table from BigQuery and cast the mortgage amount to an
# integer in the same expression, so the cell assigns df_mortage exactly once.
# `col` comes from pyspark.sql.functions and must be imported before this cell
# runs on a fresh kernel.
df_mortage = (
    spark.read
    .format("bigquery")
    .load(" degroup11.group11dataset.mortage_amount_batch")  # project_id.dataset.tablename. Use your project id
    .withColumn("possible_mortgage_amount", col("possible_mortgage_amount").cast("int"))
)

df_mortage.printSchema()

root
 |-- ID: long (nullable = true)
 |-- spouse_ID: long (nullable = true)
 |-- max_to_spend_month: double (nullable = true)
 |-- possible_mortgage_amount: integer (nullable = true)



In [74]:
# Load the house listings from BigQuery and cast Price to an integer in the
# same expression, so the cell assigns df_housing exactly once.
# `col` comes from pyspark.sql.functions and must be imported before this cell
# runs on a fresh kernel.
df_housing = (
    spark.read
    .format("bigquery")
    .load(" degroup11.group11dataset.house_pricing")  # project_id.dataset.tablename. Use your project id
    .withColumn("Price", col("Price").cast("int"))
)
df_housing.printSchema()


root
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Lot_size: string (nullable = true)
 |-- Living_space_size: string (nullable = true)
 |-- Build_year: string (nullable = true)
 |-- Build_type: string (nullable = true)
 |-- House_type: string (nullable = true)
 |-- Roof: string (nullable = true)
 |-- Rooms: string (nullable = true)
 |-- Toilet: string (nullable = true)
 |-- Floors: string (nullable = true)
 |-- Energy_label: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Garden: string (nullable = true)
 |-- Estimated_neighbourhood_price_per: double (nullable = true)
 |-- Availability: boolean (nullable = true)
 |-- event_time: timestamp (nullable = true)



## Take a random person from the dataset

In [75]:
from pyspark.sql.functions import col, udf, window
from random import randint

def select_person_id(seed=None):
    """Return the ID of one randomly chosen person from ``df_mortage``.

    The ID is sampled from the values actually present in the ``ID`` column,
    rather than drawn from ``1..count``: the IDs are not guaranteed to be a
    contiguous range, and a drawn number with no matching row would leave the
    downstream selection empty.

    Args:
        seed: Optional integer seed forwarded to the sampler. Pass a fixed
            value to make the choice repeatable for the same underlying data.

    Returns:
        The ``ID`` value of the sampled row.

    Raises:
        ValueError: If ``df_mortage`` has no row with a non-null ``ID``.
    """
    sampled_rows = (
        df_mortage
        .select("ID")
        .where(col("ID").isNotNull())  # ID is nullable; never pick a null
        .rdd
        .takeSample(False, 1, seed)
    )
    if not sampled_rows:
        raise ValueError("df_mortage has no rows with a non-null ID to choose from")
    return sampled_rows[0]["ID"]

# select_person_id() is evaluated once here, so the filter compares against a
# single fixed ID.
selected_df = df_mortage.filter(col("ID") == select_person_id())
selected_df.show()

+---+---------+------------------+------------------------+
| ID|spouse_ID|max_to_spend_month|possible_mortgage_amount|
+---+---------+------------------+------------------------+
|304|      304|           4144.03|                  823655|
+---+---------+------------------+------------------------+



## Take all the houses that have a lower price than the possible mortgage amount and are available

In [78]:
# Show 70% of the selected person's possible mortgage amount as the cell output.
# NOTE(review): the meaning of the 0.7 factor is not stated anywhere in this
# notebook - confirm and move it to a named constant.
selected_df.first()["possible_mortgage_amount"] * 0.7

576558.5

In [79]:
# Keep only the listings that are available and priced strictly below the
# selected person's possible mortgage amount.
available_houses_chosen_individual = df_housing.where(
    (col("Price") < selected_df.first().asDict()["possible_mortgage_amount"])
    & (col("Availability") == True)
)

# Project the columns of interest, most expensive first, and preview ten rows.
available_houses = (
    available_houses_chosen_individual
    .select("Address", "City", "Price", "Availability", "event_time")
    .sort(col("Price").desc())
)
available_houses.show(10)


+--------------------+-------------------+------+------------+--------------------+
|             Address|               City| Price|Availability|          event_time|
+--------------------+-------------------+------+------------+--------------------+
|        Hoepmaker 10|        Papendrecht|820000|        true|2023-12-01 02:35:...|
|   Rietpolderlaan 17|             Muiden|820000|        true|2023-12-01 03:09:...|
|        Looveen 14 .|            Wijster|815000|        true|2023-12-01 00:08:...|
|Willem van Guliks...|             Arnhem|810000|        true|2023-11-30 21:40:...|
|Cornelis Troostla...|              Heeze|800000|        true|2023-11-30 20:18:...|
|    Vriezenbeemden 6|            Helmond|800000|        true|2023-12-01 01:42:...|
|      Astersingel 23|Berkel en Rodenrijs|800000|        true|2023-11-30 21:21:...|
|  Grevelingenhout 37|          Bruinisse|800000|        true|2023-11-30 21:36:...|
|        't Höltje 42|             Helden|800000|        true|2023-11-30 22: