## This notebook writes a BigQuery table containing all the houses that a randomly chosen individual can afford. The chosen individual is also written to a separate BigQuery table, which acts as a cookie so that the individual can be identified.

In [22]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
import sys
!{sys.executable} -m pip install -q google-cloud-bigquery
from google.cloud import bigquery

# Build the Spark configuration as one fluent chain (each setter returns the conf).
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("BigqueryExample")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the GCS connector classes for the gs:// scheme on the Hadoop configuration.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for hadoop_key, impl_class in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(hadoop_key, impl_class)

# Load tables

In [23]:
from pyspark.sql.functions import col
# Read the mortgage table from BigQuery and cast the mortgage amount to an integer.
# Table id format: project_id.dataset.table_name -- use your own project id.
# The id carries no surrounding whitespace, matching the ids used by the write cells.
df_mortgage = (
    spark.read
    .format("bigquery")
    .load("degroup11.group11dataset.mortgage_amount_batch")
    .withColumn("possible_mortgage_amount", col("possible_mortgage_amount").cast("int"))
)

df_mortgage.printSchema()
df_mortgage.show()

root
 |-- ID: long (nullable = true)
 |-- spouse_ID: long (nullable = true)
 |-- max_to_spend_month: long (nullable = true)
 |-- possible_mortgage_amount: integer (nullable = true)

+---+---------+------------------+------------------------+
| ID|spouse_ID|max_to_spend_month|possible_mortgage_amount|
+---+---------+------------------+------------------------+
|734|      734|              1280|                  254494|
|330|      330|              3840|                  763401|
|321|      321|              1025|                  203744|
|459|      459|              3841|                  763613|
|972|      972|              -255|                  -50718|
|128|      128|              3586|                  712909|
| 14|       14|              4098|                  814678|
|758|      758|              5378|                 1068931|
|168|      168|               515|                  102510|
|322|      322|               515|                  102440|
|106|      106|              2051|    

In [24]:
# Read the house listings from BigQuery and cast the price to an integer.
# Table id format: project_id.dataset.table_name -- use your own project id.
# The id carries no surrounding whitespace, matching the ids used by the write cells.
df_housing = (
    spark.read
    .format("bigquery")
    .load("degroup11.group11dataset.house_pricing")
    .withColumn("Price", col("Price").cast("int"))
)
df_housing.printSchema()


root
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Lot_size: string (nullable = true)
 |-- Living_space_size: string (nullable = true)
 |-- Build_year: string (nullable = true)
 |-- Build_type: string (nullable = true)
 |-- House_type: string (nullable = true)
 |-- Roof: string (nullable = true)
 |-- Rooms: string (nullable = true)
 |-- Toilet: string (nullable = true)
 |-- Floors: string (nullable = true)
 |-- Energy_label: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Garden: string (nullable = true)
 |-- Estimated_neighbourhood_price_per: double (nullable = true)
 |-- Availability: boolean (nullable = true)
 |-- event_time: timestamp (nullable = true)



## Take a random person from the dataset

In [25]:
from pyspark.sql.functions import col, udf, window
from random import randint

def select_person_id(mortgage_df=None):
    """Return the ID of one randomly chosen person that exists in the mortgage table.

    The ID is drawn from the distinct, non-null values of the ``ID`` column
    rather than from ``1..row_count``, so the result always matches at least
    one row even when IDs are sparse, duplicated or do not start at 1.

    Args:
        mortgage_df: DataFrame-like object with an ``ID`` column. Defaults to
            the notebook-level ``df_mortgage``.

    Returns:
        One ID value taken from the ``ID`` column.

    Raises:
        ValueError: if the column holds no non-null IDs.
    """
    source_df = df_mortgage if mortgage_df is None else mortgage_df
    # The distinct IDs are collected to the driver; each collected row has one field.
    candidate_ids = [
        row[0]
        for row in source_df.select("ID").distinct().collect()
        if row[0] is not None
    ]
    if not candidate_ids:
        raise ValueError("mortgage table contains no IDs to choose from")
    chosen_ID = candidate_ids[randint(0, len(candidate_ids) - 1)]
    return chosen_ID

# Draw the person once; the drawn ID is passed into the filter as a plain Python value.
chosen_person_id = select_person_id()
selected_df = df_mortgage.where(col("ID") == chosen_person_id)
selected_df.show()

+---+---------+------------------+------------------------+
| ID|spouse_ID|max_to_spend_month|possible_mortgage_amount|
+---+---------+------------------+------------------------+
|641|      641|              4152|                  825276|
+---+---------+------------------+------------------------+



In [26]:
# Quick look at the Availability flag of the listings.
availability_column = df_housing.select(col("Availability"))
availability_column.show()

+------------+
|Availability|
+------------+
|       false|
|        true|
|        true|
|        true|
|       false|
|       false|
|        true|
|        true|
|        true|
|        true|
|        true|
|        true|
|       false|
|        true|
|       false|
|       false|
|        true|
|        true|
|       false|
|       false|
+------------+
only showing top 20 rows



In [27]:
# Fetch the chosen person's row once. first() returns None when selected_df is
# empty (the drawn ID matched no row), so fail with a clear message instead of
# an AttributeError on None.
chosen_person = selected_df.first()
if chosen_person is None:
    raise ValueError(
        "selected_df is empty: the randomly drawn ID matched no row in the mortgage table"
    )
mortgage_budget = chosen_person["possible_mortgage_amount"]

# Keep the houses priced strictly below the budget that are flagged as available.
available_houses_chosen_individual = (
    df_housing
    .filter(col("Price") < mortgage_budget)
    .filter(col("Availability") == True)
)
available_houses_chosen_individual.show()

+--------------------+-----------+------+--------+-----------------+----------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+---------------------------------+------------+--------------------+
|             Address|       City| Price|Lot_size|Living_space_size|Build_year|    Build_type|          House_type|                Roof|               Rooms|              Toilet|              Floors|  Energy_label|            Position|              Garden|Estimated_neighbourhood_price_per|Availability|          event_time|
+--------------------+-----------+------+--------+-----------------+----------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+---------------------------------+------------+--------------------+
|         Vikinghof 1|  H

## Take all the houses that are priced below the possible mortgage amount and are available

In [28]:
# Budget of the selected person, read from the single selected row.
mortgage_budget = selected_df.first().asDict()["possible_mortgage_amount"]

# Two conditions, applied one after the other: cheaper than the budget, and still available.
is_affordable = col("Price") < mortgage_budget
is_on_market = col("Availability") == True
available_houses_chosen_individual = df_housing.filter(is_affordable).filter(is_on_market)

# Keep only the columns that are written out later, most expensive houses first.
available_houses = (
    available_houses_chosen_individual
    .select('Address', 'City', 'Price', 'Availability', "event_time")
    .orderBy(col("Price").desc())
)
available_houses.show(10)


+--------------------+--------------+------+------------+--------------------+
|             Address|          City| Price|Availability|          event_time|
+--------------------+--------------+------+------------+--------------------+
|   Raiffeisenlaan 46|       Utrecht|825000|        true|2023-12-05 17:18:...|
|      Molenstraat 47|       Monster|825000|        true|2023-12-05 16:24:...|
|      Noordstraat 60|    Bodegraven|825000|        true|2023-12-05 16:38:...|
|      Lindestraat 10|St. Willebrord|825000|        true|2023-12-05 15:57:...|
|    Nieuwehaven 19 C|          Edam|825000|        true|2023-12-05 16:33:...|
|      Papenstraat 15|      Deventer|825000|        true|2023-12-05 16:54:...|
|       Aekingaweg 12|     Appelscha|825000|        true|2023-12-05 12:01:...|
|J.W. van Puttestr...|        Ameide|825000|        true|2023-12-05 14:54:...|
|Maerten van Heems...|     Beverwijk|825000|        true|2023-12-05 16:05:...|
|Distelvlinderstra...|      Aalsmeer|825000|        

In [29]:
# Snapshot the first 500 rows of the price-sorted result on the driver and turn
# them back into a DataFrame. The schema is passed explicitly so this also works
# when take() returns no rows (nothing is affordable) or a column holds only
# nulls -- schema inference from the rows alone fails in both cases.
selected_houses = spark.createDataFrame(
    available_houses.take(500),
    schema=available_houses.schema,
)

# Show total and distinct row counts side by side; a difference means duplicate rows.
selected_houses.count(), selected_houses.distinct().count()

500

## Create the BigQuery table for the selected houses

In [30]:
# Initialize the BigQuery client
client = bigquery.Client(project="degroup11")

# Define the table schema. The types mirror the columns of selected_houses:
# Price is an integer column in the DataFrame, so it is declared INTEGER here.
schema = [
    bigquery.SchemaField("Address", "STRING"),
    bigquery.SchemaField("City", "STRING"),
    bigquery.SchemaField("Price", "INTEGER"),
    bigquery.SchemaField("Availability", "BOOLEAN"),
    bigquery.SchemaField("event_time", "TIMESTAMP"),
]

# Specify the BigQuery dataset and table
dataset_id = "group11dataset"
table_id = "available_houses_for_individual"

# Build the table reference from a DatasetReference (Client.dataset() is deprecated).
table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)
table = bigquery.Table(table_ref, schema=schema)

# Create the table if it doesn't exist
client.create_table(table, exists_ok=True)

Table(TableReference(DatasetReference('degroup11', 'group11dataset'), 'available_houses_for_individual'))

## Write the DataFrame of houses available to the individual to BigQuery

In [31]:
# Overwrite the target table with the selected houses, using the GCS bucket for temporary files.
(
    selected_houses.write
    .format("bigquery")
    .option("table", "degroup11.group11dataset.available_houses_for_individual")
    .option("temporaryGcsBucket", "temp_degroup11")
    .mode("overwrite")
    .save()
)

## Create the identifying table 

In [32]:
# Initialize the BigQuery client
client = bigquery.Client(project="degroup11")

# Define the table schema. The types mirror the columns of selected_df:
# ID and spouse_ID are long columns in the DataFrame, so they are declared INTEGER here.
schema = [
    bigquery.SchemaField("ID", "INTEGER"),
    bigquery.SchemaField("spouse_ID", "INTEGER"),
    bigquery.SchemaField("max_to_spend_month", "INTEGER"),
    bigquery.SchemaField("possible_mortgage_amount", "INTEGER"),
]

# Specify the BigQuery dataset and table
dataset_id = "group11dataset"
table_id = "cookie_ID_houses"

# Build the table reference from a DatasetReference (Client.dataset() is deprecated).
table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)
table = bigquery.Table(table_ref, schema=schema)

# Create the table if it doesn't exist
client.create_table(table, exists_ok=True)

Table(TableReference(DatasetReference('degroup11', 'group11dataset'), 'cookie_ID_houses'))

## Write dataframe cookie to bigquery

In [33]:
# Overwrite the cookie table with the selected person's row, using the GCS bucket for temporary files.
(
    selected_df.write
    .format("bigquery")
    .option("table", "degroup11.group11dataset.cookie_ID_houses")
    .option("temporaryGcsBucket", "temp_degroup11")
    .mode("overwrite")
    .save()
)

In [34]:
# Stop the Spark session now that both tables have been written.
spark.stop()