## Initialize Google Big Query tables

In [15]:
import sys
#!{sys.executable} -m pip uninstall -q google-cloud-storage
!{sys.executable} -m pip install -q google-cloud-storage
!{sys.executable} -m pip install -q google-cloud-bigquery

In [18]:
from google.cloud import bigquery

# Initialize the BigQuery client
client = bigquery.Client(project="degroup11")  

# Define the table schema
schema = [
    bigquery.SchemaField("ID", "INTEGER"),
    bigquery.SchemaField("age", "INTEGER"),
    bigquery.SchemaField("has_spouse", "BOOLEAN"),
    bigquery.SchemaField("spouse_ID", "INTEGER"),
    bigquery.SchemaField("gross_salary", "INTEGER"),
    bigquery.SchemaField("has_student_loan", "BOOLEAN"),
    bigquery.SchemaField("student_loan_amount", "INTEGER"),
    bigquery.SchemaField("has_general_loan", "BOOLEAN"),
    bigquery.SchemaField("general_loan_amount", "INTEGER"),
    bigquery.SchemaField("has_alimony", "BOOLEAN"),
    bigquery.SchemaField("alimony_amount", "INTEGER"),
]

# Specify the BigQuery dataset and table
dataset_id = "group11dataset"  
table_id = "individuals"   

# Create the BigQuery table
table_ref = client.dataset(dataset_id).table(table_id)
table = bigquery.Table(table_ref, schema=schema)

# Create the table if it doesn't exist
client.create_table(table, exists_ok=True)

print(f"Table '{dataset_id}.{table_id}' created successfully.")

# Define the table schema
schema_spouse = [
    bigquery.SchemaField("spouse_ID", "INTEGER"),
    bigquery.SchemaField("age", "INTEGER"),
    bigquery.SchemaField("gross_salary", "INTEGER"),
    bigquery.SchemaField("has_student_loan", "BOOLEAN"),
    bigquery.SchemaField("student_loan_amount", "INTEGER"),
    bigquery.SchemaField("has_general_loan", "BOOLEAN"),
    bigquery.SchemaField("general_loan_amount", "INTEGER"),
    bigquery.SchemaField("has_alimony", "BOOLEAN"),
    bigquery.SchemaField("alimony_amount", "INTEGER"),
]

# Specify the BigQuery dataset and table
dataset_id = "group11dataset"    
table_id_spouse = "spouse"    

# Create the BigQuery table
table_ref_spouse = client.dataset(dataset_id).table(table_id_spouse)
table_spouse = bigquery.Table(table_ref_spouse, schema=schema_spouse)

# Create the table if it doesn't exist
client.create_table(table_spouse, exists_ok=True)
print(f"Table '{dataset_id}.{table_id_spouse}' created successfully.")

 #Define the table schema for the house data
schema_house = [
    bigquery.SchemaField("Address", "STRING"),
    bigquery.SchemaField("City", "STRING"),
    bigquery.SchemaField("Price", "INTEGER"),
    bigquery.SchemaField("Lot_size", "STRING"),
    bigquery.SchemaField("Living_space_size", "STRING"),
    bigquery.SchemaField("Build_year", "STRING"),
    bigquery.SchemaField("Build_type", "STRING"),
    bigquery.SchemaField("House_type", "STRING"),
    bigquery.SchemaField("Roof", "STRING"),
    bigquery.SchemaField("Rooms", "STRING"),
    bigquery.SchemaField("Toilet", "STRING"),
    bigquery.SchemaField("Floors", "STRING"),
    bigquery.SchemaField("Energy_label", "STRING"),
    bigquery.SchemaField("Position", "STRING"),
    bigquery.SchemaField("Garden", "STRING"),
    bigquery.SchemaField("Estimated_neighbourhood_price_per", "FLOAT"),
    bigquery.SchemaField("Availability", "BOOLEAN"),
    bigquery.SchemaField("event_time", "TIMESTAMP"),  
]

# Specify the BigQuery dataset and table
dataset_id = "group11dataset"  
table_id_house = "house_pricing"  

# Create the BigQuery table for house data
table_ref_house = client.dataset(dataset_id).table(table_id_house)
table_house = bigquery.Table(table_ref_house, schema=schema_house)

# Create the table if it doesn't exist
client.create_table(table_house, exists_ok=True)

print(f"Table '{dataset_id}.{table_id_house}' created successfully.")

Table 'group11dataset.individuals' created successfully.
Table 'group11dataset.spouse' created successfully.
Table 'group11dataset.house_pricing' created successfully.


## get the datasets from the Google Bucket

In [19]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("DataSourceSinkExample")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Specify the GCS path where you want to save the new DataFrames
output_path_ind = 'gs://data_degroup11/individuals_updated2.csv'
output_path_spouse = 'gs://data_degroup11/spouse_updated2.csv'
output_path_house = 'gs://data_degroup11/house_pricing_updated2.csv'

df_individuals =spark.read.csv(output_path_ind, header=True)
df_spouse = spark.read.csv(output_path_spouse, header=True)
df_housepricing = spark.read.csv(output_path_house, header = True)

df_individuals.printSchema()

root
 |-- ID: string (nullable = true)
 |-- age: string (nullable = true)
 |-- has_spouse: string (nullable = true)
 |-- spouse_ID: string (nullable = true)
 |-- gross_salary: string (nullable = true)
 |-- has_student_loan: string (nullable = true)
 |-- student_loan_amount: string (nullable = true)
 |-- has_general_loan: string (nullable = true)
 |-- general_loan_amount: string (nullable = true)
 |-- has_alimony: string (nullable = true)
 |-- alimony_amount: string (nullable = true)



# Saving the dataframes to the bigquery

In [20]:
df_individuals.write.format('bigquery') \
  .option('table', 'degroup11.group11dataset.individuals') \
  .option("temporaryGcsBucket", "temp_degroup11") \
  .mode("overwrite").save()
df_spouse.write.format('bigquery') \
  .option('table', 'degroup11.group11dataset.spouse') \
  .option("temporaryGcsBucket", "temp_degroup11") \
  .mode("overwrite").save()
df_housepricing.write.format('bigquery') \
  .option('table', 'degroup11.group11dataset.house_pricing') \
  .option("temporaryGcsBucket", "temp_degroup11") \
  .mode("overwrite").save()
print("dataframes have been written to the Google BigQuery")

dataframes have been written to the Google BigQuery


In [21]:
# Perform a query.
client = bigquery.Client(project="degroup11")   # use your project id

QUERY = (
    'SELECT * FROM `degroup11.group11dataset.individuals` LIMIT 10')   # use the correct project id, etc.
query_job = client.query(QUERY)  # API request
rows = query_job.result()  # Waits for query to finish

for row in rows:
    print(row)

Row(('106', '18', 'true', '106', '75526', 'false', '0', 'true', '9288', 'false', '0'), {'ID': 0, 'age': 1, 'has_spouse': 2, 'spouse_ID': 3, 'gross_salary': 4, 'has_student_loan': 5, 'student_loan_amount': 6, 'has_general_loan': 7, 'general_loan_amount': 8, 'has_alimony': 9, 'alimony_amount': 10})
Row(('153', '18', 'false', '153', '134600', 'false', '0', 'true', '5314', 'true', '1200'), {'ID': 0, 'age': 1, 'has_spouse': 2, 'spouse_ID': 3, 'gross_salary': 4, 'has_student_loan': 5, 'student_loan_amount': 6, 'has_general_loan': 7, 'general_loan_amount': 8, 'has_alimony': 9, 'alimony_amount': 10})
Row(('211', '18', 'true', '211', '144798', 'true', '26010', 'true', '1815', 'false', '0'), {'ID': 0, 'age': 1, 'has_spouse': 2, 'spouse_ID': 3, 'gross_salary': 4, 'has_student_loan': 5, 'student_loan_amount': 6, 'has_general_loan': 7, 'general_loan_amount': 8, 'has_alimony': 9, 'alimony_amount': 10})
Row(('354', '18', 'true', '354', '150580', 'true', '52758', 'true', '8222', 'false', '0'), {'ID': 

In [22]:
spark.stop()