## Initialize Google BigQuery tables

In [1]:
import sys
# Install the Google Cloud client libraries by running pip through the same
# interpreter that backs this kernel (sys.executable), so the packages are
# installed for the environment the notebook actually executes in.
# Disabled step: uncomment to uninstall an existing google-cloud-storage first.
#!{sys.executable} -m pip uninstall -q google-cloud-storage
!{sys.executable} -m pip install -q google-cloud-storage
!{sys.executable} -m pip install -q google-cloud-bigquery

## Define the schema (StructType) of each table so that the DataFrames are loaded with the correct column types

In [1]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, FloatType, TimestampType

# Explicit schema for the individuals CSV. Columns are listed in file order as
# (name, Spark type) pairs; every field is declared nullable.
schema_struct_ind = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("ID", IntegerType()),
        ("age", IntegerType()),
        ("has_spouse", BooleanType()),
        ("spouse_ID", IntegerType()),
        ("gross_salary", IntegerType()),
        ("has_student_loan", BooleanType()),
        ("student_loan_amount", IntegerType()),
        ("has_general_loan", BooleanType()),
        ("general_loan_amount", IntegerType()),
        ("has_alimony", BooleanType()),
        ("alimony_amount", IntegerType()),
    )
])

# Explicit schema for the spouse CSV. Columns are listed in file order as
# (name, Spark type) pairs; every field is declared nullable.
schema_struct_spouse = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("spouse_ID", IntegerType()),
        ("age", IntegerType()),
        ("gross_salary", IntegerType()),
        ("has_student_loan", BooleanType()),
        ("student_loan_amount", IntegerType()),
        ("has_general_loan", BooleanType()),
        ("general_loan_amount", IntegerType()),
        ("has_alimony", BooleanType()),
        ("alimony_amount", IntegerType()),
    )
])

# Explicit schema for the house-pricing CSV. Columns are listed in file order as
# (name, Spark type) pairs; every field is declared nullable. Most descriptive
# columns are kept as strings; only Price, the neighbourhood price estimate,
# Availability and event_time carry non-string types.
schema_struct_house = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Address", StringType()),
        ("City", StringType()),
        ("Price", IntegerType()),
        ("Lot_size", StringType()),
        ("Living_space_size", StringType()),
        ("Build_year", StringType()),
        ("Build_type", StringType()),
        ("House_type", StringType()),
        ("Roof", StringType()),
        ("Rooms", StringType()),
        ("Toilet", StringType()),
        ("Floors", StringType()),
        ("Energy_label", StringType()),
        ("Position", StringType()),
        ("Garden", StringType()),
        ("Estimated_neighbourhood_price_per", FloatType()),
        ("Availability", BooleanType()),
        ("event_time", TimestampType()),
    )
])


## Load the DataFrames from the datasets in the Google Cloud Storage bucket

In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StructType

# Application settings: connect to the Spark master at spark-master:7077, name
# the application, and set the driver memory plus one core each for the driver
# and every executor.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("DataSourceSinkExample")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; getOrCreate()
# returns an already-active session if one exists.
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .getOrCreate()
)

# Register the GCS connector classes on the Hadoop configuration so that paths
# with the gs:// scheme can be resolved.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")

# GCS locations of the source CSV files. Despite the "output_" prefix, these
# paths are only read from in this cell.
output_path_ind = 'gs://data_degroup11/individuals_updated2.csv'
output_path_spouse = 'gs://data_degroup11/spouse_updated2.csv'
output_path_house = 'gs://data_degroup11/house_pricing_updated2.csv'

# Read each CSV with its explicit schema; the header option makes Spark treat
# the first row as column names instead of data.
df_individuals = (
    spark.read.format("csv")
    .schema(schema_struct_ind)
    .option("header", True)
    .load(output_path_ind)
)
df_spouse = (
    spark.read.format("csv")
    .schema(schema_struct_spouse)
    .option("header", True)
    .load(output_path_spouse)
)
df_housepricing = (
    spark.read.format("csv")
    .schema(schema_struct_house)
    .option("header", True)
    .load(output_path_house)
)

# Show the column names and types of the individuals DataFrame.
df_individuals.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- has_spouse: boolean (nullable = true)
 |-- spouse_ID: integer (nullable = true)
 |-- gross_salary: integer (nullable = true)
 |-- has_student_loan: boolean (nullable = true)
 |-- student_loan_amount: integer (nullable = true)
 |-- has_general_loan: boolean (nullable = true)
 |-- general_loan_amount: integer (nullable = true)
 |-- has_alimony: boolean (nullable = true)
 |-- alimony_amount: integer (nullable = true)



In [4]:
# Show the column names and types of the house-pricing DataFrame.
df_housepricing.printSchema()

root
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Lot_size: string (nullable = true)
 |-- Living_space_size: string (nullable = true)
 |-- Build_year: string (nullable = true)
 |-- Build_type: string (nullable = true)
 |-- House_type: string (nullable = true)
 |-- Roof: string (nullable = true)
 |-- Rooms: string (nullable = true)
 |-- Toilet: string (nullable = true)
 |-- Floors: string (nullable = true)
 |-- Energy_label: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Garden: string (nullable = true)
 |-- Estimated_neighbourhood_price_per: float (nullable = true)
 |-- Availability: boolean (nullable = true)
 |-- event_time: timestamp (nullable = true)



# Save the DataFrames to BigQuery

In [5]:
def write_to_bigquery(df, table, temporary_gcs_bucket="temp_degroup11", mode="overwrite"):
    """Write a Spark DataFrame to a BigQuery table via the 'bigquery' data source.

    Args:
        df: The Spark DataFrame to write.
        table: Destination table identifier, e.g.
            'degroup11.group11dataset.individuals'.
        temporary_gcs_bucket: Bucket name passed as the connector's
            ``temporaryGcsBucket`` option.
        mode: Spark save mode. The default "overwrite" replaces the
            destination table's existing contents.

    Returns:
        None. The write is performed as a side effect.
    """
    (
        df.write.format('bigquery')
        .option('table', table)
        .option("temporaryGcsBucket", temporary_gcs_bucket)
        .mode(mode)
        .save()
    )


# One call per table: same staging bucket and save mode, different destination.
write_to_bigquery(df_individuals, 'degroup11.group11dataset.individuals')
write_to_bigquery(df_spouse, 'degroup11.group11dataset.spouse')
write_to_bigquery(df_housepricing, 'degroup11.group11dataset.house_pricing')
print("dataframes have been written to the Google BigQuery")

dataframes have been written to the Google BigQuery


In [6]:
# Stop the Spark session now that the tables have been written.
spark.stop()