In [7]:
import sys
!{sys.executable} -m pip install google-cloud-storage

Collecting google-cloud-storage
  Downloading google_cloud_storage-2.13.0-py2.py3-none-any.whl.metadata (6.1 kB)
Downloading google_cloud_storage-2.13.0-py2.py3-none-any.whl (121 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.1/121.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: google-cloud-storage
Successfully installed google-cloud-storage-2.13.0


## Get the datasets from the Google Cloud Storage bucket

In [22]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Cluster address, application name and resource limits for this notebook's
# Spark application, collected in a single SparkConf.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("DataSourceSinkExample")
    .setAll([
        ("spark.driver.memory", "2g"),
        ("spark.executor.cores", "1"),
        ("spark.driver.cores", "1"),
    ])
)

# The SparkSession is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the Hadoop file-system implementations used for the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for fs_key, fs_class in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(fs_key, fs_class)

# GCS locations of the CSV files loaded below. Despite the "output_path_"
# prefix, this cell only reads from these paths.
output_path_ind = 'gs://data_degroup11/individuals_updated2.csv'
output_path_spouse = 'gs://data_degroup11/spouse_updated2.csv'
output_path_house = 'gs://data_degroup11/house_pricing_updated2.csv'

# Load each CSV with its first row as the header. No schema is supplied or
# inferred, so every column is read as a string (see the printed schema).
df_individuals, df_spouse, df_housepricing = (
    spark.read.csv(csv_path, header=True)
    for csv_path in (output_path_ind, output_path_spouse, output_path_house)
)

df_individuals.printSchema()

root
 |-- ID: string (nullable = true)
 |-- age: string (nullable = true)
 |-- has_spouse: string (nullable = true)
 |-- spouse_ID: string (nullable = true)
 |-- gross_salary: string (nullable = true)
 |-- has_student_loan: string (nullable = true)
 |-- student_loan_amount: string (nullable = true)
 |-- has_general_loan: string (nullable = true)
 |-- general_loan_amount: string (nullable = true)
 |-- has_alimony: string (nullable = true)
 |-- alimony_amount: string (nullable = true)



# Saving the DataFrames to BigQuery

In [26]:
# Each DataFrame paired with the fully qualified BigQuery table it is written to.
bigquery_targets = (
    (df_individuals, 'degroup11.group11dataset.individuals'),
    (df_spouse, 'degroup11.group11dataset.spouse'),
    (df_housepricing, 'degroup11.group11dataset.house_pricing'),
)

# Write the frames one after another, in the order listed above. Every write
# uses the same temporary GCS bucket option and overwrites the target table.
for frame, table_id in bigquery_targets:
    writer = frame.write.format('bigquery')
    writer = writer.option('table', table_id)
    writer = writer.option("temporaryGcsBucket", "temp_degroup11")
    writer.mode("overwrite").save()

print("dataframes have been written to the Google BigQuery")

dataframes have been written to the Google BigQuery


In [28]:
# Read back a sample of the `individuals` table to confirm the write succeeded.
# The BigQuery client is imported here because no visible cell brings the
# `bigquery` name into scope; without it this cell raises NameError on a fresh
# kernel.
from google.cloud import bigquery

client = bigquery.Client(project="degroup11")   # use your project id

QUERY = (
    'SELECT * FROM `degroup11.group11dataset.individuals` LIMIT 100')   # use the correct project id, etc.
query_job = client.query(QUERY)  # API request
rows = query_job.result()  # Waits for query to finish

# Print each returned row; the LIMIT above caps the output at 100 rows.
for row in rows:
    print(row)

Row(('106', '18', 'true', '106', '75526', 'false', '0', 'true', '9288', 'false', '0'), {'ID': 0, 'age': 1, 'has_spouse': 2, 'spouse_ID': 3, 'gross_salary': 4, 'has_student_loan': 5, 'student_loan_amount': 6, 'has_general_loan': 7, 'general_loan_amount': 8, 'has_alimony': 9, 'alimony_amount': 10})
Row(('153', '18', 'false', '153', '134600', 'false', '0', 'true', '5314', 'true', '1200'), {'ID': 0, 'age': 1, 'has_spouse': 2, 'spouse_ID': 3, 'gross_salary': 4, 'has_student_loan': 5, 'student_loan_amount': 6, 'has_general_loan': 7, 'general_loan_amount': 8, 'has_alimony': 9, 'alimony_amount': 10})
Row(('211', '18', 'true', '211', '144798', 'true', '26010', 'true', '1815', 'false', '0'), {'ID': 0, 'age': 1, 'has_spouse': 2, 'spouse_ID': 3, 'gross_salary': 4, 'has_student_loan': 5, 'student_loan_amount': 6, 'has_general_loan': 7, 'general_loan_amount': 8, 'has_alimony': 9, 'alimony_amount': 10})
Row(('354', '18', 'true', '354', '150580', 'true', '52758', 'true', '8222', 'false', '0'), {'ID': 

In [None]:
# Stop the Spark session created earlier in this notebook.
spark.stop()