In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import col,lit, rand


# Assemble the Spark configuration in one fluent chain; every SparkConf setter
# returns the conf object itself, so the calls can be strung together.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("DataSourceSinkExample")
    .setAll(
        [
            ("spark.driver.memory", "2g"),
            ("spark.executor.cores", "1"),
            ("spark.driver.cores", "1"),
        ]
    )
)

# The session is the entry point to the Spark SQL engine; getOrCreate()
# hands back an already-running session when one exists.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the filesystem implementations Hadoop should use for the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
_gcs_connector_settings = (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
)
for _setting_name, _setting_value in _gcs_connector_settings:
    conf.set(_setting_name, _setting_value)

def _load_csv_with_header(path):
    """Read the CSV at *path* into a DataFrame, using its first row as the header."""
    reader = spark.read.format("csv").option("header", "true")
    return reader.load(path)


# Input files on Google Cloud Storage. Use your own GCP bucket name and
# upload the three CSV files to it before running.
gsc_file_path = 'gs://data_degroup11/house_pricing.csv'
gsc_file_path_2 = 'gs://data_degroup11/individuals.csv'
gsc_file_path_3 = 'gs://data_degroup11/spouse.csv'

# Load each input and print its schema as a quick sanity check.
df_house = _load_csv_with_header(gsc_file_path)
df_house.printSchema()

df_ind = _load_csv_with_header(gsc_file_path_2)
df_ind.printSchema()

df_spouse = _load_csv_with_header(gsc_file_path_3)
df_spouse.printSchema()

# Same house-pricing file read from a local path instead of GCS.
# NOTE(review): `df` is not used again in the visible cells — confirm whether
# this read is still needed.
df = _load_csv_with_header("/home/jovyan/data/house_pricing.csv")
df.printSchema()

# Replace missing 'has_alimony' entries with False in both tables.
# NOTE(review): the CSVs are loaded without schema inference — confirm the
# column type makes this fill produce the intended values.
df_ind = df_ind.fillna({'has_alimony': False})
df_spouse = df_spouse.fillna({'has_alimony': False})

# Preview just the filled column of the individuals table.
newDf_ind = df_ind.select(col('has_alimony'))
newDf_ind.show()

# Preview just the filled column of the spouse table.
newDf_spouse = df_spouse.select(col('has_alimony'))
newDf_spouse.show()

# Rename the 'ID' column to 'spouse_ID' in df_spouse
df_spouse = df_spouse.withColumnRenamed('ID', 'spouse_ID')

# Show the updated DataFrame
df_spouse.show()

# Add a new column 'spouse_ID' with the same values as 'ID'.
# The name is spelled exactly as in the select list below (and as in
# df_spouse) so the lookup does not rely on Spark's case-insensitive column
# resolution, which is only the default and fails when
# spark.sql.caseSensitive is enabled.
df_ind = df_ind.withColumn('spouse_ID', col('ID'))

# Show the updated DataFrame
df_ind.show()

# Print the column names of df_ind
print("Column Names of df_ind:")
print(df_ind.columns)

# Reorder the columns based on the desired configuration
new_columns = ['ID', 'age', 'has_spouse', 'spouse_ID', 'gross_salary', 'has_student_loan', 'student_loan_amount', 'has_general_loan', 'general_loan_amount', 'has_alimony', 'alimony_amount']
df_ind = df_ind.select(*new_columns)

# Flag each house as available (True) or not (False) by comparing a random
# draw against 0.5. rand() is called without a seed, so the flags differ
# from one run of the notebook to the next.
_availability_flag = rand() < lit(0.5)
df_house = df_house.withColumn('Availability', _availability_flag)

# Preview only the new column.
newDf_house = df_house.select(col('Availability'))
newDf_house.show()

def _csv_writer(frame):
    """Return a writer for *frame* that emits CSV with a header row and overwrites existing output."""
    return frame.write.format("csv").option("header", "true").mode("overwrite")


# GCS destinations for the updated DataFrames.
output_path_ind = 'gs://data_degroup11/individuals_updated2.csv'
output_path_spouse = 'gs://data_degroup11/spouse_updated2.csv'
output_path_house = 'gs://data_degroup11/house_pricing_updated2.csv'

# GCS destinations for the partitioned test output.
output_path_ind_test = 'gs://data_degroup11/individuals_updated_test.csv'
output_path_spouse_test = 'gs://data_degroup11/spouse_updated_test.csv'

# Partitioned output: one sub-directory per distinct ID / spouse_ID value.
# NOTE(review): if the IDs are unique per row this produces one directory per
# row — confirm that is intended.
_csv_writer(df_ind).partitionBy('ID').save(output_path_ind_test)
_csv_writer(df_spouse).partitionBy('spouse_ID').save(output_path_spouse_test)

# Unpartitioned output of all three updated DataFrames.
_csv_writer(df_ind).save(output_path_ind)
_csv_writer(df_spouse).save(output_path_spouse)
_csv_writer(df_house).save(output_path_house)

In [None]:
# Stop the Spark session once the notebook's work is finished.
spark.stop()

In [None]:
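# Shell commands (not Python) for merging the part files Spark wrote into single CSV files.
# NOTE(review): `hadoop fs -getmerge` expects a local destination path, and the script above
# writes to .../individuals_updated_test.csv and .../spouse_updated_test.csv — adjust the
# source and destination paths before running these.
#hadoop fs -getmerge -nl gs://data_degroup11/individuals_updated_test/ gs://data_degroup11/individuals_updated_test.csv
#hadoop fs -getmerge -nl gs://data_degroup11/spouse_updated_test/ gs://data_degroup11/spouse_updated_test/spouse_updated_test.csv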