In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from datetime import datetime
import pyspark.sql.functions as F
from pyspark.sql.functions import expr
from pyspark.sql.window import Window

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("DataSourceSinkExample")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

## Get dataframes from google cloud

In [2]:


#  Google Storage File Path
gsc_file_path = 'gs://data_degroup11/house_pricing.csv'  #  use your gcp bucket name. Also upload sales.csv first
gsc_file_path_2 = 'gs://data_degroup11/individuals.csv'
gsc_file_path_3 = 'gs://data_degroup11/spouse.csv'
# Create data frame
df_house = spark.read.format("csv").option("header", "true") \
       .load(gsc_file_path)
df_house.printSchema()

df_ind= spark.read.format("csv").option("header", "true") \
       .load(gsc_file_path_2)
df_ind.printSchema()

df_spouse = spark.read.format("csv").option("header", "true") \
       .load(gsc_file_path_3)
df_spouse.printSchema()

root
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Lot_size: string (nullable = true)
 |-- Living_space_size: string (nullable = true)
 |-- Build_year: string (nullable = true)
 |-- Build_type: string (nullable = true)
 |-- House_type: string (nullable = true)
 |-- Roof: string (nullable = true)
 |-- Rooms: string (nullable = true)
 |-- Toilet: string (nullable = true)
 |-- Floors: string (nullable = true)
 |-- Energy_label: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Garden: string (nullable = true)
 |-- Estimated_neighbourhood_price_per: string (nullable = true)

root
 |-- ID: string (nullable = true)
 |-- age: string (nullable = true)
 |-- has_spouse: string (nullable = true)
 |-- gross_salary: string (nullable = true)
 |-- has_student_loan: string (nullable = true)
 |-- student_loan_amount: string (nullable = true)
 |-- has_general_loan: string (nullable = true)
 |-- general_loan_amount

# Preprocess the dataframes

In [3]:
# remove missing values
df_ind = df_ind.na.fill({'has_alimony': False})
df_spouse = df_spouse.na.fill({'has_alimony': False})

## Preproceses the individuals & Spouse dataframe

In [4]:
from pyspark.sql.functions import col, expr, lit, current_timestamp, when, rand

# Print the column names of df_ind
print("Column Names of df_ind:")
print(df_ind.columns)

# Rename the 'ID' column to 'spouse_ID' in df_spouse
df_spouse = df_spouse.withColumnRenamed('ID', 'spouse_ID')

# Add a new column 'spouse_id' with the same values as 'ID'
df_ind = df_ind.withColumn('spouse_ID', col('ID'))

# Reorder the columns based on the desired configuration
new_columns = ['ID', 'age', 'has_spouse', 'spouse_ID', 'gross_salary', 'has_student_loan', 'student_loan_amount', 'has_general_loan', 'general_loan_amount', 'has_alimony', 'alimony_amount']
df_ind = df_ind.select(*new_columns)

# Round up the values in the alimony_amount column and cast to integer
df_ind = df_ind.withColumn("alimony_amount", col("alimony_amount").cast("int"))
df_ind = df_ind.withColumn("alimony_amount", expr("ROUND(alimony_amount)").cast("int"))
df_ind = df_ind.withColumn("has_spouse", col("has_spouse").cast("boolean"))


df_spouse = df_spouse.withColumn("alimony_amount", col("alimony_amount").cast("int"))
df_spouse = df_spouse.withColumn("alimony_amount", expr("ROUND(alimony_amount)").cast("int"))

df_ind.printSchema()

Column Names of df_ind:
['ID', 'age', 'has_spouse', 'gross_salary', 'has_student_loan', 'student_loan_amount', 'has_general_loan', 'general_loan_amount', 'has_alimony', 'alimony_amount']
root
 |-- ID: string (nullable = true)
 |-- age: string (nullable = true)
 |-- has_spouse: boolean (nullable = true)
 |-- spouse_ID: string (nullable = true)
 |-- gross_salary: string (nullable = true)
 |-- has_student_loan: string (nullable = true)
 |-- student_loan_amount: string (nullable = true)
 |-- has_general_loan: string (nullable = true)
 |-- general_loan_amount: string (nullable = true)
 |-- has_alimony: string (nullable = false)
 |-- alimony_amount: integer (nullable = true)



## Preprocess the housing dataframe

In [5]:
# Add a new column 'Availability' with random True or False values
df_house = df_house.withColumn('Availability', rand() < lit(0.5))

# Clean the 'Price' column and convert it to integer
df_house = df_house.withColumn('Price', expr("CAST(REGEXP_REPLACE(SUBSTR(Price, 3), '[^0-9]', '') AS INT)"))

print(df_house.count())
print("there are ", df_house.distinct().count(), "rows in the dataframe")
#print(df_house.select('Address', "City", "Price").distinct().count())

df_house = df_house.dropDuplicates(("Address","City", "Price"))
print("now there are ", df_house.count(), "rows in the dataframe")

# Define a window specification
window_spec = Window().orderBy(F.monotonically_increasing_id())

# Add a new column 'event_time' with a timestamp expression and a 5-second interval
df_house = df_house.withColumn('event_time', current_timestamp() + F.expr("interval 5 seconds") * F.row_number().over(window_spec))


5555
there are  5519 rows in the dataframe
now there are  5489 rows in the dataframe


## Save the dataframes to the google cloud bucket

In [6]:
newdf=df_house.select('event_time')
newdf.show()


newDf_house = df_house.select('Availability', 'Price')  # select columns

newDf_house.show()

# Specify the GCS path where you want to save the new DataFrames
output_path_ind = 'gs://data_degroup11/individuals_updated2.csv'
output_path_spouse = 'gs://data_degroup11/spouse_updated2.csv'
output_path_house = 'gs://data_degroup11/house_pricing_updated2.csv'

# Write the new DataFrames to GCS
df_ind.write.format("csv").option("header", "true").mode("overwrite").save(output_path_ind)
df_spouse.write.format("csv").option("header", "true").mode("overwrite").save(output_path_spouse)
df_house.write.format("csv").option("header", "true").mode("overwrite").save(output_path_house)

+--------------------+
|          event_time|
+--------------------+
|2023-12-04 18:15:...|
|2023-12-04 18:15:...|
|2023-12-04 18:15:...|
|2023-12-04 18:15:...|
|2023-12-04 18:15:...|
|2023-12-04 18:16:...|
|2023-12-04 18:16:...|
|2023-12-04 18:16:...|
|2023-12-04 18:16:...|
|2023-12-04 18:16:...|
|2023-12-04 18:16:...|
|2023-12-04 18:16:...|
|2023-12-04 18:16:...|
|2023-12-04 18:16:...|
|2023-12-04 18:16:...|
|2023-12-04 18:16:...|
|2023-12-04 18:16:...|
|2023-12-04 18:17:...|
|2023-12-04 18:17:...|
|2023-12-04 18:17:...|
+--------------------+
only showing top 20 rows

+------------+-------+
|Availability|  Price|
+------------+-------+
|        true| 462500|
|       false| 350000|
|       false| 625000|
|        true| 420000|
|       false| 375000|
|        true| 189000|
|        true| 365000|
|        true|1890000|
|       false| 580000|
|        true| 250000|
|       false| 415000|
|        true| 325000|
|        true| 570000|
|        true| 425000|
|        true| 525000|
|       

In [7]:
spark.stop()