# Step 1 of the plan

In [1]:
import sys
!{sys.executable} -m pip install google-cloud-bigquery



In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("batch idea")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

df_individuals = spark.read.format("csv").option("header", "true") \
       .load("data/individuals_updated.csv")

df_spouse = spark.read.format("csv").option("header", "true") \
       .load("data/spouse_updated.csv")

df_house = spark.read.format("csv").option("header", "true") \
       .load("data/house_pricing.csv")


In [10]:

df_individuals.printSchema()

root
 |-- ID: string (nullable = true)
 |-- age: string (nullable = true)
 |-- has_spouse: string (nullable = true)
 |-- spouse_ID: string (nullable = true)
 |-- gross_salary: string (nullable = true)
 |-- has_student_loan: string (nullable = true)
 |-- student_loan_amount: string (nullable = true)
 |-- has_general_loan: string (nullable = true)
 |-- general_loan_amount: string (nullable = true)
 |-- has_alimony: string (nullable = true)
 |-- alimony_amount: string (nullable = true)



DataFrame[ID: string, age: string, has_spouse: boolean, spouse_ID: string, gross_salary: int, has_student_loan: string, student_loan_amount: string, has_general_loan: string, general_loan_amount: string, has_alimony: string, alimony_amount: int, spendable_income: int]

In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Assuming your DataFrame is named df
df = df_individuals
# Convert necessary columns to appropriate data types (e.g., from string to integer/boolean)
df = df.withColumn("gross_salary", col("gross_salary").cast("int"))
df = df.withColumn("alimony_amount", col("alimony_amount").cast("int"))
df = df.withColumn("has_spouse", col("has_spouse") == "True")

# Apply the formula
df = df.withColumn("spendable_income", 
                   when(col("has_spouse") == False, col("gross_salary") - 12 * col("alimony_amount"))
                   .otherwise(col("gross_salary")))

# Show the result
df.select("ID", "has_spouse", "gross_salary", "alimony_amount", "spendable_income").show()


+---+----------+------------+--------------+----------------+
| ID|has_spouse|gross_salary|alimony_amount|spendable_income|
+---+----------+------------+--------------+----------------+
|  1|     false|      191453|             0|          191453|
|  2|     false|       82586|             0|           82586|
|  3|     false|      151688|             0|          151688|
|  4|     false|      107499|             0|          107499|
|  5|     false|       31355|             0|           31355|
|  6|     false|      114946|          1657|           95062|
|  7|     false|      193937|             0|          193937|
|  8|     false|      108115|           515|          101935|
|  9|     false|      168304|             0|          168304|
| 10|     false|      137758|           497|          131794|
| 11|     false|      156277|             0|          156277|
| 12|     false|      119447|             0|          119447|
| 13|     false|      111993|             0|          111993|
| 14|   

In [36]:
from pyspark.sql.functions import col


df_comb = df_individuals.alias("ind").join(df_spouse.alias("sp"), col("ind.ID") ==  col("sp.spouse_ID"),"inner")

df_comb.printSchema()

root
 |-- ID: string (nullable = true)
 |-- age: string (nullable = true)
 |-- has_spouse: string (nullable = true)
 |-- spouse_ID: string (nullable = true)
 |-- gross_salary: string (nullable = true)
 |-- has_student_loan: string (nullable = true)
 |-- student_loan_amount: string (nullable = true)
 |-- has_general_loan: string (nullable = true)
 |-- general_loan_amount: string (nullable = true)
 |-- has_alimony: string (nullable = true)
 |-- alimony_amount: string (nullable = true)
 |-- spouse_ID: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gross_salary: string (nullable = true)
 |-- has_student_loan: string (nullable = true)
 |-- student_loan_amount: string (nullable = true)
 |-- has_general_loan: string (nullable = true)
 |-- general_loan_amount: string (nullable = true)
 |-- has_alimony: string (nullable = true)
 |-- alimony_amount: string (nullable = true)



## This is the house mortage amount without taking into account who has the highest salary

In [48]:
df = df_comb
# Convert necessary columns to appropriate data types (e.g., from string to integer/boolean)
df = df.withColumn("ind.gross_salary", col("ind.gross_salary").cast("int"))
df = df.withColumn("ind.alimony_amount", col("ind.alimony_amount").cast("int"))
df = df.withColumn("ind.has_spouse", col("ind.has_spouse") == "True")
df = df.withColumn("sp.gross_salary", col("sp.gross_salary").cast("int"))
df = df.withColumn("sp.alimony_amount", col("sp.alimony_amount").cast("int"))

# Apply the formula
df = df.withColumn("house_spendable_income", 
                   when(col("ind.has_spouse") == False, col("ind.gross_salary") - 12 * col("ind.alimony_amount"))
                   .otherwise(col("ind.gross_salary") - 12 * col("ind.alimony_amount")+ 1/3 * (col("sp.gross_salary")-12 * col("sp.alimony_amount"))))

df = df.withColumn("house_spendable_income", col("house_spendable_income").cast("int"))
#df_comb.select("ind.student_loan_amount", "sp.student_loan_amount").show()
df.select("ind.gross_salary","ind.alimony_amount","ind.has_spouse","sp.gross_salary","sp.alimony_amount", "house_spendable_income").show()



+------------+--------------+----------+------------+--------------+----------------------+
|gross_salary|alimony_amount|has_spouse|gross_salary|alimony_amount|house_spendable_income|
+------------+--------------+----------+------------+--------------+----------------------+
|      191453|             0|      TRUE|       54528|             0|                209629|
|       82586|             0|      TRUE|      144409|   2513.721669|                120667|
|      151688|             0|     FALSE|       80291|             0|                151688|
|      107499|             0|      TRUE|       33528|             0|                118675|
|       31355|             0|      TRUE|      165719|             0|                 86594|
|      114946|   1657.604031|     FALSE|      147674|             0|                 95054|
|      193937|             0|      TRUE|      158092|             0|                246634|
|      108115|   515.4660385|      TRUE|       30499|             0|            

## Spendable income calculation where the highest income is also taken into account, since only 1/3 of the lower income is added

In [52]:
df = df_comb
# Convert necessary columns to appropriate data types (e.g., from string to integer/boolean)
df = df.withColumn("ind.gross_salary", col("ind.gross_salary").cast("int"))
df = df.withColumn("ind.alimony_amount", col("ind.alimony_amount").cast("int"))
df = df.withColumn("ind.has_spouse", col("ind.has_spouse") == "True")

df = df.withColumn("sp.gross_salary", col("sp.gross_salary").cast("int"))
df = df.withColumn("sp.alimony_amount", col("sp.alimony_amount").cast("int"))
                   
# Apply the formula
df = df.withColumn("house_spendable_income", 
                   when(col("ind.has_spouse") == False, 
                        col("ind.gross_salary") - 12 * col("ind.alimony_amount"))
                   .otherwise(when(col("ind.gross_salary")>col("sp.gross_salary"),col("ind.gross_salary") - 12 * col("ind.alimony_amount")+ 1/3 * (col("sp.gross_salary")-12 * col("sp.alimony_amount")))\
                             .otherwise(col("sp.gross_salary") - 12 * col("sp.alimony_amount")+ 1/3 * (col("ind.gross_salary")-12 * col("ind.alimony_amount")))))

df = df.withColumn("house_spendable_income", col("house_spendable_income").cast("int"))
#df_comb.select("ind.student_loan_amount", "sp.student_loan_amount").show()
df.select("ind.gross_salary","ind.alimony_amount","ind.has_spouse","sp.gross_salary","sp.alimony_amount", "house_spendable_income").show()



+------------+--------------+----------+------------+--------------+----------------------+
|gross_salary|alimony_amount|has_spouse|gross_salary|alimony_amount|house_spendable_income|
+------------+--------------+----------+------------+--------------+----------------------+
|      191453|             0|      TRUE|       54528|             0|                118345|
|       82586|             0|      TRUE|      144409|   2513.721669|                120667|
|      151688|             0|     FALSE|       80291|             0|                151688|
|      107499|             0|      TRUE|       33528|             0|                 69361|
|       31355|             0|      TRUE|      165719|             0|                 86594|
|      114946|   1657.604031|     FALSE|      147674|             0|                 95054|
|      193937|             0|      TRUE|      158092|             0|                246634|
|      108115|   515.4660385|      TRUE|       30499|             0|            

In [2]:
# Perform a query.
QUERY = (
    'SELECT * FROM `degroup11.group11dataset.individuals` LIMIT 100')   # use the correct project id, etc.
query_job = client.query(QUERY)  # API request
rows = query_job.result()  # Waits for query to finish

for row in rows:
    print(row)

Forbidden: 403 POST https://bigquery.googleapis.com/bigquery/v2/projects/degroup11/jobs?prettyPrint=false: Access Denied: Project degroup11: User does not have bigquery.jobs.create permission in project degroup11.

Location: None
Job ID: 442ff93c-a3cc-49fb-8617-766f1a6c5b76


In [None]:
Job ID: b7f980d4-3f39-4ce8-a54a-7284809f5fa1


In [10]:
spark.stop()

NameError: name 'spark' is not defined