In [1]:
from pyspark.sql import SparkSession
# Create the Spark session used by every cell below, or reuse one that is
# already running in this kernel (getOrCreate).
spark = (
    SparkSession.Builder()
    .appName("Analysis")
    .master("local[12]")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/09 09:47:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/10/09 09:47:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/10/09 09:47:15 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/10/09 09:47:15 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [2]:
# Load the input data set from the Parquet file one directory above the notebook.
df = (
    spark.read
    .parquet("../test_data.parquet")
)

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import matplotlib as mpl
from pyspark.sql import functions as fn

In [4]:
# Split the data set into one DataFrame per value of the "Payment Type" column.
annuities = df.where(df["Payment Type"] == "Annuity")
linears = df.where(df["Payment Type"] == "Linear")
bullets = df.where(df["Payment Type"] == "Bullet")

In [5]:
# Repayment amount for linear loans: the notional divided evenly over the term.
# The result is keyed by "f_Id" so it can be joined back onto `linears`, and the
# payment column is named "monthly_payments" to match the bullet and annuity
# payment frames (it was "monthly_repayments", which only worked because the
# later union matches columns by position).
# NOTE(review): calc_annuity_payment treats Term as years (term * 12 months). If
# Term is in years here as well, Notional / Term is a yearly amount and this
# should be Notional / (Term * 12) - confirm the unit of Term.
linearpayments = (
    linears.rdd
    .map(lambda row: (row["Id"], row["Notional"] / row["Term"]))
    .toDF(["f_Id", "monthly_payments"])
)

In [6]:
# Bullet loans get a fixed payment value of 0.0 per loan; the frame is keyed by
# "f_Id" so it can be joined back onto `bullets`.
bulletpayments = (
    bullets.rdd
    .map(lambda row: (row["Id"], 0.0))
    .toDF(["f_Id", "monthly_payments"])
)

In [7]:
def calc_annuity_payment(notional, interest, term):
    """Return the fixed monthly payment of an annuity loan.

    Args:
        notional: Amount borrowed.
        interest: Yearly interest rate in percent (e.g. 6 for 6%).
        term: Loan duration in years.

    Returns:
        The constant monthly payment that repays `notional` over `term * 12`
        months at a monthly rate of `interest / 12 / 100`.

    Raises:
        ZeroDivisionError: If `term` is 0.
    """
    months_total = term * 12
    monthly_rate = interest / 12 / 100

    # With a 0% rate the annuity formula divides by (1 ** n - 1) == 0, so fall
    # back to spreading the notional evenly over all months.
    if monthly_rate == 0:
        return notional / months_total

    growth = pow(1 + monthly_rate, months_total)
    return (notional * monthly_rate * growth) / (growth - 1)

# Compute the annuity payment for every annuity loan; the frame is keyed by
# "f_Id" so it can be joined back onto `annuities`.
annuitypayments = (
    annuities.rdd
    .map(
        lambda row: (
            row["Id"],
            calc_annuity_payment(row["Notional"], row["Interest Rate"], row["Term"]),
        )
    )
    .toDF(["f_Id", "monthly_payments"])
)

In [8]:
#annuities.show()

In [9]:
# Attach the computed payment to each loan by matching Id against f_Id, then
# stack the three payment types back into a single DataFrame.
# NOTE(review): each join overwrites its own input frame, so re-running only
# this cell joins the payment columns onto the already-joined frames again.
annuities = annuities.join(
    annuitypayments,
    on=annuities["Id"] == annuitypayments["f_Id"],
    how="inner",
)
linears = linears.join(
    linearpayments,
    on=linears["Id"] == linearpayments["f_Id"],
    how="inner",
)
bullets = bullets.join(
    bulletpayments,
    on=bullets["Id"] == bulletpayments["f_Id"],
    how="inner",
)

# union() matches columns by position, not by name, so the column names of
# df_full come from `annuities`.
df_full = (
    annuities
    .union(linears)
    .union(bullets)
)

In [10]:
# Print the column names and types of the combined frame to check the join/union result.
df_full.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Notional: long (nullable = true)
 |-- Interest Rate: double (nullable = true)
 |-- Reset Frequency: long (nullable = true)
 |-- Start Date: timestamp_ntz (nullable = true)
 |-- Term: long (nullable = true)
 |-- Remaining Notional: double (nullable = true)
 |-- Payment Type: string (nullable = true)
 |-- Risk Indicator: long (nullable = true)
 |-- Next Reset Date: timestamp_ntz (nullable = true)
 |-- f_Id: string (nullable = true)
 |-- monthly_payments: double (nullable = true)



In [11]:
#create schema for the results table
from pyspark.sql.types import StructType,StructField, StringType, DecimalType, IntegerType, DateType
# Column layout of the results table; every column is declared non-nullable.
# NOTE(review): DecimalType() without arguments is decimal(10,0), i.e. no
# fractional digits, for 'Interest Rate', 'Remaining Notional' and
# 'monthly_payment' - confirm that whole-number precision is intended.
schema = StructType([
    StructField(column_name, column_type, nullable=False)
    for column_name, column_type in [
        ('Id', StringType()),
        ('Interest Rate', DecimalType()),
        ('Reset Frequency', IntegerType()),
        ('Remaining Notional', DecimalType()),
        ('Risk Indicator', IntegerType()),
        ('Next Reset Date', DateType()),
        ('monthly_payment', DecimalType()),
    ]
])

In [12]:
# Start the results table as an empty DataFrame that carries only the schema,
# then print that schema.
payment_projection = spark.createDataFrame(data=[], schema=schema)
payment_projection.printSchema()

root
 |-- Id: string (nullable = false)
 |-- Interest Rate: decimal(10,0) (nullable = false)
 |-- Reset Frequency: integer (nullable = false)
 |-- Remaining Notional: decimal(10,0) (nullable = false)
 |-- Risk Indicator: integer (nullable = false)
 |-- Next Reset Date: date (nullable = false)
 |-- monthly_payment: decimal(10,0) (nullable = false)

