In [26]:
import numpy as np
import pyspark
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql.functions import col
from pyspark.sql.types import DateType, IntegerType, LongType, DoubleType
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [2]:
# Build (or reuse) a local Spark session that uses all available cores.
# `getOrCreate()` keeps this cell safe to re-run: constructing
# `pyspark.SparkContext(...)` a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
spark = pyspark.sql.SparkSession.builder.master('local[*]').getOrCreate()
# Keep `sc` bound for any cell that needs the underlying SparkContext.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/23 16:49:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [44]:
# Load the intermediate NFIP flood-policy data. Parquet files carry their own
# schema, so no schema-inference option is passed (`inferSchema` is a CSV
# reader option and has no effect on the parquet reader).
df = spark.read.parquet('../data/02_intermediate/nfip-flood-policies.parquet')

In [45]:
# Discard the three policy date/term columns before any further processing.
df = df.drop(
    'policyeffectivedate',
    'policyterminationdate',
    'policytermindicator',
)

In [46]:
# Show the column names, types and nullability of the loaded frame.
df.printSchema()

root
 |-- agriculturestructureindicator: string (nullable = true)
 |-- basementenclosurecrawlspacetype: string (nullable = true)
 |-- censustract: string (nullable = true)
 |-- condominiumindicator: string (nullable = true)
 |-- construction: string (nullable = true)
 |-- countycode: string (nullable = true)
 |-- crsdiscount: string (nullable = true)
 |-- deductibleamountinbuildingcoverage: string (nullable = true)
 |-- elevatedbuildingindicator: string (nullable = true)
 |-- elevationdifference: string (nullable = true)
 |-- federalpolicyfee: string (nullable = true)
 |-- floodzone: string (nullable = true)
 |-- hfiaasurcharge: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- numberoffloorsininsuredbuilding: string (nullable = true)
 |-- occupancytype: string (nullable = true)
 |-- originalconstructiondate: string (nullable = true)
 |-- originalnbdate: string (nullable = true)
 |-- policycost: string (nullable = true)
 |-- 

In [47]:
# Target Spark type for each column that should not remain a string.
# The casts are applied one by one, in the order listed here; `withColumn`
# replaces an existing column of the same name with its cast version.
column_types = {
    'censustract': LongType(),
    'crsdiscount': DoubleType(),
    'elevationdifference': IntegerType(),
    'federalpolicyfee': IntegerType(),
    'hfiaasurcharge': IntegerType(),
    'latitude': DoubleType(),
    'longitude': DoubleType(),
    'numberoffloorsininsuredbuilding': IntegerType(),
    'originalconstructiondate': DateType(),
    'originalnbdate': DateType(),
    'policycost': IntegerType(),
    'policycount': IntegerType(),
    'totalbuildinginsurancecoverage': IntegerType(),
    'totalcontentsinsurancecoverage': IntegerType(),
    'totalinsurancepremiumofthepolicy': IntegerType(),
}

for column_name, spark_type in column_types.items():
    df = df.withColumn(column_name, col(column_name).cast(spark_type))


In [61]:
# Numeric predictor columns packed into the single `features` vector.
# NOTE(review): 'policycost' may already contain the premium being predicted;
# confirm it is not target leakage before trusting model metrics.
feature_columns = [
    'censustract',
    'crsdiscount',
    'elevationdifference',
    'federalpolicyfee',
    'hfiaasurcharge',
    'latitude',
    'longitude',
    'numberoffloorsininsuredbuilding',
    'policycost',
    'policycount',
]
label_column = 'totalinsurancepremiumofthepolicy'

# Every input column is nullable. With the default handleInvalid='error',
# VectorAssembler aborts the whole job on the first row containing a null,
# and only once an action such as `fit` runs. 'skip' drops those rows instead.
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
    handleInvalid='skip',
)
output = assembler.transform(df)

# Keep only what the regressor needs, and discard rows without a label,
# which cannot be used for training or evaluation.
final_df = output.select(['features', label_column]).na.drop(subset=[label_column])

In [67]:
# Randomly partition the rows 60% / 20% / 20% into train, test and
# validation sets, using a fixed seed.
train, test, validate = final_df.randomSplit(weights=[0.6, 0.2, 0.2], seed=0)

In [70]:
# Unfitted random-forest regressor predicting the total insurance premium;
# the seed is fixed at construction time.
model = RandomForestRegressor(labelCol='totalinsurancepremiumofthepolicy', seed=0)
# Display the estimator as the cell output.
model

RandomForestRegressor_11709a0e0153

In [None]:
# Train on the training split and keep the fitted model in a named variable.
# `fit` returns a new fitted model and leaves the `model` estimator unchanged,
# so without this binding the result is only reachable through `Out[N]`.
fitted_model = model.fit(train)
fitted_model

[Stage 13:>                                                         (0 + 1) / 1]