In [26]:
import numpy as np
import pyspark
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql.functions import col
from pyspark.sql.types import DateType, IntegerType, LongType, DoubleType
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [2]:
# Build (or reuse) a local Spark session that uses every available core.
# getOrCreate() keeps this cell safe to re-run: constructing a second
# SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once".
spark = pyspark.sql.SparkSession.builder.master('local[*]').getOrCreate()
# Keep `sc` defined for any code that talks to the SparkContext directly.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/23 16:49:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [44]:
# Load the intermediate NFIP flood-policy data set.
# Parquet files carry their own schema, so nothing is inferred at read time;
# `inferSchema` is a CSV reader option and has no effect on a Parquet read.
df = spark.read.parquet('../data/02_intermediate/nfip-flood-policies.parquet')

In [45]:
# Discard the policy effective-date, termination-date and term-indicator columns.
df = df.drop(
    'policyeffectivedate',
    'policyterminationdate',
    'policytermindicator',
)

In [46]:
# Print column names and types after the drop. The columns listed in the
# output below are all typed as string, so numeric/date columns still need
# an explicit cast before they can be used as model inputs.
df.printSchema()

root
 |-- agriculturestructureindicator: string (nullable = true)
 |-- basementenclosurecrawlspacetype: string (nullable = true)
 |-- censustract: string (nullable = true)
 |-- condominiumindicator: string (nullable = true)
 |-- construction: string (nullable = true)
 |-- countycode: string (nullable = true)
 |-- crsdiscount: string (nullable = true)
 |-- deductibleamountinbuildingcoverage: string (nullable = true)
 |-- elevatedbuildingindicator: string (nullable = true)
 |-- elevationdifference: string (nullable = true)
 |-- federalpolicyfee: string (nullable = true)
 |-- floodzone: string (nullable = true)
 |-- hfiaasurcharge: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- numberoffloorsininsuredbuilding: string (nullable = true)
 |-- occupancytype: string (nullable = true)
 |-- originalconstructiondate: string (nullable = true)
 |-- originalnbdate: string (nullable = true)
 |-- policycost: string (nullable = true)
 |-- 

In [47]:
# Target Spark type for each column that is cast away from its loaded type.
# withColumn() on an existing column name replaces that column in place,
# so the column order of `df` is unaffected by the order of this mapping.
column_types = {
    'censustract': LongType(),
    'crsdiscount': DoubleType(),
    'elevationdifference': IntegerType(),
    'federalpolicyfee': IntegerType(),
    'hfiaasurcharge': IntegerType(),
    'latitude': DoubleType(),
    'longitude': DoubleType(),
    'numberoffloorsininsuredbuilding': IntegerType(),
    'originalconstructiondate': DateType(),
    'originalnbdate': DateType(),
    'policycost': IntegerType(),
    'policycount': IntegerType(),
    'totalbuildinginsurancecoverage': IntegerType(),
    'totalcontentsinsurancecoverage': IntegerType(),
    'totalinsurancepremiumofthepolicy': IntegerType(),
}

# Apply the casts one column at a time.
for column_name, spark_type in column_types.items():
    df = df.withColumn(column_name, col(column_name).cast(spark_type))


In [61]:
# Columns fed to the model as inputs, and the column it is trained to predict.
feature_columns = [
    'censustract',
    'crsdiscount',
    'elevationdifference',
    'federalpolicyfee',
    'hfiaasurcharge',
    'latitude',
    'longitude',
    'numberoffloorsininsuredbuilding',
    'policycost',
    'policycount',
]
label_column = 'totalinsurancepremiumofthepolicy'

# Pack the input columns into a single vector column named 'features'.
# NOTE(review): no handleInvalid is passed, so VectorAssembler uses its
# default of 'error' -- confirm the cast columns contain no nulls.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
output = assembler.transform(df)

# Keep only what the regressor needs: the feature vector and the label.
final_df = output.select('features', label_column)

In [67]:
# 60 / 20 / 20 random split into train, test and validation sets, with a
# fixed seed so the split is repeatable.
splits = final_df.randomSplit(weights=[0.6, 0.2, 0.2], seed=0)
train, test, validate = splits

In [70]:
# Random-forest regressor (not yet fitted) predicting the total insurance
# premium, seeded for reproducibility. It reads the default 'features' column.
model = RandomForestRegressor(
    labelCol='totalinsurancepremiumofthepolicy',
    seed=0,
)
# Last expression: show the estimator in the cell output.
model

RandomForestRegressor_11709a0e0153

In [71]:
# fit() does not train the estimator in place; it returns a new fitted
# RandomForestRegressionModel. Bind it to `model` so the later
# model.transform(test) call runs against the fitted model -- the unfitted
# estimator has no transform() method.
# Note: after this cell `model` is the fitted model, so re-running this cell
# alone requires re-running the cell that constructs the estimator first.
model = model.fit(train)

[Stage 13:>                                                         (0 + 1) / 1]

                                                                                

[Stage 14:>                                                         (0 + 8) / 8]











                                                                                

[Stage 15:>                                                         (0 + 8) / 8]













                                                                                

[Stage 17:>                                                         (0 + 8) / 8]







23/04/23 18:07:58 WARN MemoryStore: Not enough space to cache rdd_53_4 in memory! (computed 61.4 MiB so far)
23/04/23 18:07:58 WARN BlockManager: Persisting block rdd_53_4 to disk instead.


23/04/23 18:08:03 WARN MemoryStore: Not enough space to cache rdd_53_3 in memory! (computed 40.6 MiB so far)
23/04/23 18:08:03 WARN BlockManager: Persisting block rdd_53_3 to disk instead.


23/04/23 18:08:04 WARN MemoryStore: Not enough space to cache rdd_53_1 in memory! (computed 40.6 MiB so far)
23/04/23 18:08:04 WARN BlockManager: Persisting block rdd_53_1 to disk instead.


23/04/23 18:08:09 WARN MemoryStore: Not enough space to cache rdd_53_0 in memory! (computed 40.6 MiB so far)
23/04/23 18:08:09 WARN BlockManager: Persisting block rdd_53_0 to disk instead.


23/04/23 18:08:11 WARN MemoryStore: Not enough space to cache rdd_53_2 in memory! (computed 211.0 MiB so far)
23/04/23 18:08:11 WARN BlockManager: Persisting block rdd_53_2 to disk instead.


23/04/23 18:08:24 WARN MemoryStore: Not enough space to cache rdd_53_2 in memory! (computed 140.7 MiB so far)
23/04/23 18:08:24 WARN MemoryStore: Not enough space to cache rdd_53_4 in memory! (computed 140.7 MiB so far)






23/04/23 18:08:36 WARN MemoryStore: Not enough space to cache rdd_53_1 in memory! (computed 93.8 MiB so far)




23/04/23 18:08:41 WARN MemoryStore: Not enough space to cache rdd_53_0 in memory! (computed 140.7 MiB so far)




                                                                                



23/04/23 18:08:52 WARN MemoryStore: Not enough space to cache rdd_53_2 in memory! (computed 27.1 MiB so far)
23/04/23 18:08:52 WARN MemoryStore: Not enough space to cache rdd_53_1 in memory! (computed 27.1 MiB so far)
23/04/23 18:08:52 WARN MemoryStore: Not enough space to cache rdd_53_4 in memory! (computed 40.6 MiB so far)


23/04/23 18:08:52 WARN MemoryStore: Not enough space to cache rdd_53_0 in memory! (computed 40.6 MiB so far)












                                                                                



23/04/23 18:09:09 WARN MemoryStore: Not enough space to cache rdd_53_0 in memory! (computed 27.1 MiB so far)
23/04/23 18:09:09 WARN MemoryStore: Not enough space to cache rdd_53_2 in memory! (computed 27.1 MiB so far)


23/04/23 18:09:09 WARN MemoryStore: Not enough space to cache rdd_53_1 in memory! (computed 40.6 MiB so far)
23/04/23 18:09:09 WARN MemoryStore: Not enough space to cache rdd_53_4 in memory! (computed 40.6 MiB so far)










                                                                                



23/04/23 18:09:31 WARN MemoryStore: Not enough space to cache rdd_53_2 in memory! (computed 27.1 MiB so far)
23/04/23 18:09:32 WARN MemoryStore: Not enough space to cache rdd_53_1 in memory! (computed 27.1 MiB so far)


23/04/23 18:09:32 WARN MemoryStore: Not enough space to cache rdd_53_4 in memory! (computed 40.6 MiB so far)
23/04/23 18:09:32 WARN MemoryStore: Not enough space to cache rdd_53_0 in memory! (computed 40.6 MiB so far)




ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/jerome/misc/projects/programming/basic-venv/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/jerome/misc/projects/programming/basic-venv/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Score the held-out test split. `model` must be the fitted model returned by
# fit(); an unfitted estimator has no transform().
predictions = model.transform(test)

# write.save() without an explicit format writes Parquet (Spark's default
# data source), which contradicts the '.csv' name, so write CSV explicitly.
# The CSV writer cannot serialize the 'features' vector column, so keep only
# the true label and the model's prediction. Spark writes a directory named
# 'predictions.csv' containing the part files.
predictions.select('totalinsurancepremiumofthepolicy', 'prediction') \
    .write.csv('predictions.csv', mode='overwrite', header=True)