In [None]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from PIL import Image
from IPython.display import display
import seaborn as sns
from spicy import stats
import itertools
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Cap how much of a DataFrame pandas renders: at most 50 columns and 50 rows.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

In [None]:
def load_data(file_path, nrows=10_000_000):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. Its first row is used as the header.
    nrows : int or None, default 10_000_000
        Maximum number of data rows to read. The default keeps the
        original 10-million-row cap; pass ``None`` to read the whole file
        or a small number for quick experiments.

    Returns
    -------
    pandas.DataFrame
        The parsed rows, with column names taken from the header row.
    """
    return pd.read_csv(
        file_path,
        header=0,
        nrows=nrows,
        # Parse each column in one pass instead of in internal chunks, so a
        # column's dtype is inferred from all of its values at once.
        low_memory=False,
        # Map the file into memory rather than reading through buffered I/O.
        memory_map=True,
    )

In [None]:
# --- Hard-coded rename dictionary ---
# This dictionary explicitly maps each old column name (key) to its new snake_case name (value).
# The mapping was generated with AI assistance because the columns could not be renamed with a regex.
def rename_flood_columns(df):
    """Return a copy of ``df`` with the flood-policy columns renamed to snake_case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns use the original run-together lowercase names
        (e.g. ``'floodzone'``).

    Returns
    -------
    pandas.DataFrame
        A new frame with every column listed in the mapping renamed.
        Columns that are not in the mapping keep their names, and ``df``
        itself is not modified.
    """
    column_rename_map = {
        'agriculturestructureindicator': 'agriculture_structure_indicator',
        'basefloodelevation': 'base_flood_elevation',
        'basementenclosurecrawlspacetype': 'basement_enclosure_crawlspace_type',
        'cancellationdateoffloodpolicy': 'cancellation_date_of_flood_policy',
        'censustract': 'census_tract',
        'condominiumindicator': 'condominium_indicator',
        'construction': 'construction',
        'countycode': 'county_code',
        'crsdiscount': 'crs_discount',
        'deductibleamountinbuildingcoverage': 'deductible_amount_in_building_coverage',
        'deductibleamountincontentscoverage': 'deductible_amount_in_contents_coverage',
        'elevatedbuildingindicator': 'elevated_building_indicator',
        'elevationcertificateindicator': 'elevation_certificate_indicator',
        'elevationdifference': 'elevation_difference',
        'federalpolicyfee': 'federal_policy_fee',
        'floodzone': 'flood_zone',
        'hfiaasurcharge': 'hfiaa_surcharge',
        'houseofworshipindicator': 'house_of_worship_indicator',
        'latitude': 'latitude',
        'locationofcontents': 'location_of_contents',
        'longitude': 'longitude',
        'lowestadjacentgrade': 'lowest_adjacent_grade',
        'lowestfloorelevation': 'lowest_floor_elevation',
        'nonprofitindicator': 'non_profit_indicator',
        'numberoffloorsininsuredbuilding': 'number_of_floors_in_insured_building',
        'obstructiontype': 'obstruction_type',
        'occupancytype': 'occupancy_type',
        'originalconstructiondate': 'original_construction_date',
        'originalnbdate': 'original_nb_date',
        'policycost': 'policy_cost',
        'policycount': 'policy_count',
        'policyeffectivedate': 'policy_effective_date',
        'policyterminationdate': 'policy_termination_date',
        'policytermindicator': 'policy_term_indicator',
        'postfirmconstructionindicator': 'post_firm_construction_indicator',
        'primaryresidenceindicator': 'primary_residence_indicator',
        'propertystate': 'property_state',
        'reportedzipcode': 'reported_zipcode',
        'ratemethod': 'rate_method',
        'regularemergencyprogramindicator': 'regular_emergency_program_indicator',
        'reportedcity': 'reported_city',
        'smallbusinessindicatorbuilding': 'small_business_indicator_building',
        'totalbuildinginsurancecoverage': 'total_building_insurance_coverage',
        'totalcontentsinsurancecoverage': 'total_contents_insurance_coverage',
        'totalinsurancepremiumofthepolicy': 'total_insurance_premium_of_the_policy',
    }
    # Rename the frame that was passed in, not a notebook-level global, so
    # the function works on a fresh kernel and on any frame.
    return df.rename(columns=column_rename_map)


In [None]:
def pascal_to_snake(pascal_string: str) -> str:
    """Convert a PascalCase / camelCase identifier to snake_case.

    An underscore is inserted at each word boundary and the result is
    lower-cased, e.g. ``MyVariable`` -> ``my_variable`` and
    ``AHTTPTest`` -> ``ahttp_test``.
    """
    word_boundaries = (
        # lower-case letter followed by upper-case: MyVariable -> My_Variable
        r'([a-z])([A-Z])',
        # end of an upper-case run before a capitalised word: AHTTPTest -> AHTTP_Test
        r'([A-Z])([A-Z][a-z])',
    )
    converted = pascal_string
    for boundary in word_boundaries:
        converted = re.sub(boundary, r'\1_\2', converted)
    return converted.lower()


In [None]:
def flood_subset(df):
    """Return only the flood-policy columns of interest, in a fixed order.

    ``df`` must already contain every column listed below, including
    ``premium_rate``; a missing column raises ``KeyError``.
    """
    columns_of_interest = [
        # zone and location
        'flood_zone',
        'latitude',
        'longitude',
        'property_state',
        'reported_city',
        # building characteristics
        'elevated_building_indicator',
        'elevation_difference',
        'reported_zipcode',
        'occupancy_type',
        # policy dates
        'cancellation_date_of_flood_policy',
        'original_nb_date',
        'policy_effective_date',
        'policy_termination_date',
        # coverage and premium
        'total_building_insurance_coverage',
        'total_contents_insurance_coverage',
        'total_insurance_premium_of_the_policy',
        'premium_rate',
    ]
    return df[columns_of_interest]

In [None]:
# Names of the four flood-zone categories.
floodzone_types = [
    "high_flood_zone",
    "coastal_high_flood_zone",
    "low_moderate_flood_zone",
    "undetermined_flood_zone",
]

In [None]:
# Coverage / premium columns.
numeric_col = [
    "total_building_insurance_coverage",
    "total_contents_insurance_coverage",
    "total_insurance_premium_of_the_policy",
    "premium_rate",
]

# The columns above plus the state column.
column_names = [*numeric_col, "property_state"]

# Same contents as `column_names`, kept as a separate list object so that
# changing one does not change the other.
col_plots = list(column_names)

# Zone, location and policy-date columns.
describe_col = [
    "flood_zone",
    "property_state",
    "reported_city",
    "cancellation_date_of_flood_policy",
    "original_nb_date",
    "policy_effective_date",
    "policy_termination_date",
]

In [None]:
# Feature columns for the model.
# NOTE(review): the `rf_` prefix presumably refers to the RandomForestRegressor
# imported at the top of the notebook; confirm where this list is used.
rf_feature_cols = [
    # location
    "latitude",
    "longitude",
    # building characteristics
    "elevated_building_indicator",
    "elevation_difference",
    "reported_zipcode",
    "occupancy_type",
    # coverage amounts
    "total_building_insurance_coverage",
    "total_contents_insurance_coverage",
    # flood-zone columns
    "high_flood_zone",
    "coastal_high_flood_zone",
    "low_moderate_flood_zone",
    "undetermined_flood_zone",
    # NOTE(review): "catagory" looks like a misspelling of "category"; it is
    # kept as-is because it must match the column name created elsewhere.
    "floodzone_catagory",
    "nb_year",
]