In [129]:
import pandas as pd
import regex as re
from geopy import distance

This notebook combines Census, FEMA and HURDAT2 data into a final csv for supervised learning.

In [131]:
census_df=pd.read_csv('census_data.csv')
hurr_df=pd.read_csv('hurricanes_data.csv')
fema_df=pd.read_csv('fema_final.csv')

In [133]:
#Create new column for fipsCode. Change data types from float to int or string as appropriate
census_df['FIPS']=census_df['FIPS'].astype('int')
census_df.rename(columns={'FIPS':'fipsCode','Year':'year'}, inplace=True)
census_df['year']=census_df['year'].astype('int')

In [135]:
#obtain year to facilitate merging notebooks
hurr_df['Date']=hurr_df['Date'].astype('str')
hurr_df['year']=hurr_df['Date'].str.slice(start=0,stop=4).astype('int64')
#hurr_df.dtypes

In [137]:
#filter for hurricanes in the desired time period
hurr_df_modern=hurr_df[hurr_df['year']>=2003]
hurr_df_final=hurr_df_modern[['Name','Date','Latitude','Longitude','Max Wind','Min Pressure','year']]

In [139]:
#merge FEMA with census data based on county and year
fema_census_merge=pd.merge(fema_df,census_df,how='inner',on=['fipsCode','year'])

In [143]:
#exploratory analysis to see how many times a hurricane made landfall
hurr_count=hurr_df_final.groupby('Name').agg(landfalls=('year','count'))

In [145]:
#cleaning up blank space from hurricane names
hurr_df_final['Name']=hurr_df_final['Name'].str.lstrip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hurr_df_final['Name']=hurr_df_final['Name'].str.lstrip()


In [147]:
#obtain storm names from the FEMA disaster declarations
fema_census_merge['Name']=fema_census_merge['declarationTitle'].str.split().str[-1]

In [149]:
#merge FEMA data with hurricane data
final_merge=pd.merge(fema_census_merge, hurr_df_final, how='inner', on=['Name','year'])

In [151]:
#add latitude and longitude of landfall to the dataset
final_merge['Landfall_Lat']=final_merge['Latitude'].str[:-1]
final_merge['Landfall_Lon']='-'+final_merge['Longitude'].str.lstrip().str[:-1]

In [153]:
#calculate distance from landfall for each affected county
final_merge['dist_from_landfall']=None
for i in range(len(final_merge)):
    tup1=(final_merge['LATITUDE_county'].iloc[i],final_merge['LONGITUDE_county'].iloc[i])
    tup2=(final_merge['Landfall_Lat'].iloc[i],final_merge['Landfall_Lon'].iloc[i])
    final_merge['dist_from_landfall'].iloc[i]=distance.distance(tup1,tup2).miles

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_merge['dist_from_landfall'].iloc[i]=distance.distance(tup1,tup2).miles
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_merge['dist_from_landfall'].iloc[i]=distance.distance(tup1,tup2).miles
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_merge['dist_from_landfall'].iloc[i]=distance.distance(tup1,tup2).miles
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

In [39]:
#original final dataframe without clusters added
#final_merge.to_csv('final_merge.csv',index='False')

In [155]:
#import clusters from unsupervised learning and the closest pass df.  This was done after our original final dataframe proved insufficient
cluster_df=pd.read_csv('clusters_n_2.csv')
closest_pass_df=pd.read_csv('closest_pass.csv')

In [159]:
closest_pass_df.rename(columns={'Max Wind':'Max Wind closest','Max Pressure':'Max Pressure closest',
                                'distance_from_storm':'closest_pass'},inplace=True)

In [161]:
#ensure observations have not been duplicated
cluster_df.drop_duplicates(subset=['fipsCode'],inplace=True)
len(cluster_df)

741

In [165]:
final_merge_1=final_merge.merge(closest_pass_df,how='inner',on=['Name','year','fipsCode'])

In [167]:
#use left join since economic cluster does not apply to every county
final_merge_2=final_merge_1.merge(cluster_df, how='left',on=['fipsCode'])

In [171]:
#create dataframe for analysis
analysis_df=final_merge_2[[
 'DamageBetween20001and3000',
 'DamageGreaterThan3000',
 'MajorDamage_rent',
 'Estimate!!HOUSING OCCUPANCY!!Total housing units',
 'Percent!!HOUSING OCCUPANCY!!Total housing units',
 'Estimate!!HOUSING OCCUPANCY!!Total housing units!!Occupied housing units',
 'Percent!!HOUSING OCCUPANCY!!Total housing units!!Occupied housing units',
 'Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 2020 or later',
 'Percent!!YEAR STRUCTURE BUILT!!Total housing units!!Built 2020 or later',
 'Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 2010 to 2019',
 'Percent!!YEAR STRUCTURE BUILT!!Total housing units!!Built 2010 to 2019',
 'Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 2000 to 2009',
 'Percent!!YEAR STRUCTURE BUILT!!Total housing units!!Built 2000 to 2009',
 'Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1990 to 1999',
 'Percent!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1990 to 1999',
 'Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1980 to 1989',
 'Percent!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1980 to 1989',
 'Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1970 to 1979',
 'Percent!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1970 to 1979',
 'Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1960 to 1969',
 'Percent!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1960 to 1969',
 'Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1950 to 1959',
 'Percent!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1950 to 1959',
 'Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1940 to 1949',
 'Percent!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1940 to 1949',
 'Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1939 or earlier',
 'Percent!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1939 or earlier',
 'LATITUDE_county',
 'LONGITUDE_county',
 'Landfall_Lat',
 'Landfall_Lon',
 'Max Wind',
 'Min Pressure','Max Pressure closest','Max Wind closest','category','closest_pass', 'housing_cluster','econ_cluster']]

In [173]:
#calculate distance from landfall
analysis_df['dist_from_landfall']=None
for i in range(len(analysis_df)):
    tup1=(analysis_df['LATITUDE_county'].iloc[i],analysis_df['LONGITUDE_county'].iloc[i])
    tup2=(analysis_df['Landfall_Lat'].iloc[i],analysis_df['Landfall_Lon'].iloc[i])
    analysis_df['dist_from_landfall'].iloc[i]=distance.distance(tup1,tup2).miles
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analysis_df['dist_from_landfall']=None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analysis_df['dist_from_landfall'].iloc[i]=distance.distance(tup1,tup2).miles
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analysis_df['dist_from_landfall'].iloc[i]=distance.distance(tup1,tup2).miles
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata

In [179]:
#drop county latitude and longitude for center of population as well as landfall latitude and longitude
#these will not be needed for supervised learning.
analysis_df.drop(columns=['LATITUDE_county',
 'LONGITUDE_county',
 'Landfall_Lat',
 'Landfall_Lon',],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analysis_df.drop(columns=['LATITUDE_county',


In [185]:
#new analysis df with clusters and closest pass information
analysis_df.to_csv('analysis_df_v2.csv',index=False)