In [20]:
import pandas as pd
import regex as re
from geopy import distance

This notebook combines Census, FEMA and HURDAT2 data into a final_merge.csv for supervised learning.

In [21]:
census_df=pd.read_csv('census_data.csv')
hurr_df=pd.read_csv('hurricanes_data.csv')
fema_df=pd.read_csv('fema_final.csv')

In [22]:
#Create new column for fipsCode. Change data types from float to int or string as appropriate
census_df['FIPS']=census_df['FIPS'].astype('int')
census_df.rename(columns={'FIPS':'fipsCode','Year':'year'}, inplace=True)
census_df['year']=census_df['year'].astype('int')

In [23]:
#obtain year to facilitate merging notebooks
hurr_df['Date']=hurr_df['Date'].astype('str')
hurr_df['year']=hurr_df['Date'].str.slice(start=0,stop=4).astype('int64')
#hurr_df.dtypes

In [25]:
#filter for hurricanes in the desired time period
hurr_df_modern=hurr_df[hurr_df['year']>=2003]
hurr_df_final=hurr_df_modern[['Name','Date','Latitude','Longitude','Max Wind','Max Pressure','year']]

In [26]:
#merge FEMA with census data based on county and year
fema_census_merge=pd.merge(fema_df,census_df,how='inner',on=['fipsCode','year'])

In [27]:
#exploratory analysis to see how many times a hurricane made landfall
hurr_count=hurr_df_final.groupby('Name').agg(landfalls=('year','count'))

In [28]:
#cleaning up blank space from hurricane names
hurr_df_final['Name']=hurr_df_final['Name'].str.lstrip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hurr_df_final['Name']=hurr_df_final['Name'].str.lstrip()


In [29]:
#obtain storm names from the FEMA disaster declarations
fema_census_merge['Name']=fema_census_merge['declarationTitle'].str.split().str[-1]

In [30]:
#merge FEMA data with hurricane data
fema_cen=fema_census_merge.copy()
hurr_final=hurr_df_final.copy()
final_merge=fema_cen.merge(hurr_final, how='inner', on=['Name','year'])

In [31]:
len(final_merge)

1730

In [32]:
#add latitude and longitude of landfall to the dataset
final_merge['Landfall_Lat']=final_merge['Latitude'].str[:-1]
final_merge['Landfall_Lon']='-'+final_merge['Longitude'].str.lstrip().str[:-1]

In [33]:
#calculate distance from landfall for each affected county
final_merge['dist_from_landfall']=None
for i in range(len(final_merge)):
    tup1=(final_merge['LATITUDE_county'].iloc[i],final_merge['LONGITUDE_county'].iloc[i])
    tup2=(final_merge['Landfall_Lat'].iloc[i],final_merge['Landfall_Lon'].iloc[i])
    final_merge['dist_from_landfall'].iloc[i]=distance.distance(tup1,tup2).miles

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  final_merge['dist_from_landfall'].iloc[i]=distance.distance(tup1,tup2).miles
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

In [34]:
#final dataframe without clusters or closest pass added
final_merge.to_csv('final_merge.csv',index='False')