Cleaning of a dataset: Tornadoes in the United States from 1950-2023

The dataset was found at https://www.spc.noaa.gov/wcm/#data

In [104]:
#install requirements
# %pip install pipenv
# %pip install requests
# %pip install pandas
# %pip install numpy
# %pip install datetime  # not required: datetime is part of the Python standard library

In [105]:
#import required packages
import pandas as pd
from datetime import datetime

In [106]:
# Load the raw tornado data from the CSV file.
# The path is relative, so the file must be in the notebook's working directory.
df = pd.read_csv('1950-2023_all_tornadoes.csv')


In [107]:
#inspect the dataset
#df.head(3)
#df.tail(3)
#df.describe()
#df.columns
#df.info()

In [108]:
# Run-level variables
# Time of this run as YYYYMMDD-HHMMSS, for the optional timestamped output file names
timestamp = f"{datetime.now():%Y%m%d-%H%M%S}"

In [109]:
# Give every column a descriptive name.
# The names are applied by position, so this list must stay in the same order
# (and have the same length) as the columns of the loaded file.
# NOTE(review): "fatatalies" looks like a typo for "fatalities", and
# "2nd-county-fips code" has a space where its siblings use a hyphen. Both are
# kept as-is because the column-removal cell and the exported CSV rely on
# these exact strings.
new_column_names = [
    "om",
    "year",
    "month",
    "day",
    "date",
    "time",
    "timezone",
    "state",
    "state_fips",
    "ytd-for-state",
    "ef-scale",
    "injuries",
    "fatatalies",
    "property_loss-millions",
    "crop_loss-millions",
    "start_latitude",
    "start_longitude",
    "end_latitude",
    "end_longitude",
    "track-length-in-miles",
    "width-in-yards",
    "number-of-states-in-track",
    "state-number-in-track",
    "segment-number",
    "1st-county-fips-code",
    "2nd-county-fips code",
    "3rd-county-fips-code",
    "4th-county-fips-code",
    "wind-only-reading",
]

# Replace the header of the loaded frame in one step
df.columns = new_column_names

In [110]:
'''
*** Save the updated DataFrame back to a new CSV file (optional) ***

Note: I chose this name for the output file to avoid overwriting the original file and to not be
located near the original file in the same directory.

'''
# Option A (disabled): timestamped file name, so each run would write a new file.
# If you uncomment the next line, also comment out the fixed name below,
# which would otherwise overwrite it.
# output_file_path = "tornado-data-columns-renamed-"+timestamp+".csv"

# Option B (active): fixed file name without a timestamp
output_file_path = "tornado-data-columns-renamed.csv"

# The save itself is optional and disabled here.
# Uncomment the next line to write the renamed DataFrame;
# mode='w' overwrites the file if it exists.
#df.to_csv(output_file_path, index=False, mode='w')

# Uncomment the next line to print a confirmation message
#print(f"Columns renamed and saved to {output_file_path}")

In [111]:
'''
Removed columns that were not relevant for this analysis.

'''
# Columns that this analysis does not use
columns_to_remove = [
    "1st-county-fips-code",
    "2nd-county-fips code",
    "3rd-county-fips-code",
    "4th-county-fips-code",
    "wind-only-reading",
    "om",
    "state_fips",
    "ytd-for-state",
]

# Drop them from the working frame. Re-running this cell on its own raises a
# KeyError, because the columns are already gone after the first run.
df = df.drop(columns_to_remove, axis="columns")

In [112]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(df):
    """Recode every -9 in the 'ef-scale' column as -1.

    The frame is modified in place and the same object is returned, so pass
    a copy if the caller's frame must stay untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains an 'ef-scale' column.

    Returns
    -------
    pandas.DataFrame
        The frame that was passed in, after the replacement.
    """
    # Boolean mask of the rows whose 'ef-scale' value is exactly -9
    is_minus_nine = df['ef-scale'].eq(-9)
    df.loc[is_minus_nine, 'ef-scale'] = -1
    return df

# clean_data modifies the frame it receives, so pass it a copy and rebind df to the result
df = clean_data(df.copy())
#df.head()

In [113]:
'''
Save to CSV file for further analysis
Note: I chose this name for the output file to avoid overwriting the original file and to not be
located near the original file in the same directory.
'''
# Option A (disabled): timestamped file name, so each run would write a new file.
# If you uncomment the next line, also comment out the fixed name below,
# which would otherwise overwrite it.
# output_file_path = "tornado-data-columns-removed-" + timestamp + ".csv"

# Option B (active): fixed file name without a timestamp
output_file_path = "tornado-data-columns-removed.csv"

# Write the cleaned DataFrame without its index; mode='w' overwrites the file if it exists
df.to_csv(output_file_path, index=False, mode='w')

# Uncomment the following line to print the confirmation message
#print(f"Columns removed and saved to {output_file_path}")

In [114]:
# Inspect the final dataset that will be used for further analysis
df.head()

Unnamed: 0,year,month,day,date,time,timezone,state,ef-scale,injuries,fatatalies,...,crop_loss-millions,start_latitude,start_longitude,end_latitude,end_longitude,track-length-in-miles,width-in-yards,number-of-states-in-track,state-number-in-track,segment-number
0,1950,10,1,1950-10-01,21:00:00,3,OK,1,0,0,...,0.0,36.73,-102.52,36.88,-102.3,15.8,10,1,1,1
1,1950,10,9,1950-10-09,02:15:00,3,NC,3,3,0,...,0.0,34.17,-78.6,0.0,0.0,2.0,880,1,1,1
2,1950,11,20,1950-11-20,02:20:00,3,KY,2,0,0,...,0.0,37.37,-87.2,0.0,0.0,0.1,10,1,1,1
3,1950,11,20,1950-11-20,04:00:00,3,KY,1,0,0,...,0.0,38.2,-84.5,0.0,0.0,0.1,10,1,1,1
4,1950,11,20,1950-11-20,07:30:00,3,MS,1,3,0,...,0.0,32.42,-89.13,0.0,0.0,2.0,37,1,1,1
