# Importing Pandas and Numpy

In [1]:
import pandas as pd
import numpy as np

# Reading the CSV document 

In [2]:
df = pd.read_csv("../data/tornado_2004_raw_data.csv")
# Add ../FolderNameHERE/CSV Name (go back one layer) when going back into the folder before it

# Pulling the head of the (First 5 rows) CSV

In [3]:
df.head()

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,...,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,DATA_SOURCE
0,200412,29,1800,200412,30,1200,1182771,5430389,MONTANA,30,...,,,,,,,,Heavy snow event across southwest Montana brou...,,PDS
1,200412,29,1800,200412,30,1200,1182771,5430390,MONTANA,30,...,,,,,,,,Heavy snow event across southwest Montana brou...,,PDS
2,200412,8,1800,200412,8,1800,1182769,5430387,IDAHO,16,...,,,,,,,,A vigorous winter storm brought strong winds a...,,PDS
3,200412,19,1500,200412,19,1700,1182770,5430388,MONTANA,30,...,,,,,,,,Prefrontal winds were channeled through east t...,,PDS
4,200412,14,600,200412,14,800,1182772,5430391,MONTANA,30,...,,,,,,,,A winter storm with light snow followed by fre...,,PDS


In [4]:
df.shape

(52409, 51)

In [5]:
# Keep only tornado events
tornado_2004_df = df[df['EVENT_TYPE'].str.lower() == 'tornado'].copy()

# Drop columns that are mostly empty or irrelevant
cols_to_drop = [
    'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT',
    'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE', 'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_NAME',
    'CATEGORY', 'FLOOD_CAUSE', 'MAGNITUDE_TYPE',
    'BEGIN_RANGE', 'BEGIN_AZIMUTH', 'BEGIN_LOCATION', 'END_RANGE', 'END_AZIMUTH', 
    'END_LOCATION', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON', 'DAMAGE_PROPERTY', 
    'DAMAGE_CROPS', 'EPISODE_ID', 'STATE_FIPS', 'CZ_NAME', 'WFO', 'MAGNITUDE', 'DATA_SOURCE',
    'EPISODE_NARRATIVE','EVENT_NARRATIVE','EVENT_ID','BEGIN_DATE_TIME','CZ_TIMEZONE',
    'END_DATE_TIME','CZ_TYPE','CZ_FIPS', "END_DAY", "END_TIME", "END_YEARMONTH"
]

tornado_2004_df.drop(columns=cols_to_drop, inplace=True)

tornado_2004_df.reset_index(drop=True, inplace=True)

tornado_2004_df

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,STATE,YEAR,MONTH_NAME,EVENT_TYPE,SOURCE,TOR_F_SCALE,TOR_LENGTH,TOR_WIDTH
0,200407,24,1430,NEVADA,2004,July,Tornado,GENERAL PUBLIC,F0,0.1,200.0
1,200412,29,1210,CALIFORNIA,2004,December,Tornado,LAW ENFORCEMENT,F0,0.2,3.0
2,200412,9,538,MISSISSIPPI,2004,December,Tornado,NWS STORM SURVEY,F0,0.2,50.0
3,200412,7,1015,INDIANA,2004,December,Tornado,NWS STORM SURVEY,F0,1.5,75.0
4,200402,27,2330,HAWAII,2004,February,Tornado,NWS STORM SURVEY,F0,5.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...
1942,200412,7,237,MISSISSIPPI,2004,December,Tornado,NWS STORM SURVEY,F0,0.7,100.0
1943,200412,9,406,MISSISSIPPI,2004,December,Tornado,NWS STORM SURVEY,F1,16.0,300.0
1944,200412,9,430,MISSISSIPPI,2004,December,Tornado,NWS STORM SURVEY,F0,3.0,100.0
1945,200412,7,214,MISSISSIPPI,2004,December,Tornado,NWS STORM SURVEY,F2,1.0,100.0


# Renaming columns and dropping old names

In [6]:
tornado_2004_df['BEGIN_YEARMONTH'] = tornado_2004_df['BEGIN_YEARMONTH'].astype(str) #CONVERT TO STRING!!!!

tornado_2004_df['Year'] = tornado_2004_df['BEGIN_YEARMONTH'].str[:4].astype(int)   # USING SPLICING // first 4 digits
tornado_2004_df['Month'] = tornado_2004_df['BEGIN_YEARMONTH'].str[4:6].astype(int) # USING SPLICING // last 2 digits
tornado_2004_df.drop(columns=['BEGIN_YEARMONTH'], inplace=True) #DROP OLD COLUMN NAME

tornado_2004_df.rename(columns={
    'BEGIN_TIME':'TIME',
    'MONTH_NAME' : 'MONTH',
    'BEGIN_DAY' : "DAY"
},inplace=True)

tornado_2004_df.head()

Unnamed: 0,DAY,TIME,STATE,YEAR,MONTH,EVENT_TYPE,SOURCE,TOR_F_SCALE,TOR_LENGTH,TOR_WIDTH,Year,Month
0,24,1430,NEVADA,2004,July,Tornado,GENERAL PUBLIC,F0,0.1,200.0,2004,7
1,29,1210,CALIFORNIA,2004,December,Tornado,LAW ENFORCEMENT,F0,0.2,3.0,2004,12
2,9,538,MISSISSIPPI,2004,December,Tornado,NWS STORM SURVEY,F0,0.2,50.0,2004,12
3,7,1015,INDIANA,2004,December,Tornado,NWS STORM SURVEY,F0,1.5,75.0,2004,12
4,27,2330,HAWAII,2004,February,Tornado,NWS STORM SURVEY,F0,5.0,25.0,2004,2


# Moving The Month and Year columns to the front

In [7]:
# Move 'Month' to position 0 (first column)
month_col = tornado_2004_df.pop('Month')   # remove it temporarily
tornado_2004_df.insert(0, 'Month', month_col)  # insert at position 0
# Same thing for Month
month_col = tornado_2004_df.pop('Year')   # remove it temporarily
tornado_2004_df.insert(0, 'Year', month_col)  # insert at position 0

tornado_2004_df.head()

Unnamed: 0,Year,Month,DAY,TIME,STATE,YEAR,MONTH,EVENT_TYPE,SOURCE,TOR_F_SCALE,TOR_LENGTH,TOR_WIDTH
0,2004,7,24,1430,NEVADA,2004,July,Tornado,GENERAL PUBLIC,F0,0.1,200.0
1,2004,12,29,1210,CALIFORNIA,2004,December,Tornado,LAW ENFORCEMENT,F0,0.2,3.0
2,2004,12,9,538,MISSISSIPPI,2004,December,Tornado,NWS STORM SURVEY,F0,0.2,50.0
3,2004,12,7,1015,INDIANA,2004,December,Tornado,NWS STORM SURVEY,F0,1.5,75.0
4,2004,2,27,2330,HAWAII,2004,February,Tornado,NWS STORM SURVEY,F0,5.0,25.0


# Changing the time to a format more readable, making sure it's a 4 digit values and adding a colon

In [8]:
tornado_2004_df['TIME'] = tornado_2004_df['TIME'].astype(str).str.zfill(4) # Make sure 4 digit value
tornado_2004_df['TIME'] = tornado_2004_df['TIME'].str[:2] + ':' + tornado_2004_df['TIME'].str[2:] #Adding a colon
tornado_2004_df['TIME'] = pd.to_datetime(tornado_2004_df['TIME'], format='%H:%M')


tornado_2004_df.head()

Unnamed: 0,Year,Month,DAY,TIME,STATE,YEAR,MONTH,EVENT_TYPE,SOURCE,TOR_F_SCALE,TOR_LENGTH,TOR_WIDTH
0,2004,7,24,1900-01-01 14:30:00,NEVADA,2004,July,Tornado,GENERAL PUBLIC,F0,0.1,200.0
1,2004,12,29,1900-01-01 12:10:00,CALIFORNIA,2004,December,Tornado,LAW ENFORCEMENT,F0,0.2,3.0
2,2004,12,9,1900-01-01 05:38:00,MISSISSIPPI,2004,December,Tornado,NWS STORM SURVEY,F0,0.2,50.0
3,2004,12,7,1900-01-01 10:15:00,INDIANA,2004,December,Tornado,NWS STORM SURVEY,F0,1.5,75.0
4,2004,2,27,1900-01-01 23:30:00,HAWAII,2004,February,Tornado,NWS STORM SURVEY,F0,5.0,25.0


# Checking for null values summary (mean)

In [9]:
tornado_2004_df.isnull().mean()

Year           0.0
Month          0.0
DAY            0.0
TIME           0.0
STATE          0.0
YEAR           0.0
MONTH          0.0
EVENT_TYPE     0.0
SOURCE         0.0
TOR_F_SCALE    0.0
TOR_LENGTH     0.0
TOR_WIDTH      0.0
dtype: float64

# Checking for duplicate rows

In [10]:
tornado_2004_df[tornado_2004_df.duplicated()]
#10 DUPLICATES RETURNED
#All of the duplicates look to be different tornadoes, so not dropping any rows

Unnamed: 0,Year,Month,DAY,TIME,STATE,YEAR,MONTH,EVENT_TYPE,SOURCE,TOR_F_SCALE,TOR_LENGTH,TOR_WIDTH
25,2004,12,29,1900-01-01 00:15:00,CALIFORNIA,2004,December,Tornado,NWS STORM SURVEY,F0,0.5,5.0
374,2004,5,19,1900-01-01 17:40:00,NORTH DAKOTA,2004,May,Tornado,EMERGENCY MANAGER,F0,0.1,25.0
733,2004,5,29,1900-01-01 18:20:00,KANSAS,2004,May,Tornado,EMERGENCY MANAGER,F0,1.0,50.0
864,2004,5,29,1900-01-01 15:55:00,NEBRASKA,2004,May,Tornado,TRAINED SPOTTER,F0,0.2,10.0
964,2004,9,7,1900-01-01 08:48:00,SOUTH CAROLINA,2004,September,Tornado,NWS STORM SURVEY,F0,0.5,100.0
970,2004,10,4,1900-01-01 15:44:00,COLORADO,2004,October,Tornado,LAW ENFORCEMENT,F1,0.1,50.0
1105,2004,10,4,1900-01-01 15:28:00,COLORADO,2004,October,Tornado,GOVT OFFICIAL,F0,0.1,50.0
1481,2004,8,25,1900-01-01 12:59:00,IDAHO,2004,August,Tornado,TRAINED SPOTTER,F0,0.0,16.0
1629,2004,11,24,1900-01-01 00:33:00,MISSISSIPPI,2004,November,Tornado,NWS STORM SURVEY,F0,0.5,30.0
1713,2004,7,11,1900-01-01 16:13:00,WISCONSIN,2004,July,Tornado,AMATEUR RADIO,F0,0.1,10.0


In [11]:
tornado_2004_df.head() #Looking at dataset after dropping and renaming rows

Unnamed: 0,Year,Month,DAY,TIME,STATE,YEAR,MONTH,EVENT_TYPE,SOURCE,TOR_F_SCALE,TOR_LENGTH,TOR_WIDTH
0,2004,7,24,1900-01-01 14:30:00,NEVADA,2004,July,Tornado,GENERAL PUBLIC,F0,0.1,200.0
1,2004,12,29,1900-01-01 12:10:00,CALIFORNIA,2004,December,Tornado,LAW ENFORCEMENT,F0,0.2,3.0
2,2004,12,9,1900-01-01 05:38:00,MISSISSIPPI,2004,December,Tornado,NWS STORM SURVEY,F0,0.2,50.0
3,2004,12,7,1900-01-01 10:15:00,INDIANA,2004,December,Tornado,NWS STORM SURVEY,F0,1.5,75.0
4,2004,2,27,1900-01-01 23:30:00,HAWAII,2004,February,Tornado,NWS STORM SURVEY,F0,5.0,25.0


In [12]:
tornado_2004_df.dtypes #Checking value types

Year                    int64
Month                   int64
DAY                     int64
TIME           datetime64[ns]
STATE                  object
YEAR                    int64
MONTH                  object
EVENT_TYPE             object
SOURCE                 object
TOR_F_SCALE            object
TOR_LENGTH            float64
TOR_WIDTH             float64
dtype: object

# Changing objects to strings

In [13]:
tornado_2004_df['STATE'] = tornado_2004_df['STATE'].astype('string')
tornado_2004_df['MONTH'] = tornado_2004_df['MONTH'].astype('string')
tornado_2004_df['EVENT_TYPE'] = tornado_2004_df['EVENT_TYPE'].astype('string')
tornado_2004_df['SOURCE'] = tornado_2004_df['SOURCE'].astype('string')
tornado_2004_df['TOR_F_SCALE'] = tornado_2004_df['TOR_F_SCALE'].astype('string')
tornado_2004_df['MONTH'] = tornado_2004_df['MONTH'].astype('string')

tornado_2004_df.head()

Unnamed: 0,Year,Month,DAY,TIME,STATE,YEAR,MONTH,EVENT_TYPE,SOURCE,TOR_F_SCALE,TOR_LENGTH,TOR_WIDTH
0,2004,7,24,1900-01-01 14:30:00,NEVADA,2004,July,Tornado,GENERAL PUBLIC,F0,0.1,200.0
1,2004,12,29,1900-01-01 12:10:00,CALIFORNIA,2004,December,Tornado,LAW ENFORCEMENT,F0,0.2,3.0
2,2004,12,9,1900-01-01 05:38:00,MISSISSIPPI,2004,December,Tornado,NWS STORM SURVEY,F0,0.2,50.0
3,2004,12,7,1900-01-01 10:15:00,INDIANA,2004,December,Tornado,NWS STORM SURVEY,F0,1.5,75.0
4,2004,2,27,1900-01-01 23:30:00,HAWAII,2004,February,Tornado,NWS STORM SURVEY,F0,5.0,25.0


# Creating NEW CSV DOCUMENT with only 2004 data. Will combine....

In [14]:
tornado_2004_df.to_csv("../data/tornado_2004_cleaned.csv", index=False)

