In [37]:
import pandas as pd
import numpy as np

from sqlalchemy import create_engine
import requests
from bs4 import BeautifulSoup

%matplotlib inline

In [36]:
# To avoid display of warnings in Jupyter Notebook:
import warnings
warnings.filterwarnings('ignore')

In [32]:
# Root of the NOAA Storm Events bulk-download directory; the yearly CSV file
# names scraped from this listing are appended to it to build download URLs.
link = "https://www1.ncdc.noaa.gov/pub/data/swdi/stormevents/csvfiles/"

# A timeout keeps the cell from hanging forever if the server stops answering,
# and raise_for_status() fails loudly on a 4xx/5xx reply instead of letting an
# error page be parsed as if it were the directory listing.
response = requests.get(link, timeout=60)
response.raise_for_status()
html = response.text

In [34]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, so the parsed tree can differ between machines.
soup = BeautifulSoup(html, "html.parser")

# find_all is the current spelling of the legacy findAll alias. The filter is
# kept exactly as before: anchor tags matched against an empty "class" value.
elements = soup.find_all("a", {"class": ""})

In [35]:

# Tornado rows from every yearly "details" file, one dataframe per file.
StormEventDetails_allyears = []

# The first 6 elements and the last 2 have to be discarded:
for element in elements[6:-2]:

    # Selecting only the storm events tables (one for each year)
    filename = element.attrs['href']
    if not filename.startswith('StormEvents_details'):
        continue

    StormEventDetails_url = link + filename

    # Creating an iterator in order to load the file in chunks of 1000 rows,
    # so that each chunk can be filtered before the pieces are combined:
    iter_csv = pd.read_csv(StormEventDetails_url, compression='gzip', iterator=True,
                           chunksize=1000)

    # Keeping the tornado events of each chunk. The column is converted to str
    # before lower-casing so that a missing EVENT_TYPE (NaN, i.e. a float) is
    # simply filtered out instead of raising AttributeError on .lower():
    tornado_chunks = [
        chunk[chunk['EVENT_TYPE'].astype(str).str.lower() == 'tornado']
        for chunk in iter_csv
    ]
    StormEventDetails_allyears.append(pd.concat(tornado_chunks))

# Concatenating all the dataframes from the different years:
StormEventDetails = pd.concat(StormEventDetails_allyears)

StormEventDetails.head()


Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,...,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,DATA_SOURCE
0,195004,28,1445,195004,28,1445,,10096222,OKLAHOMA,40.0,...,0.0,,,35.12,-99.2,35.17,-99.2,,,PUB
1,195004,29,1530,195004,29,1530,,10120412,TEXAS,48.0,...,0.0,,,31.9,-98.6,31.73,-98.6,,,PUB
2,195007,5,1800,195007,5,1800,,10104927,PENNSYLVANIA,42.0,...,0.0,,,40.58,-75.7,40.65,-75.47,,,PUB
3,195007,5,1830,195007,5,1830,,10104928,PENNSYLVANIA,42.0,...,0.0,,,40.6,-76.75,,,,,PUB
4,195007,24,1440,195007,24,1440,,10104929,PENNSYLVANIA,42.0,...,0.0,,,41.63,-79.68,,,,,PUB


In [39]:
# Yearly "fatalities" and "locations" files are loaded without any row filter;
# each file is still read through a 1000-row chunk iterator and re-assembled.
StormEventFatalities_allyears = []
StormEventLocations_allyears = []

# Maps each file-name prefix to the list collecting that table's yearly frames.
yearly_frames_by_prefix = {
    'StormEvents_fatalities': StormEventFatalities_allyears,
    'StormEvents_locations': StormEventLocations_allyears,
}

for element in elements[6:-2]:
    href = element.attrs['href']
    for prefix, yearly_frames in yearly_frames_by_prefix.items():
        if href.startswith(prefix):
            csv_chunks = pd.read_csv(link + href, compression='gzip', iterator=True,
                                     chunksize=1000)
            yearly_frames.append(pd.concat(csv_chunks))
            # The prefixes are mutually exclusive, so stop at the first match.
            break

# One dataframe per table, covering all the years:
StormEventFatalities = pd.concat(StormEventFatalities_allyears)
StormEventLocations = pd.concat(StormEventLocations_allyears)

StormEventFatalities.head()

Unnamed: 0,FAT_YEARMONTH,FAT_DAY,FAT_TIME,FATALITY_ID,EVENT_ID,FATALITY_TYPE,FATALITY_DATE,FATALITY_AGE,FATALITY_SEX,FATALITY_LOCATION,EVENT_YEARMONTH
0,195001,13,525,1005198,9981922,D,01/13/1950 05:25:00,,,,195001.0
1,195002,12,1200,1005199,10049525,D,02/12/1950 12:00:00,,,,195002.0
2,195002,11,1350,1005200,10120403,D,02/11/1950 13:50:00,,,,195002.0
3,195002,12,30,1005201,10120406,D,02/12/1950 00:30:00,,,,195002.0
4,195002,12,1200,1005202,10120410,D,02/12/1950 12:00:00,,,,195002.0


In [41]:
# Preview the first five rows of the combined locations table.
StormEventLocations.head(n=5)

Unnamed: 0,YEARMONTH,EPISODE_ID,EVENT_ID,LOCATION_INDEX,RANGE,AZIMUTH,LOCATION,LATITUDE,LONGITUDE,LAT2,LON2
0,197206,990000001,990000001,1,,,LABELLE,26.77,-81.48,2677.0,-8148.0
1,197206,990000001,990000001,2,,,LABELLE,26.78,-81.48,2678.0,-8148.0
0,199603,2030059,5548852,1,,,LANGLEY,34.32,-93.83,3419.0,9350.0
1,199603,2030060,5548853,1,2.0,S,YELLVILLE,36.2,-92.68,3612.0,9241.0
2,199603,1002564,5548854,1,,,COTTER,36.27,-92.53,3616.0,9232.0


In [77]:
# Creating engine connection to my local "storms" database, using sqlalchemy.
# The URL uses the "postgresql://" scheme: the legacy "postgres://" alias was
# removed in SQLAlchemy 1.4, where it fails with NoSuchModuleError.

engine_local = create_engine('postgresql://localhost:5432/storms')



In [78]:
# Writing the tornado details dataframe to the local PostgreSQL database;
# an existing table with this name is replaced and the index is not stored.
StormEventDetails.to_sql(
    name='tornadoes_1950_mid2017',
    con=engine_local,
    if_exists='replace',
    index=False,
)

In [79]:
# Writing the fatalities dataframe to the local PostgreSQL database;
# an existing table with this name is replaced and the index is not stored.
StormEventFatalities.to_sql(
    name='fatalities_1950_mid2017',
    con=engine_local,
    if_exists='replace',
    index=False,
)

In [80]:
# Writing the locations dataframe to the local PostgreSQL database;
# an existing table with this name is replaced and the index is not stored.
StormEventLocations.to_sql(
    name='locations_1950_mid2017',
    con=engine_local,
    if_exists='replace',
    index=False,
)