#### Data: https://public.tableau.com/s/sites/default/files/media/faa_data_subset.xlsx

#### We explore HDF5 storage using the FAA wildlife strikes dataset

In [1]:
import pandas as pd
import h5py

  from ._conv import register_converters as _register_converters


In [2]:
# Open the workbook and list its worksheet names so we know which sheet to load
workbook_path = 'faa_data_subset.xlsx'
sheets = pd.ExcelFile(workbook_path)
sheets.sheet_names

[u'FAA Wildlife Strikes']

In [3]:
# Load the strikes worksheet and preview the first 3 rows.
# The keyword is `sheet_name`; `sheet=` is not a read_excel parameter, so
# depending on the pandas version it is either silently ignored (the default
# first sheet is read) or rejected with a TypeError.
df = pd.read_excel('faa_data_subset.xlsx', sheet_name='FAA Wildlife Strikes')
df.head(3)

Unnamed: 0,Airport: Code,Airport: Name,Origin State,Origin State Code,Country,Aircraft: Type,Aircraft: Number of engines,Collision Date and Time,When: Time of day,When: Phase of flight,...,Days,Feet above ground,Miles from airport,Wildlife: Animal Category,Wildlife: Species Order,Wildlife: Species Group,Wildlife: Species,Wildlife: Species ID,Number of Strikes,Record ID
0,KAAF,APALACHICOLA REGIONAL ARPT,Florida,FL,United States,,,2012-09-20 19:30:00,,Take-off run,...,,0.0,,Terrestrial Mammals,Canids,"Wolves, Dogs, Foxes, Coyote",Domestic dog,1F12,1,17459
1,KAAF,APALACHICOLA REGIONAL ARPT,Florida,FL,United States,Airplane,1.0,2013-04-23 17:09:00,,Take-off run,...,,,,Birds,"Pelicans, Herons, Egrets, Bitterns, Ibises","Herons, Egrets, Bitterns","Herons, egrets, bitterns",I1,1,17114
2,KABE,LEHIGH VALLEY INTL,Pennsylvania,PA,United States,Airplane,2.0,2009-04-23 09:22:00,Day,Take-off run,...,30.0,0.0,0.0,Birds,"Caracaras, Falcons","Caracaras, Falcons",American kestrel,K5114,1,259361


In [4]:
# List all column names; the frame is wide, so this helps pick out the columns
# we do not need and can drop later.
df.columns

Index([u'Airport: Code', u'Airport: Name', u'Origin State',
       u'Origin State Code', u'Country', u'Aircraft: Type',
       u'Aircraft: Number of engines', u'Collision Date and Time',
       u'When: Time of day', u'When: Phase of flight',
       u'Effect: Amount of damage (detailed)', u'Effect: Impact to flight',
       u'Effect: Indicated Damage',
       u'Cost: Aircraft time out of service (hours)', u'Cost: Total $',
       u'Days', u'Feet above ground', u'Miles from airport',
       u'Wildlife: Animal Category', u'Wildlife: Species Order',
       u'Wildlife: Species Group', u'Wildlife: Species',
       u'Wildlife: Species ID', u'Number of Strikes', u'Record ID'],
      dtype='object')

In [5]:
# time the file read 
%timeit pd.read_excel('faa_data_subset.xlsx', sheet='FAA Wildlife Strikes')

1 loop, best of 3: 7.97 s per loop


In [6]:
# number of rows in the dataset
len(df)

28298

##### It takes about 8 seconds to read the Excel file

In [7]:
# Peek at the first 10 values of the identifier-like columns
# ('Airport: Code', 'Origin State Code', 'Wildlife: Species ID', 'Record ID');
# they are not needed for the analysis.
for column in ['Airport: Code', 'Origin State Code', 'Wildlife: Species ID', 'Record ID']:
    # Slice the underlying array first so only 10 values are converted to a list,
    # instead of materialising the whole column.
    sample = df[column].values[:10].tolist()
    if column == 'Record ID':
        # Record IDs are numeric, so show the raw list (str.join needs strings).
        print(sample)
    else:
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print(", ".join(sample))

KAAF, KAAF, KABE, KABE, KABE, KABE, KABE, KABE, KABE, KABE
FL, FL, PA, PA, PA, PA, PA, PA, PA, PA
1F12, I1, K5114, O2205, K3302, NE1, K3302, YM1102, O2205, K3302
[17459L, 17114L, 259361L, 345167L, 262782L, 208167L, 344633L, 4255L, 9581L, 1761L]


In [8]:
# Not sure what the Days column represents, so look at its most frequent values
# before deciding to ignore it.
# value_counts() is indexed by the float Days values, so `[:1]` is a label slice
# ("everything up to label 1.0"), not "the first row". head(4) picks the same
# four rows positionally and does not depend on which labels happen to exist.
df['Days'].value_counts().head(4)

0.000000    653
0.041667    639
0.083333    212
1.000000    150
Name: Days, dtype: int64

In [9]:
# Keep every column except the identifier-like ones and Days
exclude_columns = ['Airport: Code', 'Origin State Code', 'Wildlife: Species ID', 'Record ID', 'Days']
include_columns = [col for col in df.columns if col not in exclude_columns]

In [10]:
# show the retained column names as one comma-separated string
", ".join(include_columns)

u'Airport: Name, Origin State, Country, Aircraft: Type, Aircraft: Number of engines, Collision Date and Time, When: Time of day, When: Phase of flight, Effect: Amount of damage (detailed), Effect: Impact to flight, Effect: Indicated Damage, Cost: Aircraft time out of service (hours), Cost: Total $, Feet above ground, Miles from airport, Wildlife: Animal Category, Wildlife: Species Order, Wildlife: Species Group, Wildlife: Species, Number of Strikes'

In [11]:
# Preview the retained columns; show only the first 10 rows rather than
# rendering the whole frame.
df[include_columns].head(10)

Unnamed: 0,Airport: Name,Origin State,Country,Aircraft: Type,Aircraft: Number of engines,Collision Date and Time,When: Time of day,When: Phase of flight,Effect: Amount of damage (detailed),Effect: Impact to flight,Effect: Indicated Damage,Cost: Aircraft time out of service (hours),Cost: Total $,Feet above ground,Miles from airport,Wildlife: Animal Category,Wildlife: Species Order,Wildlife: Species Group,Wildlife: Species,Number of Strikes
0,APALACHICOLA REGIONAL ARPT,Florida,United States,,,2012-09-20 19:30:00,,Take-off run,,,No damage,,0,0.0,,Terrestrial Mammals,Canids,"Wolves, Dogs, Foxes, Coyote",Domestic dog,1
1,APALACHICOLA REGIONAL ARPT,Florida,United States,Airplane,1.0,2013-04-23 17:09:00,,Take-off run,,,No damage,,0,,,Birds,"Pelicans, Herons, Egrets, Bitterns, Ibises","Herons, Egrets, Bitterns","Herons, egrets, bitterns",1
2,LEHIGH VALLEY INTL,Pennsylvania,United States,Airplane,2.0,2009-04-23 09:22:00,Day,Take-off run,Medium,Aborted Take-off,Caused damage,720.0,171132,0.0,0.0,Birds,"Caracaras, Falcons","Caracaras, Falcons",American kestrel,1
3,LEHIGH VALLEY INTL,Pennsylvania,United States,Airplane,2.0,2014-04-13 22:00:00,Night,Approach,,,No damage,15.0,600,,,Birds,Pigeons and Doves,Doves,Mourning dove,1
4,LEHIGH VALLEY INTL,Pennsylvania,United States,Airplane,2.0,2009-03-31 18:15:00,Day,Approach,Medium,,Caused damage,12.0,188245,,0.0,Birds,"Hawks, Kites, Eagles, Ospreys, Vultures","Kites, Hawks, Eagles",Red-tailed hawk,1
5,LEHIGH VALLEY INTL,Pennsylvania,United States,Airplane,2.0,2002-03-19 09:09:00,Day,Take-off run,,Aborted Take-off,No damage,1.0,0,0.0,0.0,Birds,"Gulls, Terns, Sandpipers, Plovers, Skimmers","Gulls, terns, kittiwakes",Gulls,1
6,LEHIGH VALLEY INTL,Pennsylvania,United States,Airplane,3.0,2014-04-10 18:15:00,Day,Take-off run,,,No damage,1.0,0,0.0,0.0,Birds,"Hawks, Kites, Eagles, Ospreys, Vultures","Kites, Hawks, Eagles",Red-tailed hawk,1
7,LEHIGH VALLEY INTL,Pennsylvania,United States,Airplane,,2002-08-26 13:05:00,Day,Landing Roll,,Aborted Take-off,No damage,0.0,0,0.0,0.0,Birds,Perching Birds,"Crows, Ravens, Jays, Magpies",American crow,1
8,LEHIGH VALLEY INTL,Pennsylvania,United States,Airplane,,2005-08-10 18:05:00,Day,Take-off run,,,No damage,0.0,0,0.0,0.0,Birds,Pigeons and Doves,Doves,Mourning dove,1
9,LEHIGH VALLEY INTL,Pennsylvania,United States,Airplane,,2000-09-14 14:40:00,Day,Climb,,Precautionary Landing,No damage,0.0,0,300.0,,Birds,"Hawks, Kites, Eagles, Ospreys, Vultures","Kites, Hawks, Eagles",Red-tailed hawk,1


#### Creating HDF5 views at different aggregation levels helps reduce the load time

In [12]:
# Total number of strikes per (Country, Origin State), largest first.
# - The two key columns are cast to str because the HDF writer expects plain
#   strings rather than generic object columns.
#   NOTE(review): astype('str') presumably turns missing values into the string
#   'nan', which would then form its own group - confirm that is intended.
# - 'Origin State' is renamed to 'Origin_State' (space replaced by underscore).
state_country_view = (
    df[['Country', 'Origin State', 'Number of Strikes']]
    .astype({'Country': 'str', 'Origin State': 'str'})
    .rename(columns={'Origin State': 'Origin_State'})
    .groupby(['Country', 'Origin_State'])
    .agg({'Number of Strikes': 'sum'})
    .sort_values('Number of Strikes', ascending=False)
)

In [13]:
# Persist the aggregated view to data_views.h5 under the key 'state_country_view'.
# mode='w' opens the file for writing (presumably replacing any existing file - confirm).
state_country_view.to_hdf('data_views.h5', key='state_country_view', mode='w')

In [14]:
# time reading the aggregated view back from the HDF5 file (compare with the Excel read above)
%timeit pd.read_hdf('data_views.h5', key='state_country_view', mode='r')

100 loops, best of 3: 7.12 ms per loop


In [15]:
# load the stored view back into a DataFrame, opening the file read-only
state_country_df = pd.read_hdf('data_views.h5', key='state_country_view', mode='r')

In [16]:
# display the view as read back from HDF5
state_country_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Strikes
Country,Origin_State,Unnamed: 2_level_1
United States,California,3026
United States,Texas,2306
United States,Florida,2239
United States,New York,2140
United States,Pennsylvania,1286
United States,Colorado,1146
United States,Ohio,1107
United States,Illinois,1088
United States,Michigan,925
United States,Missouri,890


In [17]:
# select specific states through the (Country, Origin_State) row index:
# first level fixed to 'United States', second level restricted to the listed states
state_country_df.loc[('United States', ['North Dakota', 'Kansas', 'Montana']), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Strikes
Country,Origin_State,Unnamed: 2_level_1
United States,North Dakota,191
United States,Kansas,153
United States,Montana,89


#### HDF5 preserves the index, so the view can be queried by (Country, Origin_State) right after loading