# Preparing data files for open labs from raw data files

In [2]:
import pandas as pd

## Preparing data from `raw_data/Wildlife_Export_1272020.csv`

In [3]:
# Read the raw FAA wildlife-strike export into a DataFrame
raw_data = pd.read_csv('./raw_data/Wildlife_Export_1272020.csv')
# Preview the first two records
raw_data.head(2)

Unnamed: 0,INDX_NR,INCIDENT_DATE,INCIDENT_MONTH,INCIDENT_YEAR,TIME,TIME_OF_DAY,AIRPORT_ID,AIRPORT,RUNWAY,STATE,...,SIZE,NR_INJURIES,NR_FATALITIES,COMMENT,REPORTER_NAME,REPORTER_TITLE,SOURCE,PERSON,LUPDATE,TRANSFER
0,1080125,2020-11-05,11,2020,05:00,,KRDU,RALEIGH-DURHAM INTL,23R,NC,...,Small,,,,REDACTED,REDACTED,FAA Form 5200-7-E,Airport Operations,2020-12-04,False
1,1080118,2020-11-05,11,2020,22:35,,KGSO,PIEDMONT TRIAD INTL,5R,NC,...,Large,,,,REDACTED,REDACTED,FAA Form 5200-7-E,Airport Operations,2020-12-04,False


### Extract columns to create dataset of species ID and species name

This dataset is for the merge activity in open lab 2

In [4]:
# Build the species lookup table: keep the first row seen for each species ID,
# pairing the ID with its species name
strikes_species_data = raw_data.loc[:, ['SPECIES_ID', 'SPECIES']]
species_id_table = strikes_species_data.drop_duplicates(subset='SPECIES_ID')

# Strikes data with the species name removed; SPECIES_ID is kept, so the
# lookup table above can be joined back on it
strikes_no_species = raw_data.drop('SPECIES', axis=1)

In [10]:
# Write the species lookup table to a CSV file, leaving out the row index
species_id_table.to_csv(path_or_buf='FAA_Wildlife_species_id_table.csv', index=False)

### Split the strikes data (species names removed) by decade into three different file types (CSV, Excel, JSON)
These datasets are for the loading files activity in lab 1 and the DataFrame concatenation activity in lab 2

*Note: the 1990s dataset contains one record from 1985 and the 2010s dataset contains partial records from the year 2020*

In [11]:
# Split the dataset into three per-decade subsets (1990-1999, 2000-2009, 2010-2019).
# Records dated before 1990 or after 2019 fall outside all three subsets.
strikes_decades = [
    strikes_no_species.loc[
        (strikes_no_species['INCIDENT_YEAR'] >= decade_start)
        & (strikes_no_species['INCIDENT_YEAR'] < decade_start + 10)
    ]
    for decade_start in range(1990, 2020, 10)
]

In [12]:
# Add the pre-1990 records (the original dataset has one, from 1985) to the 90s data.
# pd.concat is used because DataFrame.append was removed in pandas 2.0; the 90s rows
# still come first, followed by the earlier records.
strikes_90s = pd.concat([
    strikes_decades[0],
    strikes_no_species[strikes_no_species['INCIDENT_YEAR'] < 1990],
])
# Create CSV file of data from 1990 - 1999 (actually contains one record from 1985)
strikes_90s.to_csv('FAA_Wildlife_strikes_1990-1999.csv', index=False)

# Write the 2000 - 2009 subset to an Excel workbook, leaving out the row index
strikes_decades[1].to_excel(excel_writer='FAA_Wildlife_strikes_2000-2009.xlsx', index=False)

# Combine the records dated after 2019 with the 2010-2019 subset to cover 2010-2020.
# pd.concat is used because DataFrame.append was removed in pandas 2.0; row order is
# unchanged, with the post-2019 records first.
strikes_10s = pd.concat([
    strikes_no_species[strikes_no_species['INCIDENT_YEAR'] > 2019],
    strikes_decades[2],
])
# Set index to FAA index number so the exported records are keyed by it
strikes_10s_new_index = strikes_10s.set_index('INDX_NR')
# Create JSON file of data from 2010 - 2019 (actually also contains records from 2020)
strikes_10s_new_index.to_json('FAA_Wildlife_strikes_2010-2019.json', orient='index')

## Create cleaned dataset for open lab 3

In [8]:
# Strip whitespace from column headers and convert to lowercase
column_format = raw_data.rename(columns=lambda name: name.strip().lower())
# Drop columns whose names START with dam_, str_, or reporter_. The alternatives are
# grouped so that ^ anchors all three; in '^dam_|str_|reporter_' the anchor applied
# only to dam_, so str_ and reporter_ could match anywhere in a column name.
column_drop = column_format.drop(
    columns=column_format.filter(regex='^(?:dam_|str_|reporter_)').columns)
# Drop other columns
column_drop = column_drop.drop(columns=['state', 'faaregion', 'lupdate', 'transfer'])
# Generalize the unknown species types to one type: in the species column, the text
# from 'Unknown' through the end of the value is replaced
generalize_unknown_species = column_drop.replace(
    {'species': 'Unknown.*'}, 'Unknown flying animal', regex=True)
generalize_unknown_species

Unnamed: 0,indx_nr,incident_date,incident_month,incident_year,time,time_of_day,airport_id,airport,runway,location,...,remains_sent,warned,birds_seen,birds_struck,size,nr_injuries,nr_fatalities,comment,source,person
0,1080125,2020-11-05,11,2020,05:00,,KRDU,RALEIGH-DURHAM INTL,23R,,...,True,Unknown,,1,Small,,,,FAA Form 5200-7-E,Airport Operations
1,1080118,2020-11-05,11,2020,22:35,,KGSO,PIEDMONT TRIAD INTL,5R,,...,False,Unknown,1,1,Large,,,,FAA Form 5200-7-E,Airport Operations
2,1080126,2020-11-05,11,2020,11:00,Day,KCLT,CHARLOTTE/DOUGLAS INTL ARPT,36C,,...,False,Yes,,1,,,,,FAA Form 5200-7-E,Airport Operations
3,1080130,2020-11-05,11,2020,06:52,Day,KCLT,CHARLOTTE/DOUGLAS INTL ARPT,36C,,...,False,Yes,,1,,,,,FAA Form 5200-7-E,Airport Operations
4,1078243,2020-11-04,11,2020,05:20,Dawn,KRDU,RALEIGH-DURHAM INTL,23R,,...,True,Unknown,,1,Small,,,,FAA Form 5200-7-E,Airport Operations
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4959,611224,1990-06-01,6,1990,,Day,KRDU,RALEIGH-DURHAM INTL,,,...,False,No,,2-10,Small,,,DATE = POSTMARK /Legacy Record=XXXXXX/,FAA Form 5200-7,
4960,609163,1990-05-04,5,1990,11:35,Day,KCLT,CHARLOTTE/DOUGLAS INTL ARPT,18R,,...,False,Unknown,1,1,Small,,,/Legacy Record=XXXXXX/,FAA Form 5200-7,Tower
4961,611004,1990-04-20,4,1990,,Day,KCLT,CHARLOTTE/DOUGLAS INTL ARPT,18L,,...,False,No,1,1,Medium,,,OPER = HENRY HARDING /Legacy Record=XXXXXX/,FAA Form 5200-7,
4962,828185,1990-03-21,3,1990,21:30,Night,KGSO,PIEDMONT TRIAD INTL,,,...,False,Unknown,1,1,Medium,,,SOURCE = BASH NR XXXXX /Legacy Record=XXXX/,BASH,
