# Preparing data files for open labs from raw data files

In [1]:
import pandas as pd

## Preparing data from `raw_data/Wildlife_Export_1272020.csv`

In [2]:
# Read the raw FAA wildlife export into a DataFrame
raw_data = pd.read_csv('./raw_data/Wildlife_Export_1272020.csv')
# Preview the first two records
raw_data.head(2)

Unnamed: 0,INDX_NR,INCIDENT_DATE,INCIDENT_MONTH,INCIDENT_YEAR,TIME,TIME_OF_DAY,AIRPORT_ID,AIRPORT,RUNWAY,STATE,...,SIZE,NR_INJURIES,NR_FATALITIES,COMMENT,REPORTER_NAME,REPORTER_TITLE,SOURCE,PERSON,LUPDATE,TRANSFER
0,1080125,2020-11-05,11,2020,05:00,,KRDU,RALEIGH-DURHAM INTL,23R,NC,...,Small,,,,REDACTED,REDACTED,FAA Form 5200-7-E,Airport Operations,2020-12-04,False
1,1080118,2020-11-05,11,2020,22:35,,KGSO,PIEDMONT TRIAD INTL,5R,NC,...,Large,,,,REDACTED,REDACTED,FAA Form 5200-7-E,Airport Operations,2020-12-04,False


### Extract columns to create dataset of species ID and species name

This dataset is for the merge activity in open lab 2

In [3]:
# Pull out the species ID / species name pair, then keep a single row per
# SPECIES_ID so the result works as a lookup table
strikes_species_data = raw_data.loc[:, ['SPECIES_ID', 'SPECIES']]
species_id_table = strikes_species_data.drop_duplicates(subset='SPECIES_ID')

# Write the species lookup table to CSV, leaving out the DataFrame index
species_id_table.to_csv('FAA_Wildlife_species_id_table.csv', index=False)

### Extract columns to create dataset of operator ID and operator name (i.e., airline operator)

This dataset is for the merge exercise in open lab 2

In [4]:
# Create DataFrame containing operator ID and operator name, keeping a single
# row per OPID so the result works as a lookup table.
# The table gets its own name (operator_id_table) so that it does not overwrite
# the species lookup table stored in species_id_table.
strikes_operator_data = raw_data[['OPID', 'OPERATOR']]
operator_id_table = strikes_operator_data.drop_duplicates(subset=['OPID'])

# Export CSV file that contains only operator data (DataFrame index omitted)
operator_id_table.to_csv('FAA_Wildlife_operator_id_table.csv', index=False)

### Split data without species info into three different data file types (CSV, Excel, JSON) based on decade
These datasets are for the loading files activity in lab 1 and the DataFrame concatenation activity in lab 2

*Note: the 1990s dataset contains one record from 1985 and the 2010s dataset contains partial records from the year 2020*

In [5]:
# Main strikes table: every raw column except the species and operator names
# (the SPECIES_ID and OPID columns are not dropped here)
final_main_dataframe = raw_data.drop(['SPECIES', 'OPERATOR'], axis='columns')

In [6]:
# 1990-1999 dataset (includes errant data point from 1985): every record whose
# incident year is before 2000, exported as a CSV file
strikes_90s = final_main_dataframe.loc[lambda frame: frame['INCIDENT_YEAR'] < 2000]
strikes_90s.to_csv('FAA_Wildlife_strikes_1990-1999.csv', index=False)

# 2000-2009 dataset, exported as an Excel file
strikes_00s = final_main_dataframe.loc[
    lambda frame: (frame['INCIDENT_YEAR'] >= 2000) & (frame['INCIDENT_YEAR'] < 2010)
]
strikes_00s.to_excel('FAA_Wildlife_strikes_2000-2009.xlsx', index=False)

# 2010-2019 dataset (actually contains records from 2020 as well): every record
# from 2010 onward, indexed by the FAA index number and exported as a JSON file
strikes_10s = (
    final_main_dataframe
    .loc[lambda frame: frame['INCIDENT_YEAR'] >= 2010]
    .set_index('INDX_NR')
)
strikes_10s.to_json('FAA_Wildlife_strikes_2010-2019.json')

## Create cleaned dataset for open lab 3

- Drop unnecessary columns
- Add calculated columns from lab 2 ('SINGLE_OR_MULTI_ENGINE', 'HOUR', 'MONTH_NAME')
- Create new columns for easier completion of activities ('SPECIES_GENERALIZE', 'SPECIES_TYPE')

In [36]:
# Drop every column whose name starts with DAM_, STR_, ENG_, or REPORTER_.
# The alternatives are grouped so the ^ anchor applies to all four prefixes;
# without the group only DAM_ is anchored and the other three would match
# anywhere inside a column name.
column_drop = raw_data.drop(
    columns=raw_data.filter(regex='^(?:DAM_|STR_|ENG_|REPORTER_)').columns
)
# Drop other columns
column_drop = column_drop.drop(columns=['STATE', 'FAAREGION', 'LUPDATE', 'TRANSFER'])

# Add column 'SINGLE_OR_MULTI_ENGINE': 'Single-engine' where NUM_ENGS equals 1,
# 'Multi-engine' for every other value (missing values do not compare equal
# to 1, so they also end up labelled 'Multi-engine')
column_drop['SINGLE_OR_MULTI_ENGINE'] = column_drop['NUM_ENGS'].eq(1).map(
    {True: 'Single-engine', False: 'Multi-engine'}
)

# Define a function that takes a time string in the form "HH:MM" and returns the
# hour as an integer if the hour value is valid
def calc_hour(time_str):
    """Return the hour part of an "HH:MM" time string as an integer.

    Parameters
    ----------
    time_str : str
        Time of day in the form "HH:MM".

    Returns
    -------
    int or None
        The hour as an integer, or None when ``time_str`` is not a string
        (for example a missing value), when the hour part is blank, or when
        the hour part is not a whole number.
    """
    # Non-string input (e.g. a float NaN standing in for a missing time) has
    # no .split(); treat it as "no valid hour" instead of raising
    if not isinstance(time_str, str):
        return None

    hour = time_str.split(':')[0]
    # Blank hour text such as '  ' means the time was not recorded
    if hour.strip(' ') == '':
        return None

    try:
        return int(hour)
    except ValueError:
        # Hour text that is not a whole number is not a valid hour
        return None

# Add column 'HOUR' that contains the numeric hour in which the strike occurred
column_drop['HOUR'] = column_drop['TIME'].map(calc_hour)

# Add column 'MONTH_NAME' that contains the name of the month in which a strike occurred
# NOTE(review): the 'INCIDENT_DATE ' key carries a trailing space; presumably it
# mirrors the raw CSV header exactly. Confirm before normalizing it.
column_drop['MONTH_NAME'] = pd.to_datetime(column_drop['INCIDENT_DATE ']).dt.month_name()

# Add new column 'SPECIES_GENERALIZE' containing species names, with every name
# matching 'Unknown...' collapsed to the single value 'Unknown flying animal'
column_drop['SPECIES_GENERALIZE'] = column_drop['SPECIES'].replace(
    {'Unknown.*': 'Unknown flying animal'}, regex=True)

# Add new column 'SPECIES_TYPE' that contains values indicating if wildlife was a land or flying animal.
# An ID is classed as 'Land animal' only when it STARTS with 1 or 2. The character
# class keeps both digits under the ^ anchor; the ungrouped alternation '^1|2'
# anchors only the 1 and would match any ID that merely contains a 2.
# NOTE(review): presumably IDs beginning with 1 or 2 are the land-animal codes;
# confirm against the FAA species ID scheme.
column_drop['SPECIES_TYPE'] = column_drop['SPECIES_ID'].str.contains(
    '^[12]', regex=True).replace({True: 'Land animal', False: 'Flying animal'})

# Write the cleaned dataset to CSV, leaving out the DataFrame index
column_drop.to_csv('FAA_Wildlife_strikes_clean.csv', index=False)