In [106]:
# load in libraries
import pandas as pd
import os
import pdfplumber
import re
import folium

In [107]:
# Load the OurAirports airports table (every column is read).
# Columns used later in this notebook: ident, type, name, latitude_deg,
# longitude_deg, elevation_ft, iso_country, iso_region, icao_code, iata_code
# and local_code.
# NOTE(review): the preview of this frame shows `continent` as NaN for US rows.
# pandas' default NA parsing treats the literal string "NA" as missing, which
# would also affect an `iso_country` of "NA" — confirm against the raw file and,
# if so, change this read and the countries read together (keep_default_na=False).
airports_dir = os.path.join(
    os.getcwd(),
    '../data/raw_data/our_airports_raw/airports.csv',
)
airports = (
    pd.read_csv(airports_dir, encoding='utf-8')
    # give the surrogate key a table-specific name before any joins
    .rename(columns={'id': 'airport_id'})
)

In [108]:
# Preview the first five airport rows, then list every column's dtype and non-null count
print(airports.head())
airports.info()

   airport_id ident           type                  name  latitude_deg  \
0        6523   00A       heliport     Total RF Heliport     40.070985   
1      323361  00AA  small_airport  Aero B Ranch Airport     38.704022   
2        6524  00AK  small_airport          Lowell Field     59.947733   
3        6525  00AL  small_airport          Epps Airpark     34.864799   
4      506791  00AN  small_airport  Katmai Lodge Airport     59.093287   

   longitude_deg  elevation_ft continent iso_country iso_region  municipality  \
0     -74.933689          11.0       NaN          US      US-PA      Bensalem   
1    -101.473911        3435.0       NaN          US      US-KS         Leoti   
2    -151.692524         450.0       NaN          US      US-AK  Anchor Point   
3     -86.770302         820.0       NaN          US      US-AL       Harvest   
4    -156.456699          80.0       NaN          US      US-AK   King Salmon   

  scheduled_service icao_code iata_code gps_code local_code  \
0    

In [109]:
# Load the OurAirports runways table (every column is read).
# Columns used later in this notebook: airport_ident (join key back to
# airports.ident), length_ft and surface.
runways_dir = os.path.join(
    os.getcwd(),
    '../data/raw_data/our_airports_raw/runways.csv',
)
runways = (
    pd.read_csv(runways_dir, encoding='utf-8')
    # give the surrogate key a table-specific name before any joins
    .rename(columns={'id': 'runways_id'})
)

In [110]:
# Preview the first five runway rows, then list every column's dtype and non-null count
print(runways.head())
runways.info()

   runways_id  airport_ref airport_ident  length_ft  width_ft surface  \
0      269408         6523           00A       80.0      80.0  ASPH-G   
1      255155         6524          00AK     2500.0      70.0    GRVL   
2      254165         6525          00AL     2300.0     200.0    TURF   
3      506792       506791          00AN     4517.0      60.0     GVL   
4      322128       322127          00AS     1450.0      60.0    Turf   

   lighted  closed le_ident  le_latitude_deg  le_longitude_deg  \
0        1       0       H1              NaN               NaN   
1        0       0        N              NaN               NaN   
2        0       0       01              NaN               NaN   
3        0       0        3              NaN               NaN   
4        0       0        1              NaN               NaN   

   le_elevation_ft  le_heading_degT  le_displaced_threshold_ft he_ident  \
0              NaN              NaN                        NaN      NaN   
1             

In [111]:
# Load the OurAirports countries table (every column is read).
# Columns used later in this notebook: code (join key for airports.iso_country)
# and name.
# NOTE(review): the preview of this frame shows `continent` as NaN for Antigua
# and Anguilla. pandas' default NA parsing treats the literal string "NA" as
# missing, which would also affect a country `code` of "NA" — confirm against
# the raw file and, if so, change this read and the airports read together.
countries_dir = os.path.join(
    os.getcwd(),
    '../data/raw_data/our_airports_raw/countries.csv',
)
countries = (
    pd.read_csv(countries_dir, encoding='utf-8')
    # give the surrogate key a table-specific name before any joins
    .rename(columns={'id': 'countries_id'})
)

In [112]:
# Preview the first five country rows, then list every column's dtype and non-null count
print(countries.head())
countries.info()

   countries_id code                  name continent  \
0        302672   AD               Andorra        EU   
1        302618   AE  United Arab Emirates        AS   
2        302619   AF           Afghanistan        AS   
3        302722   AG   Antigua and Barbuda       NaN   
4        302723   AI              Anguilla       NaN   

                                      wikipedia_link  \
0              https://en.wikipedia.org/wiki/Andorra   
1  https://en.wikipedia.org/wiki/United_Arab_Emir...   
2          https://en.wikipedia.org/wiki/Afghanistan   
3  https://en.wikipedia.org/wiki/Antigua_and_Barbuda   
4             https://en.wikipedia.org/wiki/Anguilla   

                                 keywords  
0                       Andorran airports  
1  UAE,مطارات في الإمارات العربية المتحدة  
2                                     NaN  
3                       Antiguan airports  
4                                     NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries,

In [113]:
# Load the OurAirports regions table (every column is read).
# Columns used later in this notebook: code (join key for airports.iso_region)
# and name.
regions_dir = os.path.join(
    os.getcwd(),
    '../data/raw_data/our_airports_raw/regions.csv',
)
regions = (
    pd.read_csv(regions_dir, encoding='utf-8')
    # give the surrogate key a table-specific name before any joins
    .rename(columns={'id': 'regions_id'})
)

In [114]:
# Preview the first five region rows, then list every column's dtype and non-null count
print(regions.head())
regions.info()

   regions_id   code local_code                        name continent  \
0      302811  AD-02         02              Canillo Parish        EU   
1      302812  AD-03         03               Encamp Parish        EU   
2      302813  AD-04         04           La Massana Parish        EU   
3      302814  AD-05         05               Ordino Parish        EU   
4      302815  AD-06         06  Sant Julià de Lòria Parish        EU   

  iso_country                                     wikipedia_link  \
0          AD              https://en.wikipedia.org/wiki/Canillo   
1          AD               https://en.wikipedia.org/wiki/Encamp   
2          AD           https://en.wikipedia.org/wiki/La_Massana   
3          AD               https://en.wikipedia.org/wiki/Ordino   
4          AD  https://en.wikipedia.org/wiki/Sant_Julià_de_Lòria   

                                 keywords  
0              Airports in Canillo Parish  
1               Airports in Encamp Parish  
2           Airports

In [115]:
# Step 1: attach runways to airports (airports.ident = runways.airport_ident).
# A left join keeps airports that have no runway record; airports with several
# runways produce one output row per runway.
airports_joined = airports.merge(
    runways,
    how='left',
    left_on='ident',
    right_on='airport_ident',
)

In [116]:
# Step 2: look up the country name (airports.iso_country = countries.code).
# The airport's own `name` keeps its label; the colliding country column comes
# out of the merge as `name_country` and is then renamed to `country_name`.
country_lookup = countries[['code', 'name']]
airports_joined = (
    airports_joined
    .merge(
        country_lookup,
        how='left',
        left_on='iso_country',
        right_on='code',
        suffixes=('', '_country'),
    )
    .rename(columns={'name_country': 'country_name'})
)


In [117]:
# Step 3: look up the region name (airports.iso_region = regions.code).
# Columns that collide with ones already present receive the `_region` suffix;
# `name_region` is then renamed to `region_name`.
region_lookup = regions[['code', 'name']]
airports_joined = (
    airports_joined
    .merge(
        region_lookup,
        how='left',
        left_on='iso_region',
        right_on='code',
        suffixes=('', '_region'),
    )
    .rename(columns={'name_region': 'region_name'})
)

In [118]:
# Step 4: narrow the joined frame to the columns the rest of the notebook uses.
# `name` is the airport name (it kept its unsuffixed label through the joins);
# `scheduled_service` is intentionally left out.
columns_to_keep = [
    # identifiers
    'ident', 'iata_code', 'icao_code', 'local_code',
    # airport description
    'name', 'type',
    # position
    'latitude_deg', 'longitude_deg',
    # runway / field attributes
    'length_ft', 'elevation_ft', 'surface',
    # looked-up labels
    'country_name', 'region_name',
]
airports_joined = airports_joined.loc[:, columns_to_keep]

In [119]:
# Preview the first five rows of the joined airport/runway/country/region frame
print(airports_joined.head())

  ident iata_code icao_code local_code                  name           type  \
0   00A       NaN       NaN        00A     Total RF Heliport       heliport   
1  00AA       NaN       NaN       00AA  Aero B Ranch Airport  small_airport   
2  00AK       NaN       NaN       00AK          Lowell Field  small_airport   
3  00AL       NaN       NaN       00AL          Epps Airpark  small_airport   
4  00AN       NaN       NaN       00AN  Katmai Lodge Airport  small_airport   

   latitude_deg  longitude_deg  length_ft  elevation_ft surface  \
0     40.070985     -74.933689       80.0          11.0  ASPH-G   
1     38.704022    -101.473911        NaN        3435.0     NaN   
2     59.947733    -151.692524     2500.0         450.0    GRVL   
3     34.864799     -86.770302     2300.0         820.0    TURF   
4     59.093287    -156.456699     4517.0          80.0     GVL   

    country_name   region_name  
0  United States  Pennsylvania  
1  United States        Kansas  
2  United States       

In [120]:
# Column dtypes and non-null counts, followed by summary statistics for every
# column (include='all' adds count/unique/top/freq for the non-numeric ones)
airports_joined.info()
airports_joined.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90057 entries, 0 to 90056
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ident          90057 non-null  object 
 1   iata_code      12185 non-null  object 
 2   icao_code      11911 non-null  object 
 3   local_code     39858 non-null  object 
 4   name           90057 non-null  object 
 5   type           90057 non-null  object 
 6   latitude_deg   90057 non-null  float64
 7   longitude_deg  90057 non-null  float64
 8   length_ft      46601 non-null  float64
 9   elevation_ft   75309 non-null  float64
 10  surface        46354 non-null  object 
 11  country_name   90057 non-null  object 
 12  region_name    90057 non-null  object 
dtypes: float64(4), object(9)
memory usage: 8.9+ MB


Unnamed: 0,ident,iata_code,icao_code,local_code,name,type,latitude_deg,longitude_deg,length_ft,elevation_ft,surface,country_name,region_name
count,90057,12185,11911,39858,90057,90057,90057.0,90057.0,46601.0,75309.0,46354,90057,90057
unique,83127,9082,8761,34161,78642,7,,,,,657,246,2862
top,KORD,ORD,KORD,ORD,Centre Hospitalier Heliport,small_airport,,,,,ASP,United States,Texas
freq,11,11,11,11,49,45747,,,,,11266,36110,4292
mean,,,,,,,26.314271,-28.989821,3213.479475,1278.346307,,,
std,,,,,,,26.202426,85.425556,2695.554594,1645.79261,,,
min,,,,,,,-90.0,-179.876999,0.0,-1266.0,,,
25%,,,,,,,14.734519,-93.989403,1600.0,203.0,,,
50%,,,,,,,35.406104,-69.533819,2660.0,720.0,,,
75%,,,,,,,43.251894,21.77594,4150.0,1575.0,,,


In [121]:
# For every object/category column, print its distinct values and how many
# there are (NaN counts as a value here because `unique()` keeps it).
categorical_columns = airports_joined.select_dtypes(include=['object', 'category']).columns
for column_name in categorical_columns:
    distinct_values = airports_joined[column_name].unique()
    print(f"\nUnique values in '{column_name}':")
    print(distinct_values)
    print(len(distinct_values))


Unique values in 'ident':
['00A' '00AA' '00AK' ... 'ZZ-0002' 'ZZ-0003' 'ZZZZ']
83127

Unique values in 'iata_code':
[nan 'UTK' 'OCA' ... 'XEN' 'YNJ' 'YKH']
9083

Unique values in 'icao_code':
[nan 'HCAD' 'OATD' ... 'ZYXC' 'ZYYJ' 'ZYYK']
8762

Unique values in 'local_code':
['00A' '00AA' '00AK' ... 'YTW' '87TX' 'RJX7']
34162

Unique values in 'name':
['Total RF Heliport' 'Aero B Ranch Airport' 'Lowell Field' ...
 'Glorioso Islands Airstrip' 'Fainting Goat Airport'
 'Satsuma Iōjima Airport']
78642

Unique values in 'type':
['heliport' 'small_airport' 'seaplane_base' 'closed' 'balloonport'
 'medium_airport' 'large_airport']
7

Unique values in 'surface':
['ASPH-G' nan 'GRVL' 'TURF' 'GVL' 'Turf' 'GRAVEL' 'ASPH' 'TURF-F' 'MATS'
 'CONC' 'TURF-G' 'CON' 'Turf/Dirt' 'TURF-P' 'GRAVEL-F' 'ASPH-TRTD'
 'TURF-GRVL' 'WATER' 'ASPH-TURF' 'DIRT' 'CONC-G' 'DIRT-P' 'DIRT-TURF-G'
 'PSP' 'CONC-TURF' 'Dirt' 'DIRT-G' 'TURF-DIRT' 'ASP' 'GRVL-DIRT' 'DIRT-F'
 'GRVL-G' 'ASPH-CONC-G' 'WATER-E' 'CONC-E' 'TURF-GRVL

In [122]:
# Keep only small/medium/large airports; heliports, seaplane bases,
# balloonports and closed fields are dropped. The copy detaches the result
# from `airports_joined`.
valid_types = ['large_airport', 'medium_airport', 'small_airport']
has_valid_type = airports_joined['type'].isin(valid_types)
filtered = airports_joined.loc[has_valid_type].copy()

In [123]:
# Preview the first five rows remaining after the airport-type filter
print(filtered.head())

  ident iata_code icao_code local_code                  name           type  \
1  00AA       NaN       NaN       00AA  Aero B Ranch Airport  small_airport   
2  00AK       NaN       NaN       00AK          Lowell Field  small_airport   
3  00AL       NaN       NaN       00AL          Epps Airpark  small_airport   
4  00AN       NaN       NaN       00AN  Katmai Lodge Airport  small_airport   
5  00AS       NaN       NaN       00AS        Fulton Airport  small_airport   

   latitude_deg  longitude_deg  length_ft  elevation_ft surface  \
1     38.704022    -101.473911        NaN        3435.0     NaN   
2     59.947733    -151.692524     2500.0         450.0    GRVL   
3     34.864799     -86.770302     2300.0         820.0    TURF   
4     59.093287    -156.456699     4517.0          80.0     GVL   
5     34.942803     -97.818019     1450.0        1100.0    Turf   

    country_name region_name  
1  United States      Kansas  
2  United States      Alaska  
3  United States     Alabama 

In [124]:
# Substrings that identify a paved surface in the free-text `surface` column;
# they are OR-ed into a single regular expression.
surface_types = ['asp', 'conc', 'groov', 'tar', 'tarmac', 'cem', 'pav']
pattern = '|'.join(surface_types)

# Keep rows whose lower-cased surface contains any keyword; rows with a
# missing surface are treated as non-matching (na=False).
is_paved = filtered['surface'].str.lower().str.contains(pattern, na=False)
filtered = filtered[is_paved]

def standardize_surface(surface):
    """Map a raw runway surface code onto a standard paved-surface label.

    Matching is by case-insensitive substring, and the first category that
    matches wins (so 'ASPH-CONC-G' becomes 'asphalt'):

    * 'asp' or 'pav'            -> 'asphalt'
    * 'conc', 'groov' or 'cem'  -> 'grooved concrete'
    * 'tar' or 'tarmac'         -> 'tarmac'

    Parameters
    ----------
    surface : str or missing value
        Raw surface text. Non-string input (NaN, None) is returned unchanged
        instead of raising, so the function is safe to map over a column that
        still contains missing values.

    Returns
    -------
    The standard label; the lower-cased input when no keyword matches; or the
    input itself when it is not a string.
    """
    # Missing values reach us as float NaN / None and have no .lower()
    if not isinstance(surface, str):
        return surface

    s = surface.lower()
    # Ordered: earlier categories take precedence over later ones
    categories = (
        ('asphalt', ('asp', 'pav')),
        ('grooved concrete', ('conc', 'groov', 'cem')),
        ('tarmac', ('tar', 'tarmac')),
    )
    for label, keywords in categories:
        if any(keyword in s for keyword in keywords):
            return label
    return s  # fallback to lowercase original

# Replace each raw surface code with its standardized label
filtered = filtered.assign(surface=filtered['surface'].map(standardize_surface))

# filter for asphalt, tarmac, and grooved concrete
# surface_types = [
  #   'asp', 'conc', 'groov', 'tar', 'tarmac', 'cem', 'pav'
    # 'ASP', 'ASF', 'CONC-G', 'CONC-TURF-G', 'CONCRETE - GROOVED',
    # 'CONC-GRVD', 'CONCRETE/GROOVED', 'CONC-TRTD', 'GROO'
# ]

# pattern = '|'.join(surface_types)

# filtered = filtered[filtered['surface'].str.lower().str.contains(pattern, na=False)]

In [125]:
# Preview the first five rows after the surface filter and standardization
print(filtered.head())

   ident iata_code icao_code local_code                          name  \
7   00CA       NaN       NaN       00CA       Goldstone (GTS) Airport   
32  00NC       NaN       NaN       00NC         North Raleigh Airport   
64  01CL       NaN       NaN       01CL     Swansboro Country Airport   
93  01MT       NaN       NaN       01MT  Crystal Lakes Resort Airport   
96  01NC       NaN       NaN       01NC               Topsail Airpark   

             type  latitude_deg  longitude_deg  length_ft  elevation_ft  \
7   small_airport     35.354740    -116.885329     6000.0        3038.0   
32  small_airport     36.085201     -78.371399     2650.0         348.0   
64  small_airport     38.799900    -120.734001     3100.0        2594.0   
93  small_airport     48.789101    -114.879997     5000.0        3141.0   
96  small_airport     34.475300     -77.581398     2000.0          65.0   

    surface   country_name     region_name  
7   asphalt  United States      California  
32  asphalt  United 

In [126]:
# Keep runway rows that are at least 5000 ft long (rows with a missing length
# compare as False and are dropped)
is_long_enough = filtered['length_ft'].ge(5000)
filtered = filtered[is_long_enough]

In [127]:
# Restrict to United States airports whose region is one of the 50 states
# (contiguous US plus Alaska and Hawaii). Names are compared lower-cased and
# stripped of surrounding whitespace.
# NOTE(review): 'district of columbia' is not in this list — confirm that
# excluding it is intended.
valid_regions = [
    'alaska', 'hawaii',
    'alabama', 'arizona', 'arkansas', 'california',
    'colorado', 'connecticut', 'delaware', 'florida',
    'georgia', 'idaho', 'illinois', 'indiana',
    'iowa', 'kansas', 'kentucky', 'louisiana',
    'maine', 'maryland', 'massachusetts', 'michigan',
    'minnesota', 'mississippi', 'missouri', 'montana',
    'nebraska', 'nevada', 'new hampshire', 'new jersey',
    'new mexico', 'new york', 'north carolina', 'north dakota',
    'ohio', 'oklahoma', 'oregon', 'pennsylvania',
    'rhode island', 'south carolina', 'south dakota', 'tennessee',
    'texas', 'utah', 'vermont', 'virginia',
    'washington', 'west virginia', 'wisconsin', 'wyoming',
]

country_normalized = filtered['country_name'].str.strip().str.lower()
region_normalized = filtered['region_name'].str.strip().str.lower()
filtered = filtered[
    country_normalized.eq('united states') & region_normalized.isin(valid_regions)
]

In [128]:
# Only include airports in the National Plan of Integrated Airport Systems
# (NPIAS) and active military airports.
npias_df = pd.read_excel("../data/raw_data/npias.xlsx", sheet_name="All NPIAS Airports")

# Unique NPIAS location identifiers, normalized (stripped, lower-cased)
npias_codes = npias_df['LocID'].dropna().astype(str).str.strip().str.lower().unique()

# Read the military airports sheet and normalize its identifiers the same way
military_df = pd.read_excel("../data/raw_data/military_airports.xlsx")
military_codes = (
    military_df['ICAO or FAA LID'].dropna().astype(str).str.strip().str.lower().unique()
)

# Combine both code lists into a single set for fast lookup
valid_codes = set(npias_codes) | set(military_codes)

# An airport qualifies when ANY of its four identifier columns is a listed code.
# Missing identifiers are excluded explicitly: astype(str) turns NaN into the
# text "nan", which would otherwise match a listed code spelled "NAN".
code_columns = ['ident', 'iata_code', 'icao_code', 'local_code']
matches = pd.Series(False, index=filtered.index)
for code_column in code_columns:
    raw_codes = filtered[code_column]
    normalized_codes = raw_codes.astype(str).str.strip().str.lower()
    matches |= raw_codes.notna() & normalized_codes.isin(valid_codes)

# filter
filtered = filtered[matches]

In [129]:
# Order rows by airport and then by runway length so each airport's runways
# are always listed shortest-first
filtered_sorted = filtered.sort_values(by=['ident', 'length_ft'])

# Collapse one airport's runway rows into paired comma-separated strings
def aggregate_runway_info(group):
    """Summarize the runways of a single airport.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for one airport, with `length_ft` and `surface` columns
        (one row per runway).

    Returns
    -------
    pandas.Series
        `runway_lengths_ft` — integer lengths joined by commas, and
        `runway_surfaces` — the matching surfaces joined by commas, in the
        same row order. A missing surface contributes an empty entry.
    """
    # Drop runways without a length as whole rows, so the i-th surface always
    # belongs to the i-th length (dropping lengths alone would shift the pairing)
    measured = group.dropna(subset=['length_ft'])
    lengths = measured['length_ft'].astype(int).astype(str).tolist()
    surfaces = measured['surface'].fillna('').astype(str).tolist()
    return pd.Series({
        'runway_lengths_ft': ','.join(lengths),
        'runway_surfaces': ','.join(surfaces)
    })

# Build one row per airport with its comma-joined runway lengths and surfaces.
# Only the two runway columns are passed to the aggregator: letting
# groupby.apply operate on the grouping column ('ident') is deprecated in
# recent pandas and emits a warning.
runway_info = (
    filtered_sorted
    .groupby('ident')[['length_ft', 'surface']]
    .apply(aggregate_runway_info)
    .reset_index()
)

# Collapse to one row per airport (the first row after sorting), drop the
# per-runway columns, and attach the aggregated runway strings instead
filtered = (
    filtered_sorted
    .drop_duplicates(subset=['ident'])
    .drop(columns=['length_ft', 'surface'], errors='ignore')
    .merge(runway_info, on='ident', how='left')
)

# Remove airports not in the continental United States, Alaska, or Hawaii
filtered = filtered[
    (filtered['country_name'].str.strip().str.lower() == 'united states') &
    (filtered['region_name'].str.strip().str.lower().isin(valid_regions))
]

# Assuming your DataFrame is called filtered and has multiple rows per airport,
# each with a 'length_ft' value for one runway.

# Group by 'ident' (airport id), aggregate runway lengths into a comma-separated string:
# runway_lengths = filtered.groupby('ident')['length_ft'] \
#                         .apply(lambda x: ','.join(x.dropna().astype(int).astype(str))) \
#                        .reset_index()

# Now get one row per airport from filtered (drop duplicates keeping first):
# filtered = filtered.drop_duplicates(subset=['ident']).copy()

# Merge the aggregated runway lengths back into the single-row airport data:
# filtered = filtered.drop(columns=['length_ft'])  # drop original length_ft column
# filtered = filtered.merge(runway_lengths, on='ident', how='left')

# Rename length_ft column to something more descriptive:
# filtered = filtered.rename(columns={'length_ft': 'runway_lengths'})


  runway_info = filtered_sorted.groupby('ident').apply(aggregate_runway_info).reset_index()


In [130]:
# Collect the names of Airtanker Bases from the 2018 PMS 507 directory:
# https://ftp.wildfire.gov/public/incident_specific_data/n_rockies/IncidentAviationManagers/AirTanker-Retardant/pms507-ATB-directory2018.pdf
pdf_path = "../data/raw_data/pms507-ATB-directory2018.pdf"

# A base heading looks like "San Bernardino International Airport–Elevation: 1,159’";
# everything on that line before "–Elevation" is taken as the airport name.
# Only the first such heading on each page is used (re.search stops at the
# first match) — NOTE(review): assumes one base per page; confirm.
airtanker_base_names = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            # page with no extractable text
            continue
        match = re.search(r"^(.*?)–Elevation", text, re.MULTILINE)
        if not match:
            continue
        # normalize: trim whitespace and lower-case
        airtanker_base_names.append(match.group(1).strip().lower())

# Deduplicate; the set is also what the name lookup below tests membership against
airtanker_base_names = set(airtanker_base_names)

# reassign
# filtered = airports_joined.copy()

# Row-level predicate: does this airport's name appear in the directory?
def is_airtanker_base_by_name(row):
    """Return True when the row's airport name is a known airtanker base.

    The `name` value is converted to text, stripped and lower-cased, then
    looked up in the module-level `airtanker_base_names` set (which holds
    names normalized the same way).
    """
    normalized_name = str(row['name']).strip().lower()
    return normalized_name in airtanker_base_names

# Add the boolean `airtanker_base` flag, evaluated once per airport row
filtered = filtered.assign(
    airtanker_base=filtered.apply(is_airtanker_base_by_name, axis=1)
)

# Keep a copy of all airtanker base rows before any filtering
# airtanker_bases = filtered[filtered['airtanker_base'] == 1].copy()
# airtanker_bases.shape

In [131]:
# airtanker_bases

In [132]:
# Find airtanker bases NOT in filtered by 'ident'
# missing_airtanker_bases = airtanker_bases[~airtanker_bases['ident'].isin(filtered['ident'])]

# Concatenate only the missing airtanker bases back into filtered
# filtered = pd.concat([filtered, missing_airtanker_bases], ignore_index=True)

In [133]:
# Sanity checks on the per-airport frame: how many airports were flagged as
# airtanker bases, whether any `ident` repeats, and where values are missing.
num_airtanker_bases = filtered['airtanker_base'].sum()
duplicate_count = filtered['ident'].duplicated().sum()
null_counts = filtered.isna().sum()

print(f"Airtanker bases found: {num_airtanker_bases}")
print(f"Number of duplicate 'ident' values: {duplicate_count}")
print("Null values in each column:")
# columns with the most missing values first
print(null_counts.sort_values(ascending=False))

Airtanker bases found: 45
Number of duplicate 'ident' values: 0
Null values in each column:
icao_code            429
iata_code            388
ident                  0
local_code             0
name                   0
type                   0
latitude_deg           0
longitude_deg          0
elevation_ft           0
country_name           0
region_name            0
runway_lengths_ft      0
runway_surfaces        0
airtanker_base         0
dtype: int64


In [134]:
# Columns to keep 
# columns_to_keep = [
   #  'ident', 'iata_code', 'icao_code', 'local_code', 'runway_lengths_ft',
   #  'name', 'type', 'runway_surfaces', # 'scheduled_service',
   #  'country_name', 'region_name', 'latitude_deg', 'longitude_deg'
# ]

# keep only the select few columns
# filtered = filtered[columns_to_keep]

In [135]:
# Save the per-airport frame (with aggregated runway columns) as CSV,
# without the DataFrame index
joined_csv_path = '../data/processed_data/airports_runways_joined.csv'
filtered.to_csv(joined_csv_path, index=False)

In [136]:
# Airport-only table: the per-airport frame without the aggregated runway
# columns, in a fixed column order. Selecting the columns explicitly both
# leaves the runway columns out and sets the order; `filtered` is not modified.
airport_columns = [
    'ident', 'iata_code', 'icao_code', 'local_code', 'name',
    'type', 'latitude_deg', 'longitude_deg', 'elevation_ft',
    'country_name', 'region_name', 'airtanker_base',
]
airports_processed = filtered.loc[:, airport_columns]

# Write to CSV without the DataFrame index
airports_processed.to_csv('../data/processed_data/airports_processed.csv', index=False)

In [137]:
# Runway-only table: one row per runway, taken from the sorted multi-row frame
# that existed before runways were aggregated per airport.
runways_processed = (
    filtered_sorted[['ident', 'length_ft', 'surface']]
    .rename(columns={'length_ft': 'runway_length', 'surface': 'runway_surface'})
    # renumber rows 0..n-1 so the ID below is a clean running sequence
    .reset_index(drop=True)
)

# Simple 1-based integer runway ID as the first column
runways_processed.insert(0, 'runway_id', runways_processed.index + 1)

# Write to CSV without the DataFrame index
runways_processed.to_csv('../data/processed_data/runways_processed.csv', index=False)


In [138]:
# Display the final per-airport frame (rich table output; long frames are truncated)
filtered

Unnamed: 0,ident,iata_code,icao_code,local_code,name,type,latitude_deg,longitude_deg,elevation_ft,country_name,region_name,runway_lengths_ft,runway_surfaces,airtanker_base
0,0R7,,,0R7,The Red River Airport,small_airport,31.990700,-93.307404,177.0,United States,Louisiana,5000,asphalt,False
1,5K2,,,5K2,Tribune Municipal Airport,small_airport,38.450901,-101.750135,3620.0,United States,Kansas,5000,grooved concrete,False
2,C56,,,C56,Bult Field,small_airport,41.377602,-87.681396,790.0,United States,Illinois,5001,grooved concrete,False
3,D38,IUA,,IUA,Canandaigua Airport,small_airport,42.908902,-77.325226,814.0,United States,New York,5500,asphalt,False
4,FWB,,,FWB,Branson West Airport,small_airport,36.698497,-93.402249,1348.0,United States,Missouri,5000,grooved concrete,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1389,PHMU,MUE,PHMU,MUE,Waimea Kohala Airport,medium_airport,20.001301,-155.667999,2671.0,United States,Hawaii,5197,asphalt,False
1390,PHNL,HNL,PHNL,HNL,Daniel K Inouye International Airport,large_airport,21.320620,-157.924228,13.0,United States,Hawaii,695290001200012300,"asphalt,asphalt,asphalt,asphalt",False
1391,PHNY,LNY,PHNY,LNY,Lanai Airport,medium_airport,20.785675,-156.951324,1308.0,United States,Hawaii,5001,asphalt,False
1392,PHOG,OGG,PHOG,OGG,Kahului International Airport,large_airport,20.896263,-156.431837,54.0,United States,Hawaii,6995,asphalt,False


In [139]:
# Column dtypes and non-null counts of the final per-airport frame
filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1394 entries, 0 to 1393
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ident              1394 non-null   object 
 1   iata_code          1006 non-null   object 
 2   icao_code          965 non-null    object 
 3   local_code         1394 non-null   object 
 4   name               1394 non-null   object 
 5   type               1394 non-null   object 
 6   latitude_deg       1394 non-null   float64
 7   longitude_deg      1394 non-null   float64
 8   elevation_ft       1394 non-null   float64
 9   country_name       1394 non-null   object 
 10  region_name        1394 non-null   object 
 11  runway_lengths_ft  1394 non-null   object 
 12  runway_surfaces    1394 non-null   object 
 13  airtanker_base     1394 non-null   bool   
dtypes: bool(1), float64(3), object(10)
memory usage: 143.1+ KB


In [140]:
# Summary statistics for the numeric columns (latitude, longitude, elevation)
filtered.describe()

Unnamed: 0,latitude_deg,longitude_deg,elevation_ft
count,1394.0,1394.0,1394.0
mean,38.133323,-96.259575,1548.314921
std,6.419512,17.375358,1903.408182
min,19.721399,-176.642783,-115.0
25%,33.926611,-107.747248,285.75
50%,37.5243,-91.87405,765.0
75%,41.9091,-83.424052,1797.0
max,71.285402,-67.792099,9927.0


In [141]:
# Missing values per column of the final frame, largest count first
null_counts = filtered.isna().sum()
print("Null values in each column:")
print(null_counts.sort_values(ascending=False))

Null values in each column:
icao_code            429
iata_code            388
ident                  0
local_code             0
name                   0
type                   0
latitude_deg           0
longitude_deg          0
elevation_ft           0
country_name           0
region_name            0
runway_lengths_ft      0
runway_surfaces        0
airtanker_base         0
dtype: int64


In [142]:
# Interactive map of the selected airports: one circle per airport, purple for
# airtanker bases and blue otherwise, with every column shown in the popup.
columns = filtered.columns.tolist()

# Only rows with both coordinates can be placed on the map
locations = filtered.dropna(subset=['latitude_deg', 'longitude_deg'])

# Start centered on the contiguous United States
m = folium.Map(location=[39.5, -98.35], zoom_start=4)

for _, airport in locations.iterrows():
    # Popup body: one "<b>column:</b> value" line per column
    popup_lines = []
    for column_name in columns:
        popup_lines.append(f"<b>{column_name}:</b> {airport[column_name]}")
    popup_html = "<br>".join(popup_lines)

    if airport.get('airtanker_base') == 1:
        color = 'purple'
    else:
        color = 'blue'

    marker = folium.CircleMarker(
        location=[airport['latitude_deg'], airport['longitude_deg']],
        radius=5,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        popup=folium.Popup(popup_html, max_width=300),
        tooltip=airport['name'],
    )
    marker.add_to(m)

# Display the map (or save with m.save("filename.html"))
m
