In [1]:
import pandas as pd
import numpy as np
import collections
import re
import sqlite3

In [2]:
# Change this to your path to the DB file!
database = "flights.db"
conn = sqlite3.connect(database)

### Load raw data 

In [3]:
# load our datasets
# These first four are used as lookup tables
airlines =     pd.read_sql('SELECT * FROM airlines', con=conn)
airports =     pd.read_sql('SELECT * FROM airports', con=conn)
cancel_codes = pd.read_sql('SELECT * FROM cancel_codes', con=conn)
carriers =     pd.read_sql('SELECT * FROM carriers', con=conn)
# This has all the data about flights
flights =      pd.read_sql('SELECT * FROM flights', con=conn)

In [4]:
#Closing the connection to the db, since we're done using it
conn.close()

### Some transforms to get it ready for analysis

In [5]:
# Turn our flights table into a list of dicts
# Normally we wouldn't do this, but for the sake of learning to deal with data
# structures in Python, we'll do it this way :)
flights = flights.to_dict(orient='records')

In [6]:
type(flights)

list

In [7]:
#flights is currently a list of dicts technically stored in an array. 
#It has 410,517 elements in it.
print(len(flights))

410517


In [8]:
#viewing and inspecting the first element.
flights[0]

{'DAY_OF_MONTH': 1,
 'DAY_OF_WEEK': 3,
 'FL_DATE': '2017-02-01',
 'UNIQUE_CARRIER': 'B6',
 'AIRLINE_ID': 20409,
 'FL_NUM': 33,
 'ORIGIN_AIRPORT_ID': 10785,
 'ORIGIN_CITY_NAME': 'Burlington, VT',
 'ORIGIN_STATE_NM': 'Vermont',
 'DEST_AIRPORT_ID': 12478,
 'DEST_CITY_NAME': 'New York, NY',
 'DEST_STATE_NM': 'New York',
 'DEP_TIME': 1857.0,
 'DEP_DELAY': -10.0,
 'TAXI_OUT': 16.0,
 'WHEELS_OFF': 1913.0,
 'WHEELS_ON': 2019.0,
 'TAXI_IN': 10.0,
 'ARR_TIME': 2029.0,
 'ARR_DELAY': -8.0,
 'CANCELLED': 0,
 'CANCELLATION_CODE': None,
 'DIVERTED': 0,
 'ACTUAL_ELAPSED_TIME': 92.0,
 'FLIGHTS': 1,
 'DISTANCE': 266,
 'CARRIER_DELAY': nan,
 'WEATHER_DELAY': nan,
 'NAS_DELAY': nan,
 'SECURITY_DELAY': nan,
 'LATE_AIRCRAFT_DELAY': nan,
 'Unnamed: 31': None}

### HW 1: Answer the following questions about the `flights` :

What datatype is our `flights` object now (a list, dict, etc)?

How many elements are in it?

How can we look at/inspect the first element?

### HW 2: Create dictionaries from our lookup tables, making the `Code` the key and the `Description` the value

In [9]:
cancel_codes_dict= dict(zip(cancel_codes.Code, cancel_codes.Description))

# Go ahead and create more dictionaries for airlines, airports and carriers where
# for each the key will be the `Code` and the value will be the `Description`

airlines_dict = dict(zip(airlines.Code, airlines.Description))

airports_dict = dict(zip(airports.Code, airports.Description))

carriers_dict = dict(zip(carriers.Code, carriers.Description))
# When you're done there should be four dicts (cancel_codes_dict, airlines_dict, airports_dict, carriers_dict)

### HW 3: Add in our dictionary mappings to our `flights` data
Using our mappings, please create the following new keys in each 'row' of flights with the appropriate value:
 - AIRLINE_NAME
 - ORIGIN_AIRPORT_NAME
 - CARRIER_NAME
 - CANCELLATION_REASON

In [10]:
cancel_codes_dict

{'A': 'Carrier', 'B': 'Weather', 'C': 'National Air System', 'D': 'Security'}

In [11]:
cancel_codes_dict['None'] = 'None'
cancel_codes_dict

{'A': 'Carrier',
 'B': 'Weather',
 'C': 'National Air System',
 'D': 'Security',
 'None': 'None'}

In [12]:
for row_dict in flights:
    carrier_id = row_dict['UNIQUE_CARRIER']
    carrier_nm = carriers_dict[carrier_id]
    row_dict['CARRIER_NAME'] = carriers_dict.get(row_dict['UNIQUE_CARRIER'])
    
    airline_id = row_dict['AIRLINE_ID']
    airline_nm = airlines_dict[airline_id]
    row_dict['AIRLINE_NAME'] = airlines_dict.get(row_dict['AIRLINE_ID'])
    
    origin_id = row_dict['ORIGIN_AIRPORT_ID']
    origin_nm = airports_dict[origin_id]
    row_dict['ORIGIN_AIRPORT_NAME'] = airports_dict.get(row_dict['ORIGIN_AIRPORT_ID'])
    
    #cancellation_cd = row_dict['CANCELLATION_CODE']
    #cancellation_nm = cancel_codes_dict[cancellation_cd]
    #row_dict['CANCELLATION_REASON'] = cancel_codes_dict.get(row_dict['CANCELLATION_CODE'])
    #I added 'None' to the dictionary, but it did not work the way I expected.
    #Stuck here. Moving on to HW #2

In [13]:
flights[0]

{'DAY_OF_MONTH': 1,
 'DAY_OF_WEEK': 3,
 'FL_DATE': '2017-02-01',
 'UNIQUE_CARRIER': 'B6',
 'AIRLINE_ID': 20409,
 'FL_NUM': 33,
 'ORIGIN_AIRPORT_ID': 10785,
 'ORIGIN_CITY_NAME': 'Burlington, VT',
 'ORIGIN_STATE_NM': 'Vermont',
 'DEST_AIRPORT_ID': 12478,
 'DEST_CITY_NAME': 'New York, NY',
 'DEST_STATE_NM': 'New York',
 'DEP_TIME': 1857.0,
 'DEP_DELAY': -10.0,
 'TAXI_OUT': 16.0,
 'WHEELS_OFF': 1913.0,
 'WHEELS_ON': 2019.0,
 'TAXI_IN': 10.0,
 'ARR_TIME': 2029.0,
 'ARR_DELAY': -8.0,
 'CANCELLED': 0,
 'CANCELLATION_CODE': None,
 'DIVERTED': 0,
 'ACTUAL_ELAPSED_TIME': 92.0,
 'FLIGHTS': 1,
 'DISTANCE': 266,
 'CARRIER_DELAY': nan,
 'WEATHER_DELAY': nan,
 'NAS_DELAY': nan,
 'SECURITY_DELAY': nan,
 'LATE_AIRCRAFT_DELAY': nan,
 'Unnamed: 31': None,
 'CARRIER_NAME': 'JetBlue Airways',
 'AIRLINE_NAME': 'JetBlue Airways: B6',
 'ORIGIN_AIRPORT_NAME': 'Burlington, VT: Burlington International'}

# Homework 2

In [14]:
#What are the unique airlines in the list of flights?
unique_airline = { row['AIRLINE_NAME'] for row in flights}

In [15]:
unique_airline

{'Alaska Airlines Inc.: AS',
 'American Airlines Inc.: AA',
 'Delta Air Lines Inc.: DL',
 'ExpressJet Airlines Inc.: EV',
 'Frontier Airlines Inc.: F9',
 'Hawaiian Airlines Inc.: HA',
 'JetBlue Airways: B6',
 'SkyWest Airlines Inc.: OO',
 'Southwest Airlines Co.: WN',
 'Spirit Air Lines: NK',
 'United Air Lines Inc.: UA',
 'Virgin America: VX'}

In [16]:
x = { row['DEP_DELAY'] for row in flights}

In [17]:
x

{0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 nan,
 15.0,
 14.0,
 17.0,
 18.0,
 19.0,
 16.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 nan,
 31.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 nan,
 32.0,
 40.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 62.0,
 63.0,
 64.0,
 65.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 77.0,
 78.0,
 79.0,
 80.0,
 81.0,
 82.0,
 83.0,
 84.0,
 85.0,
 86.0,
 87.0,
 88.0,
 89.0,
 90.0,
 91.0,
 92.0,
 93.0,
 94.0,
 95.0,
 96.0,
 97.0,
 98.0,
 99.0,
 100.0,
 101.0,
 102.0,
 20.0,
 103.0,
 105.0,
 104.0,
 106.0,
 108.0,
 109.0,
 110.0,
 111.0,
 112.0,
 113.0,
 114.0,
 115.0,
 116.0,
 117.0,
 118.0,
 119.0,
 120.0,
 121.0,
 122.0,
 123.0,
 124.0,
 125.0,
 126.0,
 127.0,
 128.0,
 129.0,
 130.0,
 131.0,
 132.0,
 133.0,
 134.0,
 135.0,
 136.0,
 137.0,
 138.0,
 1

In [18]:
long_delay = [ row for row in flights if row['DEP_DELAY'] > 1000]

In [19]:
len(long_delay)

55

In [20]:
long_delay_pick = [ row for row in flights if row['DEP_DELAY'] > 1400]

In [21]:
len(long_delay_pick)

6

In [22]:
long_delay_pick

[{'DAY_OF_MONTH': 7,
  'DAY_OF_WEEK': 2,
  'FL_DATE': '2017-02-07',
  'UNIQUE_CARRIER': 'AA',
  'AIRLINE_ID': 19805,
  'FL_NUM': 2195,
  'ORIGIN_AIRPORT_ID': 12441,
  'ORIGIN_CITY_NAME': 'Jackson, WY',
  'ORIGIN_STATE_NM': 'Wyoming',
  'DEST_AIRPORT_ID': 11298,
  'DEST_CITY_NAME': 'Dallas/Fort Worth, TX',
  'DEST_STATE_NM': 'Texas',
  'DEP_TIME': 1242.0,
  'DEP_DELAY': 1410.0,
  'TAXI_OUT': 16.0,
  'WHEELS_OFF': 1258.0,
  'WHEELS_ON': 1552.0,
  'TAXI_IN': 6.0,
  'ARR_TIME': 1558.0,
  'ARR_DELAY': 1382.0,
  'CANCELLED': 0,
  'CANCELLATION_CODE': None,
  'DIVERTED': 0,
  'ACTUAL_ELAPSED_TIME': 136.0,
  'FLIGHTS': 1,
  'DISTANCE': 1047,
  'CARRIER_DELAY': 76.0,
  'WEATHER_DELAY': 0.0,
  'NAS_DELAY': 0.0,
  'SECURITY_DELAY': 0.0,
  'LATE_AIRCRAFT_DELAY': 1306.0,
  'Unnamed: 31': None,
  'CARRIER_NAME': 'American Airlines Inc.',
  'AIRLINE_NAME': 'American Airlines Inc.: AA',
  'ORIGIN_AIRPORT_NAME': 'Jackson, WY: Jackson Hole'},
 {'DAY_OF_MONTH': 10,
  'DAY_OF_WEEK': 5,
  'FL_DATE': '2017-

In [33]:
#Which flight had the longest departure delay?
longest_delay=None
max_idx=None

for idx, num in long_delay_pick:
    if (longest_delay is None or num > longest_delay):
        longest_delay = num
        max_idx = idx
        
        
    print('Maximum value:' longest_delay, 'at index:', max_idx)

SyntaxError: invalid syntax (Temp/ipykernel_27800/4194098965.py, line 11)