In [1]:
import pandas as pd
from sqlalchemy import create_engine
from Resources.config import sqlpass
import datetime


In [2]:
#read the csv files in
airport = "Resources/airports.csv"
flights = "Resources/flights.csv"

#create df for airports
airport_df = pd.read_csv(airport)

#view the data
airport_df.head()


Unnamed: 0,id,ident,type,name,latitude_deg,longitude_deg,elevation_ft,continent,iso_country,iso_region,municipality,scheduled_service,gps_code,iata_code,local_code,home_link,wikipedia_link,keywords
0,6523,00A,heliport,Total Rf Heliport,40.070801,-74.933601,11.0,,US,US-PA,Bensalem,no,00A,,00A,,,
1,323361,00AA,small_airport,Aero B Ranch Airport,38.704022,-101.473911,3435.0,,US,US-KS,Leoti,no,00AA,,00AA,,,
2,6524,00AK,small_airport,Lowell Field,59.9492,-151.695999,450.0,,US,US-AK,Anchor Point,no,00AK,,00AK,,,
3,6525,00AL,small_airport,Epps Airpark,34.864799,-86.770302,820.0,,US,US-AL,Harvest,no,00AL,,00AL,,,
4,6526,00AR,closed,Newport Hospital & Clinic Heliport,35.6087,-91.254898,237.0,,US,US-AR,Newport,no,,,,,,00AR


In [3]:
#create df for flights
flights_df = pd.read_csv(flights)

#view the data
flights_df.head()


Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [4]:
#clean the airport df to only read the state of NY

NY_airports = airport_df.loc[airport_df['iso_region'] == 'US-NY']


#remove columns with duplicate information or blank

cleaned = NY_airports.drop(columns=['continent','home_link', 'wikipedia_link','keywords'], axis=1)

#remove all rows that have NaN in 'iata_code' to get rid of heleports, local hangers, ect...
airport_cleaned= cleaned[pd.notnull(cleaned['iata_code'])]
# airport_cleaned.head()


In [5]:
#clean the flight data by removing unnecessary columns
cleaned_flights = flights_df.drop(columns=['TAXI_OUT', 'WHEELS_OFF','WHEELS_ON', 'TAXI_IN', ])
cleaned_flights.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [6]:
#remove all origin airport or destination airports that are not in the airports dataframe by iata_code
NY_flights = cleaned_flights[(cleaned_flights['ORIGIN_AIRPORT'].isin(airport_cleaned['iata_code'])) | (cleaned_flights['DESTINATION_AIRPORT'].isin(airport_cleaned['iata_code']))] 
NY_flights.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
39,2015,1,1,4,B6,304,N607JB,SJU,JFK,155,...,501.0,11.0,0,0,,,,,,
40,2015,1,1,4,NK,451,N633NK,PBG,FLL,155,...,450.0,-33.0,0,0,,,,,,
44,2015,1,1,4,NK,647,N630NK,IAG,FLL,200,...,453.0,-11.0,0,0,,,,,,
95,2015,1,1,4,B6,2001,N358JB,BUF,JFK,535,...,648.0,-15.0,0,0,,,,,,
96,2015,1,1,4,B6,2807,N190JB,PWM,JFK,535,...,635.0,-25.0,0,0,,,,,,


In [7]:
#create new Date column
NY_flights['DATE'] = ""
NY_flights.head()

#Merge Year, Month and Day columns together 

NY_flights["DATE"]=NY_flights.apply(lambda x:'%s-%s-%s' % (x['YEAR'],x['MONTH'], x['DAY']),axis=1)
#convert date column to datetime format
NY_flights['DATE'] = pd.to_datetime(NY_flights['DATE'])

#Drop the Year, Month and Day columns as they are now unnecessary
NY_flights.drop(columns=['YEAR', 'MONTH','DAY'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NY_flights['DATE'] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NY_flights["DATE"]=NY_flights.apply(lambda x:'%s-%s-%s' % (x['YEAR'],x['MONTH'], x['DAY']),axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NY_flights['DATE'] = pd.to_datetime(NY_flights['DATE'])
A value is trying to be se

In [8]:
# reorder the columns
NY_flights = NY_flights[['DATE', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',\
                        'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',\
                        'SCHEDULED_DEPARTURE','DEPARTURE_TIME','DEPARTURE_DELAY',\
                        'SCHEDULED_TIME', 'ELAPSED_TIME','AIR_TIME',\
                         'DISTANCE','SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',\
                         'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED',\
                        'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY',\
                        'LATE_AIRCRAFT_DELAY','WEATHER_DELAY']]


NY_flights.head(50)

Unnamed: 0,DATE,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
39,2015-01-01,4,B6,304,N607JB,SJU,JFK,155,153.0,-2.0,...,501.0,11.0,0,0,,,,,,
40,2015-01-01,4,NK,451,N633NK,PBG,FLL,155,139.0,-16.0,...,450.0,-33.0,0,0,,,,,,
44,2015-01-01,4,NK,647,N630NK,IAG,FLL,200,155.0,-5.0,...,453.0,-11.0,0,0,,,,,,
95,2015-01-01,4,B6,2001,N358JB,BUF,JFK,535,530.0,-5.0,...,648.0,-15.0,0,0,,,,,,
96,2015-01-01,4,B6,2807,N190JB,PWM,JFK,535,528.0,-7.0,...,635.0,-25.0,0,0,,,,,,
97,2015-01-01,4,B6,2023,N324JB,JFK,SJU,535,618.0,43.0,...,1039.0,19.0,0,0,,0.0,0.0,19.0,0.0,0.0
120,2015-01-01,4,AA,2299,N3LLAA,JFK,MIA,545,640.0,55.0,...,959.0,69.0,0,0,,14.0,0.0,55.0,0.0,0.0
122,2015-01-01,4,B6,917,N606JB,BOS,JFK,545,543.0,-2.0,...,649.0,-13.0,0,0,,,,,,
123,2015-01-01,4,B6,939,N794JB,JFK,BQN,545,545.0,0.0,...,1007.0,-19.0,0,0,,,,,,
133,2015-01-01,4,B6,515,N337JB,BOS,BUF,549,540.0,-9.0,...,719.0,-6.0,0,0,,,,,,


In [9]:
#create connection to engine
engine = create_engine(f'postgresql://postgres:{sqlpass}@localhost:5432/airport_weather_delays')
connection = engine.connect()

In [10]:
#check for tables
engine.table_names()

['Flights', 'Airports', 'historical_weather']

In [21]:
#load flight data into database
NY_flights.to_sql(name='Flights', con=engine, if_exists='append', index=False)


In [22]:
#load airport data into database
airport_cleaned.to_sql(name='Airports', con=engine, if_exists='append', index=False)

In [None]:
#close the database connection
connection.close()