In [3]:
import pandas as pd

In [4]:
# Relative path to the airline delay-cause CSV, kept in a variable so the
# read step can refer to it by name
data = "Resources/Airline_Delay_Cause.csv"

In [5]:
# Load the CSV at `data` into a pandas DataFrame. The file is decoded as
# ISO-8859-1, as requested by the explicit `encoding` argument.

data_df = pd.read_csv(data, encoding="ISO-8859-1")

In [6]:
# Preview the first rows of the pandas DataFrame (head() shows 5 rows by default)

data_df.head()

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,...,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2022,5,9E,Endeavor Air Inc.,ABE,"Allentown/Bethlehem/Easton, PA: Lehigh Valley ...",136.0,7.0,5.95,0.0,...,0.0,1.0,0.0,0.0,255.0,222.0,0.0,4.0,0.0,29.0
1,2022,5,9E,Endeavor Air Inc.,ABY,"Albany, GA: Southwest Georgia Regional",91.0,16.0,7.38,0.0,...,0.0,6.09,0.0,0.0,884.0,351.0,0.0,81.0,0.0,452.0
2,2022,5,9E,Endeavor Air Inc.,ACK,"Nantucket, MA: Nantucket Memorial",19.0,2.0,0.13,0.0,...,0.0,0.88,1.0,0.0,138.0,4.0,0.0,106.0,0.0,28.0
3,2022,5,9E,Endeavor Air Inc.,AEX,"Alexandria, LA: Alexandria International",88.0,14.0,7.26,0.76,...,0.0,1.64,0.0,0.0,947.0,585.0,35.0,125.0,0.0,202.0
4,2022,5,9E,Endeavor Air Inc.,AGS,"Augusta, GA: Augusta Regional at Bush Field",181.0,19.0,13.84,0.0,...,0.0,2.09,0.0,0.0,808.0,662.0,0.0,87.0,0.0,59.0


In [7]:
# Split the combined airport_name column ("City, ST: Airport Name") so that the
# city lives in its own column. The state parsed out of the remainder in the next
# step is used for the params in the API call, so the pieces must not carry stray
# whitespace.
#
# NOTE: this cell consumes the raw airport_name column. Re-load the CSV before
# re-running it; otherwise it would operate on already-split data.

# Split once on the first comma: column 0 is the city, column 1 is "ST: Airport Name".
new = data_df["airport_name"].str.split(",", n=1, expand=True)

# Build the updated frame in one chain instead of mutating with inplace=True:
#   - city:   text before the first comma
#   - split1: temporary column holding the text after the first comma. The raw
#             remainder begins with the space that followed the comma, so it is
#             stripped here to keep that space out of the state value.
#   - the original combined airport_name column is dropped.
data_df = (
    data_df
    .assign(city=new[0].str.strip(), split1=new[1].str.strip())
    .drop(columns=["airport_name"])
)
 


In [8]:
# Display the first row of the updated dataframe, with the city name in its own column.
data_df.head(1)

Unnamed: 0,year,month,carrier,carrier_name,airport,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,...,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,city,split1
0,2022,5,9E,Endeavor Air Inc.,ABE,136.0,7.0,5.95,0.0,0.05,...,0.0,0.0,255.0,222.0,0.0,4.0,0.0,29.0,Allentown/Bethlehem/Easton,PA: Lehigh Valley International


In [9]:
# Using the same split() approach, separate the state from the airport name.
# split1 holds "ST: Airport Name"; split once on the first colon so that column 0
# is the state and column 1 is the airport name.
new1 = data_df["split1"].str.split(":", n=1, expand=True)

# Build the updated frame in one chain instead of mutating with inplace=True:
#   - state:        text before the colon
#   - airport_name: text after the colon (the airport name only)
#   - both are stripped, because the raw pieces keep the space that follows the
#     delimiter and that space would otherwise leak into the exported values
#   - the temporary split1 column is dropped once it has been parsed.
data_df = (
    data_df
    .assign(state=new1[0].str.strip(), airport_name=new1[1].str.strip())
    .drop(columns=["split1"])
)

In [10]:
# Confirm that the city, state, and airport name that were packed into the old
# airport_name column are now in their own columns (the far-right columns).
data_df.head(1)

Unnamed: 0,year,month,carrier,carrier_name,airport,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,...,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,city,state,airport_name
0,2022,5,9E,Endeavor Air Inc.,ABE,136.0,7.0,5.95,0.0,0.05,...,0.0,255.0,222.0,0.0,4.0,0.0,29.0,Allentown/Bethlehem/Easton,PA,Lehigh Valley International


In [24]:
# Build the airport lookup table used for the API calls:
#   - keep only the airport code, airport name, state, and city columns
#   - collapse rows that repeat the same (airport_name, state, city) combination,
#     keeping the first occurrence
#   - renumber the index from 0 so it no longer carries row labels from data_df
#   - rename 'airport' to 'airport_code' to better describe the data
airport_df = (
    data_df[['airport', 'airport_name', 'state', 'city']]
    .drop_duplicates(subset=["airport_name", "state", "city"])
    .reset_index(drop=True)
    .rename(columns={'airport': 'airport_code'})
)
airport_df

Unnamed: 0,airport_code,airport_name,state,city
0,ABE,Lehigh Valley International,PA,Allentown/Bethlehem/Easton
1,ABY,Southwest Georgia Regional,GA,Albany
2,ACK,Nantucket Memorial,MA,Nantucket
3,AEX,Alexandria International,LA,Alexandria
4,AGS,Augusta Regional at Bush Field,GA,Augusta
...,...,...,...,...
415,MKK,Molokai,HI,Hoolehua
416,ILE,Skylark Field,TX,Killeen
417,SKA,Fairchild AFB,WA,Spokane
418,CBM,Columbus AFB,MS,Columbus


In [25]:
# Export the airport table to a CSV file. No `index` argument is passed, so pandas'
# default applies and the row index is written as the first column.
airport_df.to_csv('Resources/airports.csv')

In [26]:
# Export the airport table to a JSON file. No `orient` argument is passed, so
# pandas' default layout for a DataFrame is used.
airport_df.to_json('Resources/airports.json')

In [29]:
# Build the table of monthly weather-delay entries by carrier and airport:
#   - keep only the columns needed from data_df
#   - rename the code and flight-count columns to better describe the data
airline_data_df = (
    data_df
    .loc[:, ['year', 'month', 'carrier_name', 'carrier', 'airport', 'arr_flights', 'weather_delay']]
    .rename(
        columns={
            'carrier': 'carrier_code',
            'airport': 'airport_code',
            'arr_flights': 'arrival_flights',
        }
    )
)
airline_data_df

Unnamed: 0,year,month,carrier_name,carrier_code,airport_code,arrival_flights,weather_delay
0,2022,5,Endeavor Air Inc.,9E,ABE,136.0,0.0
1,2022,5,Endeavor Air Inc.,9E,ABY,91.0,0.0
2,2022,5,Endeavor Air Inc.,9E,ACK,19.0,0.0
3,2022,5,Endeavor Air Inc.,9E,AEX,88.0,35.0
4,2022,5,Endeavor Air Inc.,9E,AGS,181.0,0.0
...,...,...,...,...,...,...,...
318012,2003,6,Delta Air Lines Inc.,DL,SEA,480.0,164.0
318013,2003,6,Delta Air Lines Inc.,DL,SFO,505.0,138.0
318014,2003,6,Delta Air Lines Inc.,DL,SJC,146.0,0.0
318015,2003,6,Delta Air Lines Inc.,DL,SJU,95.0,0.0


In [30]:
# Export the airline delay table to a CSV file. No `index` argument is passed, so
# pandas' default applies and the row index is written as the first column.
airline_data_df.to_csv('Resources/airline_data.csv')

In [31]:
# Export the airline delay table to a JSON file. No `orient` argument is passed, so
# pandas' default layout for a DataFrame is used.
airline_data_df.to_json('Resources/airline_data.json')