In [1]:
import pandas as pd
import streamlit as st
import plotly.express as px


# Read Combined_Flights_2022.csv (path is relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer="Combined_Flights_2022.csv")
# Full state / territory name -> two-letter postal abbreviation.
# Insertion order is kept as-is: the 50 states alphabetically, then DC and the territories.
state_codes = {
    # A - D
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    # F - L
    "Florida": "FL", "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID",
    "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
    "Kentucky": "KY", "Louisiana": "LA",
    # M
    "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI",
    "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO", "Montana": "MT",
    # N
    "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ",
    "New Mexico": "NM", "New York": "NY", "North Carolina": "NC", "North Dakota": "ND",
    # O - T
    "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA",
    "Rhode Island": "RI", "South Carolina": "SC", "South Dakota": "SD",
    "Tennessee": "TN", "Texas": "TX",
    # U - W
    "Utah": "UT", "Vermont": "VT", "Virginia": "VA", "Washington": "WA",
    "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
    # Federal district and territories
    "District of Columbia": "DC", "Puerto Rico": "PR", "U.S. Virgin Islands": "VI",
    "Guam": "GU", "American Samoa": "AS", "Northern Mariana Islands": "MP",
}

# A flight counts as disrupted when it departed late, was diverted, or was cancelled
is_disrupted = (data['DepDelayMinutes'] > 0) | (data['Diverted'] == 1) | (data['Cancelled'] == 1)
delayed_flights = data[is_disrupted]

# Per destination state: number of disrupted flights and number of all flights
grouped_data = delayed_flights.groupby('DestStateName').size().reset_index(name='delayed_count')
total_flights = data.groupby('DestStateName').size().reset_index(name='total_count')

# Join onto the totals (how='right') so a state with no disrupted flights keeps its row
# with a count of 0; the default inner join would drop such a state altogether.
state_data = pd.merge(grouped_data, total_flights, on='DestStateName', how='right')
state_data['delayed_count'] = state_data['delayed_count'].fillna(0).astype(int)
state_data['delay_ratio'] = state_data['delayed_count'] / state_data['total_count']

# Keep only the names present in state_codes. The explicit .copy() makes the result an
# independent frame, so adding a column below does not assign onto a filtered slice
# (which triggers pandas' SettingWithCopyWarning).
state_data = state_data[state_data['DestStateName'].isin(list(state_codes))].copy()

# Two-letter code for each remaining state name (every name is a key after the filter above)
state_data['state_code'] = state_data['DestStateName'].map(state_codes)

# Plotly choropleth over state_data: regions are looked up by the 'state_code' column
# and shaded by the 'delay_ratio' column.
fig = px.choropleth(
    data_frame=state_data,
    # which region each row refers to
    locationmode='USA-states',
    locations='state_code',
    scope='usa',
    # how each region is coloured
    color='delay_ratio',
    color_continuous_scale="Viridis_r",
    # text shown on the figure
    title='Flight Delay Ratio by Destination State',
    labels={'delay_ratio': 'Delay Ratio'},
)

# User inputs
st.title("Flight Delay Predictor")
# Pass a concrete list rather than a dict_keys view so the options are an indexable sequence
selected_state = st.selectbox("Select State", list(state_codes.keys()))
selected_airport = st.selectbox("Select Airport", data["Dest"].unique())
selected_airline = st.selectbox("Select Airline", data["Airline"].unique())


# Start button
if st.button("Start"):
    # Flights matching all three selections
    user_filtered_data = data[(data["Airline"] == selected_airline) & (data["Dest"] == selected_airport) & (data["DestStateName"] == selected_state)]

    # Echo the selection with a larger font size
    st.markdown(f"<h3>Airline: {selected_airline}, Airport: {selected_airport}, State: {selected_state}</h3>", unsafe_allow_html=True)

    if len(user_filtered_data) == 0:
        # The three selectors are independent, so a combination can match no flights at all.
        # Say so explicitly instead of printing "0.00%", which would read as "never delayed".
        delay_probability = None
        st.warning("No flights in the dataset match this airline, airport and state combination.")
    else:
        # Share of the matching flights that departed late, were diverted, or were cancelled
        delayed_user_filtered_data = user_filtered_data[(user_filtered_data['DepDelayMinutes'] > 0) | (user_filtered_data['Diverted'] == 1) | (user_filtered_data['Cancelled'] == 1)]
        delay_probability = len(delayed_user_filtered_data) / len(user_filtered_data)
        st.markdown(f"<h3>Probability of delay: {delay_probability * 100:.2f}%</h3>", unsafe_allow_html=True)


# Display the interactive choropleth map
st.plotly_chart(fig)


2023-04-19 18:08:27.004 
  command:

    streamlit run C:\Users\ghe53\anaconda3\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [2]:
import pandas as pd
# Read Combined_Flights_2022.csv (path is relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer="Combined_Flights_2022.csv")
# 'df' is expected to hold the flight delay dataset.
# A departure delay must be strictly greater than this many minutes to count.
delay_threshold = 15

def is_delayed(row):
    """Return a truthy value when a single flight record counts as delayed.

    ``row`` is indexed by column name and must provide 'DepDelayMinutes',
    'Diverted' and 'Cancelled'. The checks run in that order and stop at the
    first one that holds: departure delay above the module-level
    ``delay_threshold`` (read at call time), diverted == 1, cancelled == 1.
    """
    departed_late = row['DepDelayMinutes'] > delay_threshold
    if departed_late:
        return departed_late
    was_diverted = row['Diverted'] == 1
    if was_diverted:
        return was_diverted
    return row['Cancelled'] == 1

# Flag delayed flights with whole-column comparisons. This is the same rule is_delayed
# encodes (late departure, diverted, or cancelled), but evaluated once per column
# instead of calling a Python function once per row through df.apply(..., axis=1).
df['Delayed'] = (
    (df['DepDelayMinutes'] > delay_threshold)
    | (df['Diverted'] == 1)
    | (df['Cancelled'] == 1)
)

# Group the delayed flights by origin and destination airports, and count them per route
delayed_routes = df[df['Delayed']].groupby(['Origin', 'Dest']).size().reset_index(name='NumDelays')

if delayed_routes.empty:
    # Nothing was flagged, so there is no "worst" route to report (and .iloc[0] would raise)
    most_delayed_route = None
    print("No delayed flights found.")
else:
    # idxmax finds the first row holding the maximum without sorting the whole table
    most_delayed_route = delayed_routes.loc[delayed_routes['NumDelays'].idxmax()]

    # Print the result
    print(f"The route with the most delays is {most_delayed_route['Origin']} to {most_delayed_route['Dest']} with {most_delayed_route['NumDelays']} delays.")


The route with the most delays is ORD to LGA with 1962 delays.


In [3]:
import pandas as pd

# 'df' is expected to hold the flight delay dataset.
# NOTE(review): delay_threshold and is_delayed are also defined, with the same rule,
# in the preceding cell; this cell re-binds both names.
# A departure delay must be strictly greater than this many minutes to count.
delay_threshold = 15

def is_delayed(row):
    """Return a truthy value when a single flight record counts as delayed.

    ``row`` is indexed by column name and must provide 'DepDelayMinutes',
    'Diverted' and 'Cancelled'. The checks run in that order and stop at the
    first one that holds: departure delay above the module-level
    ``delay_threshold`` (read at call time), diverted == 1, cancelled == 1.
    """
    departed_late = row['DepDelayMinutes'] > delay_threshold
    if departed_late:
        return departed_late
    was_diverted = row['Diverted'] == 1
    if was_diverted:
        return was_diverted
    return row['Cancelled'] == 1

# Flag delayed flights with whole-column comparisons. This is the same rule is_delayed
# encodes (late departure, diverted, or cancelled), but evaluated once per column
# instead of calling a Python function once per row through df.apply(..., axis=1).
df['Delayed'] = (
    (df['DepDelayMinutes'] > delay_threshold)
    | (df['Diverted'] == 1)
    | (df['Cancelled'] == 1)
)

# Per route (origin, destination): number of delayed flights and number of all flights
route_summary = (
    df.groupby(['Origin', 'Dest'])['Delayed']
    .agg(NumDelays='sum', TotalFlights='count')
    .reset_index()
)

# Calculate the delay probability for each route
route_summary['DelayProbability'] = route_summary['NumDelays'] / route_summary['TotalFlights']

# Minimum number of flights a route needs to be ranked. 1 keeps every route.
# NOTE(review): a route with a single delayed flight scores 100%; raise this value
# to require more evidence before a route can be reported as the worst.
min_route_flights = 1
ranked_routes = route_summary[route_summary['TotalFlights'] >= min_route_flights]

if ranked_routes.empty:
    # No route qualifies, so there is nothing to report (and .iloc[0] would raise)
    worst_delay_route = None
    print("No routes available to rank.")
else:
    # Several routes can share the top probability; break the tie by flight count so
    # the reported route is deterministic and is the best-supported of the tied ones.
    worst_delay_route = ranked_routes.sort_values(
        ['DelayProbability', 'TotalFlights'], ascending=[False, False]
    ).iloc[0]

    # Print the result
    print(f"The route with the highest delay probability is {worst_delay_route['Origin']} to {worst_delay_route['Dest']} with a delay probability of {worst_delay_route['DelayProbability']:.2%}.")

The route with the highest delay probability is HOU to RNO with a delay probability of 100.00%.
