In [None]:
import pandas as pd

# Read the flight records from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='flights.csv')

# Preview the top of the table (head defaults to five rows)
df.head(n=5)


In [None]:
# Count the missing entries in every column of the flight table
null_values = df.isna().sum()

# Render the per-column counts as the cell output
null_values


In [None]:
# Report which columns actually contain missing values.
#
# `null_values` holds the per-column counts produced by `df.isnull().sum()`.
# Those counts are never NaN, so calling `fillna(0)` on them is a no-op and
# leaves the missing entries in `df` untouched. The missing DepDel15 flags are
# replaced with 0 in the next cell; this cell only narrows the report down to
# the columns that need attention.
null_values[null_values > 0]

In [None]:
# Show the departure delay of every row whose DepDel15 flag is missing, then
# default the missing flags to 0.
# NOTE(review): 0 presumably means "not delayed by 15+ minutes" -- confirm
# against the dataset description.

# Capture the DepDelay values *before* filling, so they can still be rendered
# as this cell's output (only the last expression of a cell is displayed).
missing_dep_del15 = df['DepDel15'].isnull()
dep_delay_where_flag_missing = df.loc[missing_dep_del15, 'DepDelay']

# Update DepDel15 to 0 if null. Assign the filled column back instead of
# calling fillna(inplace=True) on the selected column: the in-place form works
# on an intermediate object, emits a FutureWarning on recent pandas and does
# not modify df at all once Copy-on-Write is enabled.
df['DepDel15'] = df['DepDel15'].fillna(0)

# Departure delays of the rows whose flag was missing
dep_delay_where_flag_missing

In [None]:
# Drop rows whose departure or arrival delay is an outlier (|z-score| > 3).
# Note: df is overwritten, so re-running this cell recomputes the statistics
# on the already-trimmed data and may remove further rows.

# Standardise both delay columns: subtract the column mean, divide by the column std
delay_columns = df[['DepDelay', 'ArrDelay']]
z_scores = delay_columns.sub(delay_columns.mean()).div(delay_columns.std())

# Magnitude of each z-score, regardless of sign
abs_z_scores = z_scores.abs()

# Flag a row when either delay lies more than 3 standard deviations from its mean
outliers = abs_z_scores.gt(3).any(axis=1)

# Keep only the rows that were not flagged
df = df.loc[~outliers]


In [None]:
# Preview the first 20 rows of the cleaned table
df.iloc[:20]

In [None]:
# Bar chart of the average departure delay for each airline
import matplotlib.pyplot as plt
import seaborn as sns

# Mean DepDelay per Carrier, ordered from the smallest average to the largest
average_delay = df.groupby('Carrier')['DepDelay'].mean().sort_values(ascending=True)

# Draw the bars on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=average_delay.index, y=average_delay.values, palette='viridis', ax=ax)
ax.set_title('Average Departure Delay by Airline')
ax.set_ylabel('Average Delay (minutes)')
ax.set_xlabel('Airline')

# Tilt the carrier labels so they do not overlap
plt.xticks(rotation=45)
plt.show()

In [None]:
# Bar chart of how many flights per airline arrived more than 15 minutes late

# Keep the rows flagged ArrDel15 == 1, count them per Carrier and order the
# counts from the fewest late arrivals to the most
delayed_flights = (
    df.loc[df['ArrDel15'] == 1]
    .groupby('Carrier')
    .size()
    .sort_values(ascending=True)
)

# Draw the bars on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=delayed_flights.index, y=delayed_flights.values, palette='viridis', ax=ax)
ax.set_title('Number of Flights with Arrival Delay > 15 minutes by Airline')
ax.set_ylabel('Number of Flights')
ax.set_xlabel('Airline')

# Tilt the carrier labels so they do not overlap
plt.xticks(rotation=45)
plt.show()

In [None]:
# Overlay the histograms (with KDE curves) of DepDelay and ArrDelay
fig, ax = plt.subplots(figsize=(10, 6))

# One histogram per delay column, each in its own colour and labelled with the column name
for column, colour in (('DepDelay', 'blue'), ('ArrDelay', 'green')):
    sns.histplot(data=df, x=column, kde=True, color=colour, label=column, ax=ax)

# Shared title and axis labels (these replace the per-column labels seaborn sets)
ax.set_title('Distribution of DepDelay and ArrDelay')
ax.set_xlabel('Delay (minutes)')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()


In [None]:
# Bar chart of the average arrival delay for each airline

# Mean ArrDelay per Carrier, ordered from the smallest average to the largest
average_arrival_delay = (
    df.groupby('Carrier')['ArrDelay']
    .mean()
    .sort_values(ascending=True)
)

# Draw the bars on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=average_arrival_delay.index,
    y=average_arrival_delay.values,
    palette='viridis',
    ax=ax,
)
ax.set_title('Average Arrival Delay by Airline')
ax.set_ylabel('Average Delay (minutes)')
ax.set_xlabel('Airline')

# Tilt the carrier labels so they do not overlap
plt.xticks(rotation=45)
plt.show()


In [None]:
# Which route has the highest average arrival delay?

# Mean ArrDelay per (OriginCity, DestCity) pair, ordered from the largest
# average delay to the smallest so that the answer is the first row.
average_arrival_delay_route = (
    df.groupby(['OriginCity', 'DestCity'])['ArrDelay']
    .mean()
    .sort_values(ascending=False)
)

# Show the ten routes with the highest average arrival delay; displaying the
# unsorted series would not reveal which route is worst.
average_arrival_delay_route.head(10)


In [None]:
# Which origin-city / destination-city pair has the most late arrivals?

# Restrict to flights flagged as arriving late (ArrDel15 == 1)
late_arrivals = df[df['ArrDel15'] == 1]

# Count the late flights per (OriginCity, DestCity) pair and order the counts
# ascending (the sort_values default), so the busiest offenders end up last
delayed_flights_route = (
    late_arrivals
    .groupby(['OriginCity', 'DestCity'])
    .size()
    .sort_values()
)

# The last ten entries are the ten routes with the most delayed flights
delayed_flights_route.iloc[-10:]

In [None]:
# Which departure airport has the highest average departure delay?

# Group by OriginAirportName rather than OriginCity: the question is about
# airports, and a single city label may cover more than one airport, which
# would blend their delays into one figure.
# Sort from the largest average delay to the smallest so the answer is the
# first row.
average_departure_delay_airport = (
    df.groupby('OriginAirportName')['DepDelay']
    .mean()
    .sort_values(ascending=False)
)

# Show the ten airports with the highest average departure delay
average_departure_delay_airport.head(10)

In [None]:
# What are the arrival delays for the different days of the week?

# Mean ArrDelay for each DayOfWeek value
average_arrival_delay_day = df['ArrDelay'].groupby(df['DayOfWeek']).mean()

# Draw the bars on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=average_arrival_delay_day.index,
    y=average_arrival_delay_day.values,
    palette='viridis',
    ax=ax,
)
ax.set_title('Average Arrival Delay by Day of the Week')
ax.set_ylabel('Average Delay (minutes)')
ax.set_xlabel('Day of the Week')

# Tilt the tick labels to match the other charts
plt.xticks(rotation=45)
plt.show()

In [None]:
# Scatter plot of departure delay (x) against arrival delay (y)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x=df['DepDelay'], y=df['ArrDelay'])

# Title and axis labels
ax.set_title('Relationship between Late Departures and Arrival Delays')
ax.set_xlabel('Departure Delay (minutes)')
ax.set_ylabel('Arrival Delay (minutes)')
plt.show()


In [None]:
# Predict whether a flight arrives late (ArrDel15) from the day of the week
# and the destination airport, using logistic regression:
#   1. pick the feature columns and the target
#   2. split the data into training and testing sets
#   3. fit the model on the training set
#   4. report the accuracy on the held-out test set
# NOTE(review): DestAirportID is an identifier but is fed to the model as a
# plain number; an encoded representation may be more appropriate. It is left
# unchanged here because the prediction cells and the exported model rely on
# this two-column input.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Features and target
feature_columns = ['DayOfWeek', 'DestAirportID']
X = df[feature_columns]
y = df['ArrDel15']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Build and fit the classifier in one step (fit returns the estimator itself)
model = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = model.predict(X_test)

# Fraction of held-out rows whose label was predicted correctly
accuracy_score(y_test, y_pred)


In [None]:
# Confusion matrix of true vs. predicted labels on the test set
from sklearn.metrics import confusion_matrix

# Rows correspond to the true labels, columns to the predicted labels
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_matrix

In [None]:
# Predicted class probabilities for a flight to Chicago on a Monday
# (DayOfWeek 1; DestAirportID 13930 is taken to be the Chicago airport -- confirm)

# Single-row frame with the same columns the model was trained on
input_values = pd.DataFrame([[1, 13930]], columns=['DayOfWeek', 'DestAirportID'])

# predict_proba returns one row per input, one column per class in model.classes_
prediction = model.predict_proba(input_values)

# Probabilities for the single input row
prediction[0]

In [None]:
# Predicted class probabilities for a flight to Las Vegas on a Friday
# (DayOfWeek 5; DestAirportID 12889 is taken to be the Las Vegas airport -- confirm)

# Single-row frame with the same columns the model was trained on
input_values = pd.DataFrame([[5, 12889]], columns=['DayOfWeek', 'DestAirportID'])

# predict_proba returns one row per input, one column per class in model.classes_
prediction = model.predict_proba(input_values)

# Probabilities for the single input row
prediction[0]

In [None]:
# Predicted class probabilities for a flight to Los Angeles on a Wednesday
# (DayOfWeek 3; DestAirportID 12892 is taken to be the Los Angeles airport -- confirm)

# Single-row frame with the same columns the model was trained on
input_values = pd.DataFrame([[3, 12892]], columns=['DayOfWeek', 'DestAirportID'])

# predict_proba returns one row per input, one column per class in model.classes_
prediction = model.predict_proba(input_values)

# Probabilities for the single input row
prediction[0]

In [None]:
# Predicted class probabilities for a flight to San Francisco on a Thursday
# (DayOfWeek 4; DestAirportID 14771 is taken to be the San Francisco airport -- confirm)

# Single-row frame with the same columns the model was trained on
input_values = pd.DataFrame([[4, 14771]], columns=['DayOfWeek', 'DestAirportID'])

# predict_proba returns one row per input, one column per class in model.classes_
prediction = model.predict_proba(input_values)

# Probabilities for the single input row
prediction[0]

In [None]:
# Build the lookup of distinct (OriginAirportID, OriginAirportName) pairs and save it as CSV
airport_columns = ['OriginAirportID', 'OriginAirportName']

# Keep the first row seen for each pair, then narrow to the two lookup columns
origin_airport = df.drop_duplicates(subset=airport_columns)[airport_columns]

# Write without the DataFrame index so the file holds only the two columns
origin_airport.to_csv('origin_airport.csv', index=False)


In [None]:
# Export the trained model so it can be imported later into Flask
import pickle

# Write through a context manager so the file handle is flushed and closed
# as soon as the dump finishes, instead of being left open until garbage
# collection. The 'server' directory must already exist relative to the
# notebook's working directory.
# NOTE: pickle files execute code when loaded; only load this file from a
# trusted location.
with open('server/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)