In [None]:
import pandas as pd
import zipfile
import io
import glob

# Collect every zip archive under data/. glob returns paths in an arbitrary,
# filesystem-dependent order, so sort them to keep the row order of the
# concatenated frame reproducible from run to run.
zip_files = sorted(glob.glob("data/*.zip"))

# One DataFrame per archive; they are concatenated in a single step later.
all_dfs = []

for zip_file in zip_files:
    with zipfile.ZipFile(zip_file, "r") as z:
        # Each archive is expected to contain one CSV. Match the extension
        # case-insensitively and report the offending archive by name instead
        # of failing with a bare IndexError when no CSV member exists.
        csv_members = [
            member for member in z.namelist() if member.lower().endswith(".csv")
        ]
        if not csv_members:
            raise ValueError(f"No CSV file found inside {zip_file}")
        csv_file = csv_members[0]

        # pandas parses directly from the archive member's file handle, so the
        # bytes do not have to be copied into an intermediate BytesIO buffer.
        with z.open(csv_file) as f:
            df = pd.read_csv(f)

    # Remember which archive every row was read from.
    df["source_zip"] = zip_file
    all_dfs.append(df)

# Destination airport ID whose arrivals are kept.
# NOTE(review): which airport 14747 denotes is not shown in this notebook --
# TODO document it next to this constant.
TARGET_DEST_AIRPORT_ID = 14747

# Columns removed from the combined frame after filtering.
DROPPED_FLIGHT_COLUMNS = [
    "DEST_AIRPORT_ID",
    "CANCELLATION_CODE",
    "source_zip",
    "CANCELLED",
    "DEST_AIRPORT_SEQ_ID",
    "DEST_CITY_MARKET_ID",
]

if not all_dfs:
    # pd.concat would fail here anyway; say why in terms of this notebook.
    raise ValueError("No flight data was loaded from data/*.zip; nothing to combine")

# Build the result as one chain of non-mutating steps. The original dropped
# columns in place and then assigned a column on a boolean-filtered frame,
# which is the pattern that triggers pandas' chained-assignment warning.
combined_df = (
    pd.concat(all_dfs, ignore_index=True)
    # Keep flights with a recorded arrival delay that land at the target airport.
    .loc[
        lambda flights: flights["ARR_DELAY"].notna()
        & (flights["DEST_AIRPORT_ID"] == TARGET_DEST_AIRPORT_ID)
    ]
    .drop(columns=DROPPED_FLIGHT_COLUMNS)
    # Assemble a datetime column from the year/month/day parts. The result is
    # already datetime64, so formatting it to text and parsing it back (as the
    # previous version did) is unnecessary.
    .assign(
        Date=lambda flights: pd.to_datetime(
            dict(
                year=flights["YEAR"],
                month=flights["MONTH"],
                day=flights["DAY_OF_MONTH"],
            )
        )
    )
)

print(
    f"Combined DataFrame has {len(combined_df)} rows and {len(combined_df.columns)} columns"
)

In [None]:
# Read the climate table, discard the two snow columns, and parse the "Date"
# strings (MM/DD/YYYY) into datetimes so the column can be used as a join key.
climate_df = (
    pd.read_csv("data/Climate.csv")
    .drop(columns=["Snow Depth", "New Snow"])
    .assign(Date=lambda climate: pd.to_datetime(climate["Date"], format="%m/%d/%Y"))
)

In [None]:
# Attach the climate columns to each flight by date. The left join keeps every
# flight row even when no climate record matches; the join key is discarded
# once the merge is done.
merged_df = combined_df.merge(climate_df, how="left", on="Date").drop(
    columns=["Date"]
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

arr_delay = merged_df["ARR_DELAY"]

# Histogram of arrival delays (50 bins) with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(arr_delay, bins=50, kde=True, ax=ax)
ax.set_title("Arrival Delay Distribution")
ax.set_xlabel("Arrival Delay (minutes)")
ax.set_ylabel("Number of Flights")
plt.show()

# Summary statistics for the same column.
delay_min = arr_delay.min()
delay_max = arr_delay.max()
print("Min:", delay_min)
print("Max:", delay_max)
print("Range:", delay_max - delay_min)
# mode() can return several values; only the first one is reported.
print("Mode:", arr_delay.mode()[0])
print("Average (mean):", arr_delay.mean())
print("Median:", arr_delay.median())
print("Quantiles:")
print(arr_delay.quantile([0.25, 0.5, 0.75]))

In [None]:
# Mean arrival delay for each month that actually occurs in the data.
monthly_delay = merged_df.groupby("MONTH")["ARR_DELAY"].mean()

plt.figure(figsize=(10, 6))
sns.barplot(x=monthly_delay.index, y=monthly_delay.values, palette="Blues_d")
plt.title("Average Arrival Delay by Month")
plt.xlabel("Month")
plt.ylabel("Average Arrival Delay (minutes)")
# No manual tick override: seaborn labels every bar with its own MONTH value.
# Forcing twelve ticks labelled 1..12 (as before) mislabels the bars whenever
# some months are absent from the data.
plt.show()

In [None]:
# Cast the temperature column to float once and use that same series for both
# the correlation and the scatter plot. Previously only the correlation used
# the cast values while the plot drew the raw column, so the figure could
# disagree with the printed coefficient if the column is not stored as float.
temp_avg = merged_df["Temp Avg"].astype(float)
arr_delay = merged_df["ARR_DELAY"]

correlation = arr_delay.corr(temp_avg)
print("Correlation between arrival delay and temperature:", correlation)

# Scatter plot of delay against temperature; low alpha keeps dense regions readable.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=temp_avg, y=arr_delay, alpha=0.3)
plt.title("Arrival Delay vs Temperature")
plt.xlabel("Temperature")
plt.ylabel("Arrival Delay (minutes)")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Use one float-cast temperature series for both the correlation and the
# binning; pd.cut previously received the uncast column, which is inconsistent
# with the correlation above and fails if the column is not numeric.
temp_avg = merged_df["Temp Avg"].astype(float)

corr = merged_df["ARR_DELAY"].corr(temp_avg)
print(f"Correlation coefficient between Temp Avg and ARR_DELAY: {corr:.3f}")

# Split the temperature range into 20 equal-width bins and average the delay
# within each. observed=False is spelled out so that empty bins stay in the
# result (as NaN) and the outcome does not depend on the pandas default, which
# is deprecated for categorical groupers.
merged_df["TempBin"] = pd.cut(temp_avg, bins=20)
temp_delay = merged_df.groupby("TempBin", observed=False)["ARR_DELAY"].mean()

print("\nAverage ARR_DELAY by temperature bin:")
print(temp_delay)

# Line plot of the per-bin averages.
plt.figure(figsize=(10, 6))
temp_delay.plot(kind="line", marker="o")
plt.title("Average Arrival Delay by Temperature Bin")
plt.xlabel("Temperature Bin")
plt.ylabel("Average Arrival Delay (minutes)")
plt.show()

In [None]:
# Numeric stand-in for the "T" marker in the precipitation column
# (presumably "trace amount" -- confirm against the climate data dictionary).
TRACE_PRECIPITATION = 0.005

# Swap the marker for its numeric value, then make the whole column float.
precipitation_raw = merged_df["Precipitation"]
merged_df["Precipitation"] = precipitation_raw.replace(
    {"T": TRACE_PRECIPITATION}
).astype(float)

In [None]:
precipitation = merged_df["Precipitation"]
arr_delay = merged_df["ARR_DELAY"]

# Correlation between arrival delay and precipitation.
correlation = arr_delay.corr(precipitation)
print("Correlation between arrival delay and precipitation:", correlation)

# Scatter plot of the same two columns; low alpha keeps dense regions readable.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=precipitation, y=arr_delay, alpha=0.3, ax=ax)
ax.set_title("Arrival Delay vs Precipitation")
ax.set_xlabel("Precipitation (inches)")
ax.set_ylabel("Arrival Delay (minutes)")
plt.show()

In [None]:
# Correlation between precipitation and arrival delay.
corr = merged_df["ARR_DELAY"].corr(merged_df["Precipitation"])
print(f"Correlation coefficient between Precipitation and ARR_DELAY: {corr:.3f}")

# Split the precipitation range into 20 equal-width bins and average the delay
# within each. observed=False is spelled out so that empty bins stay in the
# result (as NaN) and the outcome does not depend on the pandas default, which
# is deprecated for categorical groupers.
merged_df["PrecipBin"] = pd.cut(merged_df["Precipitation"], bins=20)
precip_delay = merged_df.groupby("PrecipBin", observed=False)["ARR_DELAY"].mean()

print("\nAverage ARR_DELAY by precipitation bin:")
print(precip_delay)

# Line plot of the per-bin averages.
plt.figure(figsize=(10, 6))
precip_delay.plot(kind="line", marker="o")
plt.title("Average Arrival Delay by Precipitation Bin")
plt.xlabel("Precipitation Bin (inches)")
plt.ylabel("Average Arrival Delay (minutes)")
plt.grid(True)
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score

# Weather columns used as predictors; the target is the arrival delay.
features = [
    "Temp Max",
    "Temp Min",
    "Temp Avg",
    "Departure",
    "HDD",
    "CDD",
    "Precipitation",
]
# Missing predictor values are replaced with 0 before fitting.
X = merged_df[features].fillna(0)
y = merged_df["ARR_DELAY"]

# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit an ordinary linear regression. The variable is named `rf`, but the
# estimator is LinearRegression, not a random forest.
rf = LinearRegression().fit(X_train, y_train)

# Score the held-out predictions.
y_pred = rf.predict(X_test)
for label, metric in (("RMSE:", root_mean_squared_error), ("R^2:", r2_score)):
    print(label, metric(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Flag a row as rainy when its precipitation value is strictly positive.
merged_df["Rain"] = merged_df["Precipitation"].gt(0)

# Delay statistics for the rainy and non-rainy groups.
delay_by_rain = merged_df.groupby("Rain")["ARR_DELAY"]
rain_group = delay_by_rain.agg(["mean", "count", "median", "std"])
print(rain_group)

# Box plot comparing the two groups.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=merged_df, x="Rain", y="ARR_DELAY", ax=ax)
plt.xticks([0, 1], ["No Rain", "Rain"])
ax.set_title("Arrival Delay: Rain vs No Rain")
ax.set_xlabel("Rain")
ax.set_ylabel("Arrival Delay (minutes)")
plt.show()

In [None]:
# Helper that turns a numeric arrival delay into a class label.
def delay_category(delay: float) -> str:
    """Map an arrival delay (minutes) to a descriptive class label.

    Args:
        delay: Arrival delay; negative values mean the flight arrived early.

    Returns:
        "will be early" for delay < 0, "no delay" for exactly 0,
        "will be a bit late" for 0 < delay <= 15, otherwise "will be late".

    Note:
        A NaN delay fails every comparison and therefore falls through to
        "will be late"; filter out missing delays before calling this.
    """
    if delay < 0:
        # Previously spelled "will early"; corrected to match the other labels.
        return "will be early"
    if delay == 0:
        return "no delay"
    if delay <= 15:
        return "will be a bit late"
    return "will be late"


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Label every flight with its delay category.
merged_df["DELAY_CLASS"] = merged_df["ARR_DELAY"].apply(delay_category)

# Encode the string labels as integer codes for the classifier. The codes are
# assigned by LabelEncoder and carry no numeric meaning (they are not ordered
# by lateness).
le = LabelEncoder()
y_class = le.fit_transform(merged_df["DELAY_CLASS"])

# Weather columns used as predictors.
features = [
    "Temp Max",
    "Temp Min",
    "Temp Avg",
    "Departure",
    "HDD",
    "CDD",
    "Precipitation",
]
X = merged_df[features]

# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_class, test_size=0.2, random_state=42
)

# Random forest with 100 trees; the seed makes the fit repeatable.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the held-out split. Passing `labels` explicitly keeps the report
# aligned with `target_names` even if a class is missing from the test split.
y_pred = clf.predict(X_test)
print(
    classification_report(
        y_test,
        y_pred,
        labels=list(range(len(le.classes_))),
        target_names=le.classes_,
    )
)
# Report accuracy rather than RMSE: the targets are arbitrary class codes, so
# a squared distance between them is not a meaningful error measure.
print("Accuracy:", accuracy_score(y_test, y_pred))