In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Read the on-time CSV into a DataFrame
data = pd.read_csv("../data/Jan_2020_ontime.csv")

# Quick look at the frame: dimensions, then the leading rows
print("Dataset shape:", data.shape)
print("\nFirst 5 rows:")
print(data.head())

# Per-column count of null entries
missing_per_column = data.isnull().sum()
print("\nMissing values:")
print(missing_per_column)

# Class counts of the target column, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='DEP_DEL15', data=data, ax=ax)
ax.set_title('Flight Delay Distribution')
ax.set_xlabel('Delayed 15+ minutes (1=Yes, 0=No)')
ax.set_ylabel('Count')
plt.show()

In [11]:
# Data preprocessing - drop rows with missing values in key columns.
# .copy() makes the filtered frame independent of the one it was derived
# from, so adding DEP_HOUR below is an unambiguous write and cannot raise
# pandas' SettingWithCopyWarning.
data = data.dropna(
    subset=['DEP_TIME', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'DEP_DEL15']
).copy()

# Select a small set of features that would be known before flight.
# NOTE(review): DEP_TIME looks like the *actual* departure time rather than
# the scheduled one - confirm against the data dictionary. If so, it is not
# known before the flight and may leak information about the delay target.
features = [
    'DAY_OF_WEEK',       # Day of the week (1=Monday, 7=Sunday)
    'OP_UNIQUE_CARRIER', # Airline carrier code
    'DEP_TIME'           # Departure time
]

# Create a simple derived feature - departure hour.
# Assumes DEP_TIME is encoded as HHMM (e.g. 1435 -> 14). Vectorized floor
# division replaces the per-row lambda; "% 24" folds an HHMM value of 2400
# (midnight) onto hour 0 instead of producing an out-of-range hour 24.
data['DEP_HOUR'] = (data['DEP_TIME'] // 100).astype(int) % 24
features.append('DEP_HOUR')

# Prepare features and target
X = data[features]
y = data['DEP_DEL15']

# Split data into training and testing sets.
# stratify=y keeps the delayed / on-time proportions the same in both
# splits, so the test metrics are not skewed by an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define preprocessing for numerical and categorical features
numerical_features = ['DEP_TIME', 'DEP_HOUR', 'DAY_OF_WEEK']
categorical_features = ['OP_UNIQUE_CARRIER']

In [None]:
# Create preprocessing steps: standardize the numeric columns and one-hot
# encode the categorical ones. handle_unknown='ignore' lets the encoder
# accept categories at predict time that were absent from the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Create a pipeline with preprocessing and model, so the transformers are
# fitted on the training data only and re-applied unchanged at predict time.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train the model
model_pipeline.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = model_pipeline.predict(X_test)

# Compute the confusion matrix once and reuse it for both the printed
# report and the heatmap below.
cm = confusion_matrix(y_test, y_pred)

# Evaluate the model
print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(cm)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Simple visualization of results, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()