# Preprocess data


In [None]:
!pip install category_encoders

from google.colab import drive
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
import os
import joblib

In [None]:
# Mount Google Drive into the Colab filesystem so the dataset can be read.
drive.mount('/content/drive')

# Path of the preprocessed flights CSV on the mounted Drive.
file_path ='/content/drive/MyDrive/Flights/df_preprocessed.csv'

# Load the whole CSV into memory; every later cell works on this `df`.
df = pd.read_csv(file_path)

# Preview the first rows; kept as the last expression of the cell so the
# notebook displays it.
df.head()

In [None]:
# Show the column labels of the loaded DataFrame (displayed as the cell output).
df.columns

In [None]:
# Print a summary of the DataFrame (column dtypes and non-null counts) to
# check the data before feature engineering.
df.info()


In [None]:
def engineer_flight_features(df, near_holiday_window=3):
    """
    Standardized Feature Engineering.

    Adds the derived feature columns to ``df`` IN PLACE (the caller's
    DataFrame is modified), drops the raw calendar/city columns once they
    have been consumed, and returns the transformed DataFrame AND the
    holiday set.

    Columns read: crsdeptime, origin, dest, flightdate, year, month,
    dayofweek, distance.
    Columns added: dep_hour, airport_route, is_weekend, week_of_year,
    is_holiday, is_near_holiday, month_sin, month_cos, dayofweek_sin,
    dayofweek_cos, hour_sin, hour_cos, distance_category.

    Args:
        df: Flights DataFrame containing the columns listed above.
        near_holiday_window: A flight gets ``is_near_holiday = 1`` when its
            date is within this many days (inclusive) of a holiday.
            Defaults to 3.

    Returns:
        A ``(df, all_holidays)`` tuple. ``all_holidays`` is a set of
        ``datetime.date`` objects covering every year found in
        ``df['year']`` plus the year before and the year after, so that
        flights at the very start or end of the data are still matched
        against the neighbouring year's holidays (e.g. Dec 30 is near the
        following Jan 1).
    """
    # 1. Temporal Extraction
    # crsdeptime is handled as an HHMM integer: integer division keeps the hour.
    df['dep_hour'] = df['crsdeptime'].astype(int) // 100

    # 2. Route Features (High Cardinality)
    df['airport_route'] = df['origin'].astype(str) + ' -> ' + df['dest'].astype(str)

    # 3. Calendar Logic
    df['flightdate'] = pd.to_datetime(df['flightdate'])
    df['is_weekend'] = df['flightdate'].dt.dayofweek.isin([5, 6]).astype(int)
    df['week_of_year'] = df['flightdate'].dt.isocalendar().week.astype(int)

    # 4. Holiday Logic
    def get_us_holidays(years):
        """Return the holiday dates (as datetime.date) for the given years."""
        holidays = []
        for year in years:
            # Fixed-date holidays: Jan 1, Jul 4, Nov 11, Dec 25.
            holidays.extend([pd.Timestamp(year, m, d) for m, d in [(1, 1), (7, 4), (11, 11), (12, 25)]])
            # Floating holidays, located by weekday-anchored date ranges:
            # 3rd Monday of January.
            holidays.append(pd.date_range(f'{year}-01-01', periods=3, freq='W-MON')[-1])
            # 3rd Monday of February.
            holidays.append(pd.date_range(f'{year}-02-01', periods=3, freq='W-MON')[-1])
            # Last Monday of May.
            holidays.append(pd.date_range(f'{year}-05-01', f'{year}-05-31', freq='W-MON')[-1])
            # 1st Monday of September.
            holidays.append(pd.date_range(f'{year}-09-01', periods=1, freq='W-MON')[0])
            # 2nd Monday of October.
            holidays.append(pd.date_range(f'{year}-10-01', periods=2, freq='W-MON')[-1])
            # 4th Thursday of November.
            holidays.append(pd.date_range(f'{year}-11-01', periods=4, freq='W-THU')[-1])
        return {h.date() for h in holidays}

    # Pad the years found in the data by one on each side: without this a
    # late-December flight in the last year is never compared against the
    # following New Year's Day and is wrongly flagged as "not near a holiday".
    data_years = {int(y) for y in df['year'].unique()}
    padded_years = sorted(
        data_years | {y - 1 for y in data_years} | {y + 1 for y in data_years}
    )
    all_holidays = get_us_holidays(padded_years)

    df_dates = df['flightdate'].dt.date
    df['is_holiday'] = df_dates.isin(all_holidays).astype(int)

    # "Within N days of some holiday" is the same as "member of the set of
    # holidays shifted by -N..+N days". Building that small set once and using
    # a vectorised isin avoids scanning every holiday for every row.
    near_holiday_dates = {
        holiday + timedelta(days=offset)
        for holiday in all_holidays
        for offset in range(-near_holiday_window, near_holiday_window + 1)
    }
    df['is_near_holiday'] = df_dates.isin(near_holiday_dates).astype(int)

    # 5. Cyclical Encoding
    # sin/cos pairs place the last value of each cycle next to the first one.
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)
    df['hour_sin'] = np.sin(2 * np.pi * df['dep_hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['dep_hour'] / 24)

    # 6. Distance Categories
    # NOTE: bins are right-closed, i.e. (0, 500], (500, 1500], ...; a distance
    # of 0 or above 10000 falls outside every bin and becomes NaN.
    df['distance_category'] = pd.cut(df['distance'],
                                     bins=[0, 500, 1500, 3000, 10000],
                                     labels=['short', 'medium', 'long', 'ultra_long'])

    # 7. Strategic Pruning
    # Raw columns that are now represented by engineered features are dropped;
    # columns listed in keep_for_encoding are never dropped here.
    keep_for_encoding = ['tail_number', 'origin', 'dest', 'airport_route', 'deptimeblk', 'operating_airline']
    drop_list = ['flightdate', 'year', 'quarter', 'month', 'dayofmonth', 'dayofweek',
                 'origincityname', 'destcityname', 'crsdeptime']

    # Only drop what is actually present so a missing optional column is not an error.
    actual_drop = [c for c in drop_list if c in df.columns and c not in keep_for_encoding]
    df.drop(columns=actual_drop, inplace=True)

    # RETURN BOTH: The DataFrame and the Holiday Set
    return df, all_holidays

# Execute engineering and "catch" both variables: the transformed DataFrame
# replaces `df`, and `holiday_set` is kept so it can be saved as an artifact
# in a later cell.
df, holiday_set = engineer_flight_features(df)

In [None]:
# 1. Feature Definition
target = 'is_delayed'
# Categorical columns that are replaced by target-encoded values.
high_card_cols = [
    'tail_number',
    'origin',
    'dest',
    'airport_route',
    'deptimeblk',
    'operating_airline',
]
# Categorical column expanded into dummy variables.
ohe_cols = ['distance_category']
# Columns that are standardized by the scaler.
num_cols = [
    'distance',
    'week_of_year',
    'month_sin',
    'month_cos',
    'dayofweek_sin',
    'dayofweek_cos',
    'hour_sin',
    'hour_cos',
    'is_weekend',
    'is_holiday',
    'is_near_holiday',
    'dep_hour',
]

# 2. Train/Test Split
# 80/20 split, stratified on the target, with a fixed seed.
y = df[target]
X = df.drop(columns=[target])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 3. Target Encoding (ce.TargetEncoder with smoothing=5.0)
# The encoder is fitted on the training split only and then applied to the
# test split.
encoder = ce.TargetEncoder(cols=high_card_cols, smoothing=5.0)
X_train = encoder.fit_transform(X_train, y_train)
X_test = encoder.transform(X_test)

# 4. Dummy Variables & Feature Scaling
X_train = pd.get_dummies(X_train, columns=ohe_cols, drop_first=True)
# Align the test columns to the training layout; dummies absent from the test
# split are filled with 0.
X_test = pd.get_dummies(X_test, columns=ohe_cols, drop_first=True).reindex(
    columns=X_train.columns, fill_value=0
)

# Scaling statistics come from the training split only.
scaler = StandardScaler().fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

print(f"Feature transformation successful. Train shape: {X_train.shape}")

In [None]:
save_path = '/content/drive/My Drive/Flights'
# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, which removes the check-then-create race of
# os.path.exists() followed by os.makedirs().
os.makedirs(save_path, exist_ok=True)

file_name = f"{save_path}/df_final_processed.parquet"

# Persist the current state of `df`; the index is not written to the file.
df.to_parquet(file_name, index=False)

print(f"File successfully saved to: {file_name}")

In [None]:
save_path = '/content/drive/My Drive/Flights'

# Each split goes to its own parquet file named after the variable.
# The targets are Series, so they are wrapped in a DataFrame first because
# parquet output is written from DataFrames here.
splits = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train.to_frame(),
    'y_test': y_test.to_frame(),
}
for split_name, frame in splits.items():
    frame.to_parquet(f"{save_path}/{split_name}.parquet")

print("Train/Test sets saved independently. Ready for the Modeling notebook.")

In [None]:
# 1. Set Path
drive_path = '/content/drive/My Drive/Flights/'

# 2. Save FE Artifacts
# Fitted encoder, fitted scaler and the holiday dates (stored as a list),
# each paired with the file name it is dumped to.
artifacts = [
    (encoder, 'flight_risk_encodings.pkl'),
    (scaler, 'flight_risk_scaler.pkl'),
    (list(holiday_set), 'flight_holidays.pkl'),
]
for artifact, artifact_file in artifacts:
    joblib.dump(artifact, os.path.join(drive_path, artifact_file))

print(f"Feature Engineering artifacts successfully saved to: {drive_path}")