# Preprocess data


In [12]:
!pip install category_encoders

from google.colab import drive
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
import os



In [2]:
drive.mount('/content/drive')

file_path ='/content/drive/MyDrive/Flights/df_preprocessed.csv'

df = pd.read_csv(file_path)

df.head()

Mounted at /content/drive


Unnamed: 0,year,quarter,month,dayofmonth,dayofweek,flightdate,tail_number,origin,origincityname,dest,destcityname,distance,is_delayed
0,2018,2,4,6,5,2018-04-06,n13161,ewr,"newark, nj",ind,"indianapolis, in",645.0,1
1,2018,2,4,6,5,2018-04-06,n273jb,ord,"chicago, il",jfk,"new york, ny",740.0,0
2,2018,2,4,13,5,2018-04-13,n258nn,ord,"chicago, il",cle,"cleveland, oh",316.0,0
3,2018,2,4,13,5,2018-04-13,n809ua,iah,"houston, tx",dfw,"dallas/fort worth, tx",224.0,1
4,2018,2,4,19,4,2018-04-19,n866as,sfo,"san francisco, ca",ont,"ontario, ca",363.0,0


In [4]:
def engineer_flight_features(df):
    """
    Modular feature engineering for flight delay prediction.
    """
    # 1. Route Features
    # We create the route but keep codes for Target Encoding later
    df['airport_route'] = df['origin'] + ' -> ' + df['dest']

    # 2. Base Temporal Features
    df['flightdate'] = pd.to_datetime(df['flightdate'])
    df['is_weekend'] = df['flightdate'].dt.dayofweek.isin([5, 6]).astype(int)
    df['week_of_year'] = df['flightdate'].dt.isocalendar().week.astype(int)

    # 3. Holiday Logic
    def get_us_holidays(years):
        holidays = []
        for year in years:
            # Fixed Dates
            holidays.extend([pd.Timestamp(year, m, d) for m, d in [(1,1), (7,4), (11,11), (12,25)]])
            # Floating Mondays
            holidays.append(pd.date_range(f'{year}-01-01', periods=3, freq='W-MON')[-1]) # MLK
            holidays.append(pd.date_range(f'{year}-02-01', periods=3, freq='W-MON')[-1]) # Presidents
            holidays.append(pd.date_range(f'{year}-05-01', f'{year}-05-31', freq='W-MON')[-1]) # Memorial
            holidays.append(pd.date_range(f'{year}-09-01', periods=1, freq='W-MON')[0])  # Labor
            holidays.append(pd.date_range(f'{year}-10-01', periods=2, freq='W-MON')[-1]) # Columbus
            # Thanksgiving (4th Thursday)
            holidays.append(pd.date_range(f'{year}-11-01', periods=4, freq='W-THU')[-1])
        return set([h.date() for h in holidays])

    all_holidays = get_us_holidays(df['year'].unique())
    df_dates = df['flightdate'].dt.date

    df['is_holiday'] = df_dates.isin(all_holidays).astype(int)
    df['days_to_holiday'] = df_dates.apply(lambda x: min([abs((h - x).days) for h in all_holidays]))
    df['is_near_holiday'] = (df['days_to_holiday'] <= 3).astype(int)
    df['is_day_before_holiday'] = df_dates.apply(lambda x: (x + timedelta(days=1)) in all_holidays).astype(int)
    df['is_day_after_holiday'] = df_dates.apply(lambda x: (x - timedelta(days=1)) in all_holidays).astype(int)

    # 4. Seasonal Features
    df['is_summer_travel'] = df['month'].isin([6, 7, 8]).astype(int)
    df['is_winter_travel'] = df['month'].isin([12, 1]).astype(int)
    df['is_spring_break'] = df['month'].isin([3, 4]).astype(int)
    df['is_thanksgiving_week'] = ((df['month'] == 11) & (df['week_of_year'].isin([47, 48]))).astype(int)
    df['is_christmas_week'] = ((df['month'] == 12) & (df['dayofmonth'] >= 20)).astype(int)

    # 5. Cyclical Encoding
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)

    # 6. Distance & Spatial Features
    df['distance_category'] = pd.cut(df['distance'], bins=[0, 500, 1500, 3000, 10000], labels=['short', 'medium', 'long', 'ultra_long'])
    df['is_transcontinental'] = (df['distance'] > 2000).astype(int)

    # 7. Final Cleanup: Dropping raw time/string columns to prevent leakage or high cardinality
    cols_to_drop = [
        'flightdate', 'year', 'quarter', 'month', 'dayofmonth', 'dayofweek',
        'origincityname', 'destcityname', 'flight_route', 'days_to_holiday'
    ]
    df.drop(columns=[c for c in cols_to_drop if c in df.columns], inplace=True)

    return df

# Execute
df = engineer_flight_features(df)
df.head()

Unnamed: 0,tail_number,origin,dest,distance,is_delayed,airport_route,is_weekend,week_of_year,is_holiday,is_near_holiday,...,is_winter_travel,is_spring_break,is_thanksgiving_week,is_christmas_week,month_sin,month_cos,dayofweek_sin,dayofweek_cos,distance_category,is_transcontinental
0,n13161,ewr,ind,645.0,1,ewr -> ind,0,14,0,0,...,0,1,0,0,0.866025,-0.5,-0.974928,-0.222521,medium,0
1,n273jb,ord,jfk,740.0,0,ord -> jfk,0,14,0,0,...,0,1,0,0,0.866025,-0.5,-0.974928,-0.222521,medium,0
2,n258nn,ord,cle,316.0,0,ord -> cle,0,15,0,0,...,0,1,0,0,0.866025,-0.5,-0.974928,-0.222521,short,0
3,n809ua,iah,dfw,224.0,1,iah -> dfw,0,15,0,0,...,0,1,0,0,0.866025,-0.5,-0.974928,-0.222521,short,0
4,n866as,sfo,ont,363.0,0,sfo -> ont,0,16,0,0,...,0,1,0,0,0.866025,-0.5,-0.433884,-0.900969,short,0


In [9]:
# 1. Define Feature Groups
target = 'is_delayed'
high_cardinality_cols = ['tail_number', 'origin', 'dest', 'airport_route']
one_hot_cols = ['distance_category']
numerical_cols = [
    'distance', 'week_of_year', 'month_sin', 'month_cos',
    'dayofweek_sin', 'dayofweek_cos'
]

# 2. Split Data FIRST
X = df.drop(columns=[target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Target Encoding (High Cardinality)
te = ce.TargetEncoder(cols=high_cardinality_cols)
X_train = te.fit_transform(X_train, y_train)
X_test = te.transform(X_test)

# 4. One-Hot Encoding (Low Cardinality)
X_train = pd.get_dummies(X_train, columns=one_hot_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=one_hot_cols, drop_first=True)

# 5. Scaling Numerical Features
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print(f"Final training shape: {X_train.shape}")

Final training shape: (447374, 24)


In [13]:
save_path = '/content/drive/My Drive/flight_project'
if not os.path.exists(save_path):
    os.makedirs(save_path)

file_name = f"{save_path}/df_final_processed.parquet"

df.to_parquet(file_name, index=False)

print(f"File successfully saved to: {file_name}")

File successfully saved to: /content/drive/My Drive/flight_project/df_final_processed.parquet
