In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [81]:
# Load dataset
df = pd.read_csv('./final-weather-flight-data/dataset_YUL-Flights-Weather.csv')

In [82]:
# Define a function to feature engineer, drop unwanted columns, and filter rows
def custom_preprocess_data(df):
    # Convert Scheduled Departure Time and Estimated Departure Time to datetime
    df['Scheduled Departure Time'] = pd.to_datetime(df['Scheduled Departure Time'])
    df['Estimated Departure Time'] = pd.to_datetime(df['Estimated Departure Time'])
    
    # Calculate the difference in minutes
    # df['Estimated Departure Delay (min)'] = (df['Estimated Departure Time'] - df['Scheduled Departure Time']).dt.total_seconds() / 60

    # Calculate the time of day
    df['Departure Time of Day'] = pd.cut(df['Scheduled Departure Time'].dt.hour, 
                                     bins=[0, 6, 12, 18, 24], 
                                     labels=['Night', 'Morning', 'Afternoon', 'Evening'], 
                                     right=False)

    # Weekday of departure
    df['Weekday of Departure'] = df['Scheduled Departure Time'].dt.day_name()

    # Calculate weather severety
    df['Weather Severity'] = np.where((df['Rain 1h'] > 0) | (df['Snow 1h'] > 0), 'Bad', 'Good')

    # Filter out detinations with a frequency less than 100
    destintaiton_counts = df['Arrival IATA Code'].value_counts()
    destinations_to_keep = destintaiton_counts[destintaiton_counts >= 100].index
    df = df[df['Arrival IATA Code'].isin(destinations_to_keep)]

    # Filter out infrequent airlines
    # airline_counts = df['Airline Name'].value_counts()
    # airlines_to_keep = airline_counts[airline_counts >= 50].index
    # df = df[df['Airline Name'].isin(airlines_to_keep)]

    # Feature engineering: Create a feature for delay status
    # df['Delay Status'] = pd.cut(df['Departure Delay (min)'], 
    #                             bins=[-np.inf, 0, 15, 60, np.inf], 
    #                             labels=['On Time', 'Slight Delay', 'Moderate Delay', 'Severe Delay'])

    # Feature engineering: Create a feature for season based on month
    df['Season'] = pd.cut(df['Scheduled Departure Time'].dt.month, 
                          bins=[0, 3, 6, 9, 12], 
                          labels=['Winter', 'Spring', 'Summer', 'Fall'], 
                          right=False)

    # Feature engineering: Create a binary feature for weekend departure
    df['Weekend Departure'] = df['Weekday of Departure'].isin(['Saturday', 'Sunday']).astype(int)

    # Feature engineering: Create a feature for visibility based on weather conditions
    df['Visibility'] = np.where((df['Weather Main'].isin(['Fog', 'Mist', 'Haze', 'Snow', 'Rain'])), 'Low', 'High')

    # # Convert Scheduled Arrival Time and Scheduled Departure Time to datetime before calculating duration
    # df['Scheduled Arrival Time'] = pd.to_datetime(df['Scheduled Arrival Time'])
    # df['Scheduled Departure Time'] = pd.to_datetime(df['Scheduled Departure Time'])
    # df['Flight Duration (min)'] = (df['Scheduled Arrival Time'] - df['Scheduled Departure Time']).dt.total_seconds() / 60

    # Drop unwanted columns
    df = df.drop(columns=['Type', 'Departure IATA Code', 'Scheduled Departure Time', 'Estimated Departure Time', 
    'Actual Departure Time', 'Arrival Terminal', 'Scheduled Arrival Time', 'Estimated Arrival Time', 'Flight Number',
    'IATA Flight Number', 'Timestamp', 'Weather Description'])
    
    # Filter rows where 'Status' is not 'active'
    df = df[df['Status'] == 'active']
    
    # Drop the 'Status' column as it's no longer needed
    df = df.drop(columns=['Status'])
    
    return df

df = custom_preprocess_data(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Season'] = pd.cut(df['Scheduled Departure Time'].dt.month,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Weekend Departure'] = df['Weekday of Departure'].isin(['Saturday', 'Sunday']).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Visibility'] = np.where((df['Weather Main'].

In [83]:
# Define categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

In [84]:
# Preprocessing for numerical data with KNNImputer
numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),  # Using 5 neighbors for imputation
    ('scaler', StandardScaler())])

In [85]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])

In [86]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

In [87]:
# Define the model preprocessing pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [88]:
# Apply the pipeline to the dataset
X_processed = pipeline.fit_transform(df)

In [89]:
# Convert the output back to a DataFrame

def get_feature_names_out(column_transformer):
    """Get output feature names for the given ColumnTransformer."""
    feature_names = []

    # Loop through each transformer within the ColumnTransformer
    for transformer_name, transformer, original_features in column_transformer.transformers_:
        if transformer_name == 'remainder':
            continue
        
        if hasattr(transformer, 'get_feature_names_out'):
            # If the transformer can generate feature names
            names = transformer.get_feature_names_out(original_features)
        else:
            # Otherwise, use the original feature names
            names = original_features
        
        feature_names.extend(names)
    
    return feature_names

def transform_output_to_df(X_processed, preprocessor, original_df):
    """Convert the output of the processing pipeline back to a pandas DataFrame."""
    feature_names = get_feature_names_out(preprocessor)
    processed_df = pd.DataFrame(X_processed.toarray() if hasattr(X_processed, 'toarray') else X_processed, 
                                columns=feature_names, 
                                index=original_df.index)
    return processed_df

processed_df = transform_output_to_df(X_processed, pipeline['preprocessor'], df)

processed_df.head()

Unnamed: 0,Departure Delay (min),Temperature,Feels Like,Pressure,Humidity,Wind Speed,Wind Degree,Clouds,Rain 1h,Snow 1h,Departure Gate,Arrival IATA Code,Airline Name,Weather Main,Departure Time of Day,Weekday of Departure,Weather Severity,Season,Visibility
0,-0.177559,-0.589572,-0.451384,0.638452,-1.848781,-0.665376,-1.498219,-1.818275,-0.197941,-0.179089,41.0,24.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
1,0.12038,-0.589572,-0.451384,0.638452,-1.848781,-0.665376,-1.498219,-1.818275,-0.197941,-0.179089,43.0,4.0,31.0,1.0,1.0,0.0,1.0,1.0,0.0
3,2.563484,-0.589572,-0.451384,0.638452,-1.848781,-0.665376,-1.498219,-1.818275,-0.197941,-0.179089,28.0,65.0,23.0,1.0,1.0,0.0,1.0,1.0,0.0
4,0.030999,-0.589572,-0.451384,0.638452,-1.848781,-0.665376,-1.498219,-1.818275,-0.197941,-0.179089,39.0,65.0,72.0,1.0,1.0,0.0,1.0,1.0,0.0
5,-0.237147,-0.589572,-0.451384,0.638452,-1.848781,-0.665376,-1.498219,-1.818275,-0.197941,-0.179089,69.0,21.0,4.0,1.0,1.0,0.0,1.0,1.0,0.0


In [90]:
# Save the processed dataset
processed_df.to_csv('./final-weather-flight-data/processed_dataset_YUL-Flights-Weather.csv', index=False)