In [160]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [161]:
# Read the combined YUL flights/weather CSV into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='./final-weather-flight-data/dataset_YUL-Flights-Weather.csv',
)

In [162]:
# Define a function to feature engineer, drop unwanted columns, and filter rows
def custom_preprocess_data(df):
    """Engineer departure/weather features, drop unused columns, keep active flights.

    The work is done on a copy, so the DataFrame passed in is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records. Must contain 'Scheduled Departure Time',
        'Estimated Departure Time', 'Rain 1h', 'Snow 1h', 'Status' and every
        column named in the drop step below.

    Returns
    -------
    pandas.DataFrame
        A new frame that
        * gains 'Departure Time of Day', 'Weekday of Departure' and
          'Weather Severity',
        * loses the columns named in the drop step as well as 'Status',
        * contains only the rows whose 'Status' was 'active'.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Copy first: the column assignments below would otherwise add/overwrite
    # columns on the caller's frame as a side effect.
    df = df.copy()

    # Parse both departure timestamps. Only the scheduled time is read below;
    # the estimated time is parsed too but dropped again without further use.
    df['Scheduled Departure Time'] = pd.to_datetime(df['Scheduled Departure Time'])
    df['Estimated Departure Time'] = pd.to_datetime(df['Estimated Departure Time'])

    # Bucket the scheduled hour into left-closed 6-hour bands:
    # 0-5 Night, 6-11 Morning, 12-17 Afternoon, 18-23 Evening.
    df['Departure Time of Day'] = pd.cut(
        df['Scheduled Departure Time'].dt.hour,
        bins=[0, 6, 12, 18, 24],
        labels=['Night', 'Morning', 'Afternoon', 'Evening'],
        right=False,
    )

    # Weekday name of the scheduled departure
    df['Weekday of Departure'] = df['Scheduled Departure Time'].dt.day_name()

    # Weather severity: 'Bad' when any rain or snow was recorded, else 'Good'.
    # Missing values compare as False, so they fall into 'Good'.
    df['Weather Severity'] = np.where((df['Rain 1h'] > 0) | (df['Snow 1h'] > 0), 'Bad', 'Good')

    # Drop unwanted columns
    df = df.drop(columns=['Type', 'Departure IATA Code', 'Scheduled Departure Time', 'Estimated Departure Time',
    'Actual Departure Time', 'Arrival Terminal', 'Scheduled Arrival Time', 'Estimated Arrival Time', 'Flight Number',
    'IATA Flight Number', 'Timestamp', 'Weather Description'])

    # Keep only the rows whose 'Status' is 'active'
    df = df[df['Status'] == 'active']

    # 'Status' is constant after the filter, so it is dropped as well
    df = df.drop(columns=['Status'])

    return df

# Rebind df to the preprocessed frame. NOTE: this cell is not re-runnable on
# its own -- the function reads 'Scheduled Departure Time', which it also
# drops, so a second run needs the dataset to be reloaded first.
df = custom_preprocess_data(df)

In [163]:
# Split the columns by dtype for the two preprocessing branches.
# Text / categorical columns:
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
# Numeric columns. 'number' matches every numeric dtype (e.g. int32, float32),
# not only int64/float64; a column that lands in neither list is not handed to
# any transformer, and ColumnTransformer's default remainder='drop' discards it.
numerical_cols = df.select_dtypes(include='number').columns

In [164]:
# Numeric branch: KNN imputation (5 neighbours) followed by standard scaling
numerical_transformer = Pipeline(
    [
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler()),
    ]
)

In [165]:
# Categorical branch: fill gaps with the most frequent value, then map each
# category to an integer code; categories unseen at fit time are encoded as -1
categorical_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'ordinal',
            OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value'),
        ),
    ]
)

In [166]:
# Route each column group through its own branch; numeric outputs come first
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

In [167]:
# Wrap the preprocessor in a single-step pipeline
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
    ]
)

In [168]:
# Fit the pipeline on the dataset and transform it in one pass.
# NOTE(review): the fit sees every row and every column of df, including
# 'Departure Delay (min)' -- if that column is the prediction target, or a
# train/test split follows, confirm this is intended.
X_processed = pipeline.fit_transform(X=df)

In [169]:
# Convert the output back to a DataFrame

def get_feature_names_out(column_transformer):
    """Collect the output feature names of a fitted ColumnTransformer.

    Parameters
    ----------
    column_transformer : object
        Any object exposing ``transformers_``: an iterable of
        ``(name, transformer, columns)`` triples, as a fitted
        ``ColumnTransformer`` does.

    Returns
    -------
    list
        Feature names in the order the triples are listed. The entry named
        'remainder' and entries whose transformer is the string 'drop'
        contribute nothing.
    """
    feature_names = []

    # Loop through each transformer within the ColumnTransformer
    for transformer_name, transformer, original_features in column_transformer.transformers_:
        if transformer_name == 'remainder':
            continue

        # 'drop' marks columns that are discarded from the output, so their
        # names must not be reported (they would misalign the column labels).
        if isinstance(transformer, str) and transformer == 'drop':
            continue

        if hasattr(transformer, 'get_feature_names_out'):
            # If the transformer can generate feature names
            names = transformer.get_feature_names_out(original_features)
        else:
            # Otherwise (e.g. the string 'passthrough'), keep the input names
            names = original_features

        # A single column given as a bare string would otherwise be extended
        # one character at a time.
        if isinstance(names, str):
            names = [names]

        feature_names.extend(names)

    return feature_names

def transform_output_to_df(X_processed, preprocessor, original_df):
    """Wrap the transformed matrix in a DataFrame with named columns.

    Column labels come from ``get_feature_names_out(preprocessor)``; the row
    index is taken from ``original_df``. Input exposing ``toarray`` (such as a
    sparse matrix) is densified first.
    """
    column_labels = get_feature_names_out(preprocessor)

    if hasattr(X_processed, 'toarray'):
        dense_values = X_processed.toarray()
    else:
        dense_values = X_processed

    return pd.DataFrame(dense_values, index=original_df.index, columns=column_labels)

# Rebuild a labelled DataFrame from the transformed matrix, then preview it
processed_df = transform_output_to_df(
    X_processed=X_processed,
    preprocessor=pipeline.named_steps['preprocessor'],
    original_df=df,
)

processed_df.head()

Unnamed: 0,Departure Delay (min),Temperature,Feels Like,Pressure,Humidity,Wind Speed,Wind Degree,Clouds,Rain 1h,Snow 1h,Departure Gate,Arrival IATA Code,Airline Name,Weather Main,Departure Time of Day,Weekday of Departure,Weather Severity
0,-0.175553,-0.592618,-0.456049,0.639606,-1.823912,-0.665222,-1.503404,-1.81669,-0.198237,-0.178731,43.0,117.0,1.0,1.0,1.0,0.0,1.0
1,0.124795,-0.592618,-0.456049,0.639606,-1.823912,-0.665222,-1.503404,-1.81669,-0.198237,-0.178731,45.0,38.0,44.0,1.0,1.0,0.0,1.0
2,-0.866354,-0.592618,-0.456049,0.639606,-1.823912,-0.665222,-1.503404,-1.81669,-0.198237,-0.178731,38.0,5.0,10.0,1.0,1.0,0.0,1.0
3,2.587649,-0.592618,-0.456049,0.639606,-1.823912,-0.665222,-1.503404,-1.81669,-0.198237,-0.178731,30.0,261.0,33.0,1.0,1.0,0.0,1.0
4,0.03469,-0.592618,-0.456049,0.639606,-1.823912,-0.665222,-1.503404,-1.81669,-0.198237,-0.178731,41.0,261.0,95.0,1.0,1.0,0.0,1.0


In [171]:
# Write the processed frame to CSV; the row index is not written out
processed_df.to_csv(
    './final-weather-flight-data/processed_dataset_YUL-Flights-Weather.csv',
    index=False,
)