In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [19]:
# Load dataset
df = pd.read_csv('final-weather-flight-data/full_dataset_YUL-Flights-Weather.csv')

In [20]:
df.columns

Index(['Status', 'Departure Gate', 'Departure Delay (min)',
       'Scheduled Departure Time', 'Arrival IATA Code', 'Airline Name',
       'Temperature', 'Feels Like', 'Pressure', 'Humidity', 'Wind Speed',
       'Wind Degree', 'Clouds', 'Weather Main', 'Rain 1h', 'Snow 1h'],
      dtype='object')

In [21]:
# Define a function to feature engineer, drop unwanted columns, and filter rows
def custom_preprocess_data(df):
    # Convert Scheduled Departure Time and Estimated Departure Time to datetime
    df['Scheduled Departure Time'] = pd.to_datetime(df['Scheduled Departure Time'])
    #df['Estimated Departure Time'] = pd.to_datetime(df['Estimated Departure Time'])
    
    # Calculate the difference in minutes
    # df['Estimated Departure Delay (min)'] = (df['Estimated Departure Time'] - df['Scheduled Departure Time']).dt.total_seconds() / 60

    # Calculate the time of day
    df['Departure Time of Day'] = pd.cut(df['Scheduled Departure Time'].dt.hour, 
                                     bins=[0, 6, 12, 18, 24], 
                                     labels=['Night', 'Morning', 'Afternoon', 'Evening'], 
                                     right=False)

    # Weekday of departure
    df['Weekday of Departure'] = df['Scheduled Departure Time'].dt.day_name()

    # Calculate weather severety
    df['Weather Severity'] = np.where((df['Rain 1h'] > 0) | (df['Snow 1h'] > 0), 'Bad', 'Good')

    # Filter out detinations with a frequency less than 100
    destintaiton_counts = df['Arrival IATA Code'].value_counts()
    destinations_to_keep = destintaiton_counts[destintaiton_counts >= 100].index
    df = df[df['Arrival IATA Code'].isin(destinations_to_keep)]

    # Filter out infrequent airlines
    # airline_counts = df['Airline Name'].value_counts()
    # airlines_to_keep = airline_counts[airline_counts >= 50].index
    # df = df[df['Airline Name'].isin(airlines_to_keep)]

    # Feature engineering: Create a feature for delay status
    # df['Delay Status'] = pd.cut(df['Departure Delay (min)'], 
    #                             bins=[-np.inf, 0, 15, 60, np.inf], 
    #                             labels=['On Time', 'Slight Delay', 'Moderate Delay', 'Severe Delay'])

    # Feature engineering: Create a feature for season based on month
    df['Season'] = pd.cut(df['Scheduled Departure Time'].dt.month, 
                          bins=[0, 3, 6, 9, 12], 
                          labels=['Winter', 'Spring', 'Summer', 'Fall'], 
                          right=False)

    # Feature engineering: Create a binary feature for weekend departure
    df['Weekend Departure'] = df['Weekday of Departure'].isin(['Saturday', 'Sunday']).astype(int)

    # Feature engineering: Create a feature for visibility based on weather conditions
    df['Visibility'] = np.where((df['Weather Main'].isin(['Fog', 'Mist', 'Haze', 'Snow', 'Rain'])), 'Low', 'High')

    # # Convert Scheduled Arrival Time and Scheduled Departure Time to datetime before calculating duration
    # df['Scheduled Arrival Time'] = pd.to_datetime(df['Scheduled Arrival Time'])
    # df['Scheduled Departure Time'] = pd.to_datetime(df['Scheduled Departure Time'])
    # df['Flight Duration (min)'] = (df['Scheduled Arrival Time'] - df['Scheduled Departure Time']).dt.total_seconds() / 60

    # Drop unwanted columns
    # df = df.drop(columns=['Type', 'Departure IATA Code', 'Scheduled Departure Time', 'Estimated Departure Time', 
    # 'Actual Departure Time', 'Arrival Terminal', 'Scheduled Arrival Time', 'Estimated Arrival Time', 'Flight Number',
    # 'IATA Flight Number', 'Timestamp', 'Weather Description'])
    
    # Filter rows where 'Status' is not 'active'
    df = df[df['Status'] == 'active']
    
    # Drop the 'Status' column as it's no longer needed
    df = df.drop(columns=['Status'])
    
    return df

df = custom_preprocess_data(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Season'] = pd.cut(df['Scheduled Departure Time'].dt.month,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Weekend Departure'] = df['Weekday of Departure'].isin(['Saturday', 'Sunday']).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Visibility'] = np.where((df['Weather Main'].

In [22]:
df.columns

Index(['Departure Gate', 'Departure Delay (min)', 'Scheduled Departure Time',
       'Arrival IATA Code', 'Airline Name', 'Temperature', 'Feels Like',
       'Pressure', 'Humidity', 'Wind Speed', 'Wind Degree', 'Clouds',
       'Weather Main', 'Rain 1h', 'Snow 1h', 'Departure Time of Day',
       'Weekday of Departure', 'Weather Severity', 'Season',
       'Weekend Departure', 'Visibility'],
      dtype='object')

In [30]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.utils import shuffle

class DateTimeExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, features=None):
        self.features = features if features else ['year', 'month', 'day', 'hour', 'weekday']

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_dt = pd.to_datetime(X.iloc[:, 0])
        df = pd.DataFrame()
        if 'year' in self.features:
            df['year'] = X_dt.dt.year
        if 'month' in self.features:
            df['month'] = X_dt.dt.month
        if 'day' in self.features:
            df['day'] = X_dt.dt.day
        if 'hour' in self.features:
            df['hour'] = X_dt.dt.hour
        if 'weekday' in self.features:
            df['weekday'] = X_dt.dt.weekday
        return df

    # Let's make sure to handle feature naming properly for custom transformer
    def get_feature_names_out(self, input_features):
        return self.transform(pd.DataFrame(columns=input_features)).columns.tolist()

# Setup your column transformers and pipelines
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

numerical_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

date_transformer = DateTimeExtractor()

preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
    ('date', date_transformer, ['Scheduled Departure Time'])
])

# Pipeline definition
pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

# Function to extract feature names across transformers
def get_feature_names(column_transformer):
    feature_names = []
    
    # Loop through all transformers
    for name, transformer, columns in column_transformer.transformers_[:-1]:  # last one is remainder
        if hasattr(transformer, 'get_feature_names_out'):
            transformer_feature_names = transformer.get_feature_names_out(columns)
        else:
            transformer_feature_names = columns if isinstance(columns, (np.ndarray, list)) else [columns]
        feature_names.extend(transformer_feature_names)
    
    return feature_names

# Apply the pipeline and extract the DataFrame
X_processed = pipeline.fit_transform(df)
feature_names = get_feature_names(preprocessor)
processed_df = pd.DataFrame(X_processed, columns=feature_names, index=df.index)

# Assign weights
processed_df = shuffle(processed_df, random_state=42)
processed_df.sort_values('Scheduled Departure Time', ascending=True, inplace=True)  # Adjust column name as needed
weights = np.linspace(start=0.1, stop=1.0, num=len(processed_df))
processed_df['weights'] = weights

print(processed_df.head())


KeyError: 'Scheduled Departure Time'

In [38]:
# Save the processed dataset
processed_df.to_csv('final-weather-flight-data/full_processed_dataset_YUL-Flights-Weather.csv', index=False)