# utils.py

In [87]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.svm import SVR
import xgboost as xgb
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from geopy.geocoders import Nominatim
from folium import plugins
from geopy.distance import geodesic
import streamlit as st
import joblib
import datetime
import pipeline
import datetime
import mlflow
from mlflow.tracking import MlflowClient
from math import sqrt


def load_data(path='dataset_YUL-Flights-Weather.csv', min_count=100):
    """
    Load the flights/weather dataset and apply the project filters.

    Steps, in order: drop exact duplicate rows, upper-case the destination
    and airline labels, keep destinations and then airlines that occur at
    least ``min_count`` times, keep only rows whose 'Status' is 'active',
    and drop the 'Status', 'Departure Gate' and 'IATA Flight Number' columns.

    Parameters:
    path (str, optional): CSV file to read.
    min_count (int, optional): Minimum number of rows a destination or an
                               airline needs in order to be kept.

    Returns:
    DataFrame: The filtered dataset.
    """
    df = pd.read_csv(path).drop_duplicates()

    # Normalise case BEFORE counting, so that spellings such as 'yyz' and
    # 'YYZ' are counted as one destination instead of being counted (and
    # possibly discarded) separately.
    df['Arrival IATA Code'] = df['Arrival IATA Code'].str.upper()
    df['Airline Name'] = df['Airline Name'].str.upper()

    # Destinations are filtered first; airline frequencies are then counted
    # on the rows that survived the destination filter.
    for column in ('Arrival IATA Code', 'Airline Name'):
        counts = df[column].value_counts()
        frequent = counts[counts >= min_count].index
        df = df[df[column].isin(frequent)]

    # Keep only flights whose status is 'active'.
    df = df[df['Status'] == 'active']

    # 'Status' is constant after the filter above; the other two columns are
    # not used downstream.
    return df.drop(columns=['Status', 'Departure Gate', 'IATA Flight Number'])


def preprocess_dates(df):
    """
    Derive date/time based features and per-group delay averages.

    The frame is modified in place and also returned. The two departure
    time columns are converted to datetimes and these columns are added,
    in this order: 'Estimated Departure Delay (min)',
    'Departure Time of Day', 'Weekday of Departure', 'Weekend Departure',
    'Airline Delay Rate', 'Destination Delay Rate'.

    Parameters:
    df (DataFrame): Flights frame with 'Scheduled Departure Time',
                    'Estimated Departure Time', 'Airline Name',
                    'Arrival IATA Code' and 'Departure Delay (min)'.

    Returns:
    DataFrame: The same frame with the engineered columns.
    """
    for time_col in ('Scheduled Departure Time', 'Estimated Departure Time'):
        df[time_col] = pd.to_datetime(df[time_col])
    scheduled = df['Scheduled Departure Time']

    # Estimated minus scheduled departure, expressed in minutes.
    lag = df['Estimated Departure Time'] - scheduled
    df['Estimated Departure Delay (min)'] = lag.dt.total_seconds() / 60

    # Left-closed 6-hour buckets: [0,6) Night, [6,12) Morning,
    # [12,18) Afternoon, [18,24) Evening.
    df['Departure Time of Day'] = pd.cut(
        scheduled.dt.hour,
        bins=[0, 6, 12, 18, 24],
        labels=['Night', 'Morning', 'Afternoon', 'Evening'],
        right=False,
    )

    weekday = scheduled.dt.day_name()
    df['Weekday of Departure'] = weekday
    df['Weekend Departure'] = weekday.isin(['Saturday', 'Sunday']).astype(int)

    # Experimental: mean of the target 'Departure Delay (min)' per airline
    # and per destination, broadcast back onto every row of the group.
    rate_columns = (
        ('Airline Name', 'Airline Delay Rate'),
        ('Arrival IATA Code', 'Destination Delay Rate'),
    )
    for group_col, rate_col in rate_columns:
        df[rate_col] = df.groupby(group_col)['Departure Delay (min)'].transform('mean')

    return df

def preprocess_weather(df):
    """
    Derive weather related features.

    The frame is modified in place and also returned. Added columns:
    'Weather Severity' ('Bad' when any rain or snow fell in the last hour,
    otherwise 'Good'), 'Season' (ordered categorical derived from the
    scheduled departure month) and 'Visibility' ('Low' for fog, mist, haze,
    snow or rain, otherwise 'High').

    Parameters:
    df (DataFrame): Frame with 'Rain 1h', 'Snow 1h', 'Weather Main' and a
                    datetime 'Scheduled Departure Time' column.

    Returns:
    DataFrame: The same frame with the three weather features.
    """
    # Missing precipitation compares as False and is therefore 'Good'.
    has_precipitation = (df['Rain 1h'] > 0) | (df['Snow 1h'] > 0)
    df['Weather Severity'] = np.where(has_precipitation, 'Bad', 'Good')

    # Months 1-2 -> Winter, 3-5 -> Spring, 6-8 -> Summer, 9-11 -> Fall.
    # Taking the month modulo 12 turns December (12) into 0 so that it lands
    # in the Winter bin; with the raw month it fell outside the left-closed
    # bins [0,3) ... [9,12) and produced NaN.
    month = df['Scheduled Departure Time'].dt.month
    df['Season'] = pd.cut(month % 12,
                          bins=[0, 3, 6, 9, 12],
                          labels=['Winter', 'Spring', 'Summer', 'Fall'],
                          right=False)

    low_visibility_conditions = ['Fog', 'Mist', 'Haze', 'Snow', 'Rain']
    df['Visibility'] = np.where(df['Weather Main'].isin(low_visibility_conditions), 'Low', 'High')
    return df

def unwanted_columns(df):
    """
    Return a copy of the frame without the raw columns listed below.

    'IATA Flight Number' is not part of this list. A KeyError is raised by
    pandas when any listed column is missing from the frame.

    Parameters:
    df (DataFrame): Frame that still contains the raw columns.

    Returns:
    DataFrame: New frame without the listed columns.
    """
    discarded = [
        'Type',
        'Departure IATA Code',
        'Estimated Departure Time',
        'Actual Departure Time',
        'Arrival Terminal',
        'Scheduled Arrival Time',
        'Estimated Arrival Time',
        'Flight Number',
        'Timestamp',
        'Weather Description',
        'Scheduled Departure Time',
    ]
    return df.drop(columns=discarded)


#--------------------------------------------------------------------------------------
##### encoder decoder

def encodings_imputers(df):
    """
    Build, fit and persist the preprocessing pipeline, and transform the features.

    Numerical columns are imputed with a 5-neighbour KNNImputer and
    standardised; categorical columns are imputed with their most frequent
    value and ordinal-encoded (categories unseen at fit time encode as -1).
    The fitted pipeline is written to 'preprocessing_pipeline.pkl'.

    NOTE(review): the pipeline is fitted on every row passed in; if the
    caller splits into train/test afterwards, the imputer and scaler have
    already seen the test rows.

    Parameters:
    df (DataFrame): Engineered frame including the target column
                    'Departure Delay (min)'.

    Returns:
    tuple: (X_processed, y, pipeline) - the transformed feature matrix, the
           target Series and the fitted preprocessing Pipeline.
    """
    target_column = 'Departure Delay (min)'
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Define categorical and numerical columns. 'number' matches every
    # numeric width, so integer columns that are not exactly int64 (for
    # example the result of ``astype(int)`` on platforms where that is
    # 32-bit) are not silently left out of the features.
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns
    numerical_cols = X.select_dtypes(include='number').columns

    # Preprocessing for numerical data with KNNImputer
    numerical_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=5)),  # Using 5 neighbors for imputation
        ('scaler', StandardScaler())])

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)])

    # Named so that it does not shadow the module-level ``pipeline`` import.
    preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

    # Fit on the feature frame only, so the target is never part of the
    # columns the fitted pipeline records as its input.
    X_processed = preprocessing_pipeline.fit_transform(X)

    joblib.dump(preprocessing_pipeline, 'preprocessing_pipeline.pkl')
    return X_processed, y, preprocessing_pipeline


### experimental
def encode_new_instance(instance_values, pipeline):
    """
    Encode a new instance of data using the preprocessing pipeline.

    Parameters:
    instance_values (dict or list): Values of predictors for the new instance.
                                    Either a dictionary mapping column names
                                    to values, or a list whose values follow
                                    the order of the columns the preprocessor
                                    was fitted on (its ``feature_names_in_``).
    pipeline (Pipeline): Preprocessing pipeline learned from the training data;
                         must contain a step named 'preprocessor'.

    Returns:
    array: Encoded representation of the new instance.
    """
    if isinstance(instance_values, dict):
        instance_df = pd.DataFrame([instance_values])
    else:
        # The raw values have to be labelled with the INPUT column names seen
        # at fit time. get_feature_names_out() returns the names of the
        # transformed OUTPUT columns, which the transformer cannot use to
        # select its inputs.
        input_columns = pipeline.named_steps['preprocessor'].feature_names_in_
        instance_df = pd.DataFrame([instance_values], columns=input_columns)

    # Apply the preprocessing pipeline to the new instance
    return pipeline.transform(instance_df)


#--------------------------------------------------------------------------------------


def get_feature_names_out(column_transformer):
    """
    Get output feature names for the given fitted ColumnTransformer.

    Names are collected transformer by transformer, in the order of
    ``column_transformer.transformers_``. The 'remainder' entry is skipped,
    and so are transformers configured as 'drop', because they contribute no
    output columns. Transformers without ``get_feature_names_out`` fall back
    to their input column names.

    Parameters:
    column_transformer (ColumnTransformer): Fitted transformer exposing
                                            ``transformers_``.

    Returns:
    list: Output feature names.
    """
    feature_names = []

    for transformer_name, transformer, original_features in column_transformer.transformers_:
        if transformer_name == 'remainder':
            continue

        # A 'drop' entry removes its columns from the output; listing their
        # names would leave more names than transformed columns.
        if isinstance(transformer, str) and transformer == 'drop':
            continue

        if hasattr(transformer, 'get_feature_names_out'):
            names = transformer.get_feature_names_out(original_features)
        else:
            # e.g. the 'passthrough' marker: columns keep their input names.
            names = original_features

        feature_names.extend(names)

    return feature_names

def transform_output_to_df(X_processed, preprocessor, original_df):
    """
    Wrap the preprocessing output in a DataFrame.

    Parameters:
    X_processed: Output of the preprocessing pipeline; objects exposing
                 ``toarray`` (e.g. sparse matrices) are densified first.
    preprocessor (ColumnTransformer): Fitted transformer used to name the columns.
    original_df (DataFrame): Frame whose index is reused for the result.

    Returns:
    DataFrame: Processed values labelled with the output feature names.
    """
    column_labels = get_feature_names_out(preprocessor)
    if hasattr(X_processed, 'toarray'):
        values = X_processed.toarray()
    else:
        values = X_processed
    return pd.DataFrame(values, columns=column_labels, index=original_df.index)



def split_data(X_processed, y, test_size=0.2, random_state=42):
    """
    Split already separated features and target into training and testing sets.

    Parameters:
    X_processed: Feature matrix.
    y: Target values aligned with the rows of X_processed.
    test_size (float, optional): The proportion of the dataset to include in the test split.
    random_state (int, optional): Controls the shuffling applied to the data before splitting.

    Returns:
    tuple: (X_train, X_test, y_train, y_test).
    """
    # train_test_split returns the four pieces as a list; callers receive a tuple.
    return tuple(
        train_test_split(X_processed, y, test_size=test_size, random_state=random_state)
    )

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Fit the model on the training split and score it on the test split.

    The model instance is fitted in place.

    Returns:
    tuple: (mse, r2) computed from the predictions on X_test.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    return mean_squared_error(y_test, predicted), r2_score(y_test, predicted)

## --------------------------------------------------------------------------------------
##### Front end Functions

def display_delay_prediction(unscaled_delay):
    """
    Write the predicted delay and a booking recommendation to the Streamlit page.

    Parameters:
    unscaled_delay: Predicted delay in minutes.
    """
    # (exclusive lower bound in minutes, advice), checked from the most
    # severe tier down; the first tier whose bound is exceeded wins.
    tiers = (
        (60, "⛔️ Don't buy this flight by any chance. There is a strong likelihood of significant delay (> 1 hour)."),
        (30, "⚠️ Consider booking another flight. The delay for this flight is likely to be more than 30 minutes."),
        (0, "✈️ There will be some delay, but if the flight is significantly cheaper than other options, you can consider booking it. Expect a delay of approximately 0-30 minutes."),
    )
    on_time = "🛫 This flight is expected to be on time or may experience only a minimal delay. You can proceed with booking."
    message = next((advice for bound, advice in tiers if unscaled_delay > bound), on_time)

    st.write('The delay for this flight is', unscaled_delay, 'minutes.')
    st.write(message)


def get_coordinates(airport):
    """
    Geocode an airport query with Nominatim.

    Parameters:
    airport (str): Free-text query passed to the geocoder.

    Returns:
    tuple: (latitude, longitude) of the first match.

    Raises:
    ValueError: If the geocoder finds no match for the query.
    """
    geolocator = Nominatim(user_agent="flight_app")
    location = geolocator.geocode(airport)
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None.
    if location is None:
        raise ValueError(f"Could not find coordinates for airport {airport!r}")
    return location.latitude, location.longitude

def plot_flight_curve(origin, destination):
    """
    Build a folium map with a straight line from the origin to the destination.

    The origin marker is always placed at the fixed coordinates below;
    ``origin`` is only used as the popup text. The destination is geocoded
    with get_coordinates().

    Parameters:
    origin (str): Label shown on the origin marker.
    destination (str): Destination query; also the popup text of its marker.

    Returns:
    folium.Map: Map containing both markers and the connecting line.
    """
    origin_coords = (45.5017, -73.5673)
    dest_coords = get_coordinates(destination)

    flight_map = folium.Map(location=list(origin_coords), zoom_start=4)

    # One marker per end point
    for coords, label in ((origin_coords, origin), (dest_coords, destination)):
        folium.Marker(location=[coords[0], coords[1]], popup=label).add_to(flight_map)

    # Line between the two end points
    folium.PolyLine(locations=[origin_coords, dest_coords], color='blue').add_to(flight_map)

    return flight_map

#--------------------------------------------------------------------------------------

def Deployment_Feature_Creation_From_User_Input(Temp):
    """
    Build the table of date/weather features used to look up user selections.

    A 'Date' column (calendar date of 'Scheduled Departure Time') is added to
    the frame that was passed in. The result keeps only the date and weather
    related columns, with fully identical rows removed and a fresh 0..n-1
    index.

    Parameters:
    Temp (DataFrame): Engineered flights frame; it receives the 'Date' column.

    Returns:
    DataFrame: De-duplicated feature table starting with 'Date'.
    """
    # Convert 'Scheduled Departure Time' to date
    Temp['Date'] = pd.to_datetime(Temp['Scheduled Departure Time']).dt.date

    feature_columns = ['Date', 'Departure Time of Day', 'Temperature', 'Feels Like', 'Pressure',
                       'Humidity', 'Wind Speed', 'Wind Degree', 'Clouds', 'Weather Main',
                       'Rain 1h', 'Snow 1h', 'Weekday of Departure', 'Weekend Departure',
                       'Weather Severity', 'Season', 'Visibility']

    # Rows are duplicates only when ALL selected columns match. The
    # non-inplace calls return new frames, so nothing is modified on a slice
    # of the input (which is what triggered pandas' SettingWithCopyWarning).
    return Temp[feature_columns].drop_duplicates().reset_index(drop=True)

def Match_DateTime_Fields(Temp, Date, Time_of_Day):
    """
    Return the first feature row matching a date and a time of day.

    Parameters:
    Temp (DataFrame): Feature table with 'Date' and 'Departure Time of Day'.
    Date: Value compared for equality against the 'Date' column.
    Time_of_Day: Value compared against 'Departure Time of Day'.

    Returns:
    DataFrame: At most one row, without the 'Date' column and keeping its
               original index label; empty when nothing matches.
    """
    # find rows where date and time of day both match
    is_match = (Temp['Date'] == Date) & (Temp['Departure Time of Day'] == Time_of_Day)

    # drop() without inplace returns a new frame, so the filtered slice is
    # never modified (that caused pandas' SettingWithCopyWarning).
    return Temp[is_match].drop(columns=['Date']).head(1)


#--------------------------------------------------------------------------------------




# Model.py

In [105]:
# # function to unscale the value
# def unscale_target(scaled_value):
#     mean = 28.95956813
#     std = 33.56386877
#     return scaled_value * std + mean

# df = load_data()
# df = feature_engineering(df)
# df = custom_preprocess_data(df)
# X_processed, pipeline = encodings_imputers(df)
# processed_df = transform_output_to_df(X_processed, pipeline['preprocessor'], df)
# processed_df.head()
# processed_df.to_csv('processed_dataset_YUL-Flights-Weather.csv', index=False)

# df = pd.read_csv('processed_dataset_YUL-Flights-Weather.csv')
# df 




# Build the modelling dataset: load, engineer features, drop raw columns,
# then encode/impute and split.
# NOTE(review): 'Airline Delay Rate' and 'Destination Delay Rate' are group
# means of the target computed by preprocess_dates on the full dataset, i.e.
# before the train/test split below - the reported scores may be optimistic.
fl = load_data()
fl = preprocess_dates(fl)
fl = preprocess_weather(fl)
fl = unwanted_columns(fl)
X_processed, y, pipeline = encodings_imputers(fl)
X_train, X_test, y_train, y_test = split_data(X_processed, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    # 'XGBRegressor': xgb.XGBRegressor(random_state=42),
    # 'SVR': SVR(),
    # 'Ensemble': VotingRegressor(
    #     estimators=[
    #         ('lr', LinearRegression()),
    #         ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    #         ('gb', GradientBoostingRegressor(random_state=42)),
    #         ('xgb', xgb.XGBRegressor(random_state=42)),
    #         ('svm', SVR())
    #     ]
    # )
}

# Test and validate different models
results = {}

#import sqrt
from math import sqrt


for name, model in models.items():
    # evaluate_model fits the instance in place, so models[name] is a
    # trained estimator after this call.
    mse, r2 = evaluate_model(model, X_train, y_train, X_test, y_test)
    rmse = sqrt(mse)
    results[name] = {'RMSE': rmse, 'R2': r2}
    print(f"{name} - RMSE: {rmse:.2f}, R2: {r2:.2f}")

# pickle the best model: the candidate with the lowest test RMSE measured
# above, instead of a hard-coded estimator that may not be the best one.
# The instance was already fitted during evaluation, so it is not refitted.
best_model_name = min(results, key=lambda model_name: results[model_name]['RMSE'])
best_model = models[best_model_name]
joblib.dump(best_model, 'best_model.pkl')

# # Load the model
# model = joblib.load('best_model.pkl')

LinearRegression - RMSE: 21.52, R2: 0.68
RandomForestRegressor - RMSE: 14.68, R2: 0.85
GradientBoostingRegressor - RMSE: 14.45, R2: 0.86


['best_model.pkl']

In [None]:
# Function for MLFlow tracking, comparison plotting, and logging evaluation data as a table artifact
def _log_metric_comparison(model_names, values, title, ylabel, palette, filename):
    """Draw one bar per model, save the chart to ``filename`` and log it to the active MLFlow run."""
    fig = plt.figure(figsize=(10, 6))
    sns.barplot(x=model_names, y=values, palette=palette)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(filename)
    mlflow.log_artifact(filename)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def track_model_performance_and_plot(experiment_name, models, X_train, y_train, X_test, y_test):
    """
    Track the model performance using MLFlow and compare all models with bar charts.

    Every model is fitted and evaluated inside its own MLFlow run, which
    receives the model name, RMSE and R^2, the fitted model, a
    predictions-vs-actual plot and a one-row evaluation CSV. The RMSE and R^2
    comparison charts are logged to one additional run named
    "model_comparison". Plot and CSV files are also written to the current
    working directory.

    Parameters:
    - experiment_name: str, the name of the MLFlow experiment.
    - models: dict, a dictionary of model names and their instances.
    - X_train: training data features.
    - y_train: training data target.
    - X_test: test data features.
    - y_test: test data target.
    """

    # Set the MLFlow experiment
    mlflow.set_experiment(experiment_name)

    # One {'Model Name', 'RMSE', 'R2'} record per evaluated model
    evaluation_data = []

    for model_name, model in models.items():
        # The context manager ends the run on exit, so no explicit end_run()
        # is needed.
        with mlflow.start_run():
            # Log model name
            mlflow.log_param("model_name", model_name)

            # Fit the model and predict on the held-out data
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)

            # Calculate metrics
            rmse = sqrt(mean_squared_error(y_test, predictions))
            r2 = r2_score(y_test, predictions)

            # Log metrics
            mlflow.log_metric(f"{model_name}_rmse", float(rmse))
            mlflow.log_metric(f"{model_name}_r2", float(r2))

            # Log model
            mlflow.sklearn.log_model(model, f"{model_name}_model")

            # Print out metrics
            print(f"Model: {model_name}")
            print(f"  RMSE: {rmse}")
            print(f"  R2: {r2}")

            # Create and log figure for predictions vs actual
            fig, ax = plt.subplots()
            sns.regplot(x=y_test, y=predictions, ax=ax)
            ax.set_title(f'Predictions vs Actual for {model_name}')
            ax.set_xlabel('Actual Values')
            ax.set_ylabel('Predicted Values')
            scatter_path = f"{model_name}_predictions_vs_actual.png"
            fig.savefig(scatter_path)
            mlflow.log_artifact(scatter_path)
            plt.close(fig)

            # Log evaluation data as a table artifact
            eval_data = {
                'Model Name': model_name,
                'RMSE': rmse,
                'R2': r2
            }
            evaluation_data.append(eval_data)
            table_path = f"{model_name}_evaluation_data.csv"
            pd.DataFrame([eval_data]).to_csv(table_path, index=False)
            mlflow.log_artifact(table_path)

            # Set tags
            mlflow.set_tag("developer", "Team 2")

    model_names = [record['Model Name'] for record in evaluation_data]
    rmses = [record['RMSE'] for record in evaluation_data]
    r2s = [record['R2'] for record in evaluation_data]

    # Logging an artifact while no run is active makes MLFlow open a run
    # implicitly and leave it active. Use an explicit, named run that is
    # closed when the block exits.
    with mlflow.start_run(run_name="model_comparison"):
        _log_metric_comparison(model_names, rmses, 'Comparison of Model RMSEs', 'RMSE',
                               'viridis', "model_rmse_comparison.png")
        _log_metric_comparison(model_names, r2s, 'Comparison of Model R^2 Scores', 'R^2 Score',
                               'magma', "model_r2_comparison.png")

# Run the MLFlow comparison for every candidate in `models` on the split
# created above (arguments: experiment name, models, train features/target,
# test features/target).
track_model_performance_and_plot(
    "Flight_Delay_Prediction_Comparison", models, X_train, y_train, X_test, y_test
)

In [106]:
fl.columns

Index(['Departure Delay (min)', 'Arrival IATA Code', 'Airline Name',
       'Temperature', 'Feels Like', 'Pressure', 'Humidity', 'Wind Speed',
       'Wind Degree', 'Clouds', 'Weather Main', 'Rain 1h', 'Snow 1h',
       'Estimated Departure Delay (min)', 'Departure Time of Day',
       'Weekday of Departure', 'Weekend Departure', 'Airline Delay Rate',
       'Destination Delay Rate', 'Weather Severity', 'Season', 'Visibility'],
      dtype='object')

In [79]:
# show all columns
pd.set_option('display.max_columns', None)
fl.columns

Index(['Departure Delay (min)', 'Arrival IATA Code', 'Airline Name',
       'Temperature', 'Feels Like', 'Pressure', 'Humidity', 'Wind Speed',
       'Wind Degree', 'Clouds', 'Weather Main', 'Rain 1h', 'Snow 1h',
       'Estimated Departure Delay (min)', 'Departure Time of Day',
       'Weekday of Departure', 'Weekend Departure', 'Airline Delay Rate',
       'Destination Delay Rate', 'Weather Severity', 'Season', 'Visibility'],
      dtype='object')

# Lists.py

In [89]:
#import utils_v2

# Engineered flights frame. unwanted_columns() is not applied here: it would
# drop 'Scheduled Departure Time', which
# Deployment_Feature_Creation_From_User_Input reads to derive 'Date'.
fl = preprocess_weather(preprocess_dates(load_data()))

# De-duplicated date/weather feature table built from 'fl'
Temp = Deployment_Feature_Creation_From_User_Input(fl)

# Choices offered for the date and the time of day
DateList = Temp['Date'].unique().tolist()
Time_of_Day_List = Temp['Departure Time of Day'].unique().tolist()

# Distinct destination codes and airline names present in the data
Arrival_IATA_Code_List = fl['Arrival IATA Code'].unique().tolist()
Airline_Name_List = fl['Airline Name'].unique().tolist()


def match_delay_rate(airline_name):
    """
    Look up the 'Airline Delay Rate' of an airline in the module-level ``fl`` frame.

    Parameters:
    airline_name (str): Value compared against the 'Airline Name' column.

    Returns:
    The delay rate of the first matching row.

    Raises:
    IndexError: If no row matches the airline name.
    """
    is_airline = fl['Airline Name'] == airline_name
    # The first matching row is also the first match that would remain after
    # removing duplicate (airline, rate) pairs.
    return fl.loc[is_airline, 'Airline Delay Rate'].values[0]

def match_dest_delay_rate(airline_name):
    """
    Look up the 'Destination Delay Rate' of a destination in the module-level ``fl`` frame.

    Parameters:
    airline_name (str): Despite its name, this value is compared against the
                        'Arrival IATA Code' column, i.e. it is a destination code.

    Returns:
    The delay rate of the first matching row.

    Raises:
    IndexError: If no row matches the destination code.
    """
    is_destination = fl['Arrival IATA Code'] == airline_name
    # The first matching row is also the first match that would remain after
    # removing duplicate (destination, rate) pairs.
    return fl.loc[is_destination, 'Destination Delay Rate'].values[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [90]:
Temp

Unnamed: 0,Date,Departure Time of Day,Temperature,Feels Like,Pressure,Humidity,Wind Speed,Wind Degree,Clouds,Weather Main,Rain 1h,Snow 1h,Weekday of Departure,Weekend Departure,Weather Severity,Season,Visibility
0,2024-01-12,Evening,-3.62,-5.83,1023.0,58.0,1.47,25.0,14.0,Clouds,0.0,0.00,Friday,0,Good,Winter,High
1,2024-01-12,Evening,-3.95,-7.36,1022.0,58.0,2.22,71.0,31.0,Clouds,0.0,0.00,Friday,0,Good,Winter,High
2,2024-01-12,Evening,-4.90,-9.00,1022.0,59.0,2.65,57.0,26.0,Clouds,0.0,0.00,Friday,0,Good,Winter,High
3,2024-01-12,Evening,-6.69,-11.93,1021.0,73.0,3.38,43.0,39.0,Clouds,0.0,0.00,Friday,0,Good,Winter,High
4,2024-01-13,Night,-6.86,-10.80,1015.0,76.0,2.24,23.0,100.0,Snow,0.0,0.14,Saturday,1,Bad,Winter,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,2024-01-20,Afternoon,-14.38,-21.38,1015.0,93.0,4.53,255.0,98.0,Clouds,0.0,0.00,Saturday,1,Good,Winter,High
2196,2024-01-20,Afternoon,-13.40,-20.40,1014.0,75.0,5.66,280.0,100.0,Clouds,0.0,0.00,Saturday,1,Good,Winter,High
2197,2024-01-20,Afternoon,-12.70,-15.85,1015.0,76.0,1.34,270.0,99.0,Clouds,0.0,0.00,Saturday,1,Good,Winter,High
2198,2024-01-20,Afternoon,-12.06,-19.06,1015.0,75.0,4.63,280.0,100.0,Clouds,0.0,0.00,Saturday,1,Good,Winter,High


In [91]:
#columns in fl that are not in Temp
fl.columns.difference(Temp.columns)

Index(['Actual Departure Time', 'Airline Delay Rate', 'Airline Name',
       'Arrival IATA Code', 'Arrival Terminal', 'Departure Delay (min)',
       'Departure IATA Code', 'Destination Delay Rate',
       'Estimated Arrival Time', 'Estimated Departure Delay (min)',
       'Estimated Departure Time', 'Flight Number', 'Scheduled Arrival Time',
       'Scheduled Departure Time', 'Timestamp', 'Type', 'Weather Description'],
      dtype='object')

# User Input Testing

In [102]:
from utils_v2 import *
from Lists import *

# Example user selection
selected_date = datetime.date(2024, 1, 12)
selected_TOD = 'Evening'
origin = 'YUL'
selected_destination = 'YYZ'
selected_airline = 'AIR CANADA'

# NOTE(review): Date_Features is expected to come from the star imports
# above; presumably it is the de-duplicated date/weather feature table -
# confirm against Lists.
Date_Related_Predictors = Match_DateTime_Fields(Date_Features, selected_date, selected_TOD)

airline_delay_rate = match_delay_rate(selected_airline)
destination_delay_rate = match_dest_delay_rate(selected_destination)

date_feature_columns = [
    'Temperature', 'Feels Like', 'Pressure', 'Humidity', 'Wind Speed',
    'Wind Degree', 'Clouds', 'Weather Main', 'Rain 1h', 'Snow 1h',
    'Weekday of Departure', 'Weekend Departure', 'Weather Severity',
    'Season', 'Visibility',
]

# The matched row keeps the index label it had in the feature table.
# Building the frame from its Series with index=[0] re-aligned on labels and
# produced NaN for every column unless that label happened to be 0, so the
# index is reset instead and the two scalar rates are added afterwards.
instance = Date_Related_Predictors[date_feature_columns].reset_index(drop=True)
instance['Airline Delay Rate'] = airline_delay_rate
instance['Destination Delay Rate'] = destination_delay_rate


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [103]:
instance

Unnamed: 0,Temperature,Feels Like,Pressure,Humidity,Wind Speed,Wind Degree,Clouds,Weather Main,Rain 1h,Snow 1h,Weekday of Departure,Weekend Departure,Weather Severity,Season,Visibility,Airline Delay Rate,Destination Delay Rate
0,-3.62,-5.83,1023.0,58.0,1.47,25.0,14.0,Clouds,0.0,0.0,Friday,0,Good,Winter,High,27.743131,29.896607


Unnamed: 0,Type,Departure IATA Code,Departure Delay (min),Scheduled Departure Time,Estimated Departure Time,Actual Departure Time,Arrival IATA Code,Arrival Terminal,Scheduled Arrival Time,Estimated Arrival Time,Airline Name,Flight Number,Timestamp,Temperature,Feels Like,Pressure,Humidity,Wind Speed,Wind Degree,Clouds,Weather Main,Weather Description,Rain 1h,Snow 1h,Estimated Departure Delay (min),Departure Time of Day,Weekday of Departure,Weekend Departure,Airline Delay Rate,Destination Delay Rate,Weather Severity,Season,Visibility,Date
0,departure,yul,23,2024-01-12 19:00:00,2024-01-12 19:00:00,2024-01-12 19:00:00,LHR,2,2024-01-13t06:30:00.000,2024-01-13t06:11:00.000,AEGEAN AIRLINES,3089,2024-01-12 19:00:00,-3.62,-5.83,1023.0,58.0,1.47,25.0,14.0,Clouds,few clouds,0.0,0.0,0.0,Evening,Friday,0,33.936170,36.266932,Good,Winter,High,2024-01-12
1,departure,yul,33,2024-01-12 19:00:00,2024-01-12 19:00:00,2024-01-12 19:00:00,CDG,2e,2024-01-13t08:05:00.000,2024-01-13t07:45:00.000,DELTA AIR LINES,8612,2024-01-12 19:00:00,-3.62,-5.83,1023.0,58.0,1.47,25.0,14.0,Clouds,few clouds,0.0,0.0,0.0,Evening,Friday,0,29.700730,29.299674,Good,Winter,High,2024-01-12
3,departure,yul,115,2024-01-12 19:00:00,2024-01-12 21:00:00,,YYZ,1,2024-01-12t20:46:00.000,,AZUL,7807,2024-01-12 19:00:00,-3.62,-5.83,1023.0,58.0,1.47,25.0,14.0,Clouds,few clouds,0.0,0.0,120.0,Evening,Friday,0,31.860335,29.896607,Good,Winter,High,2024-01-12
4,departure,yul,30,2024-01-12 19:00:00,2024-01-12 19:00:00,,YYZ,3,2024-01-12t21:00:00.000,,PORTER,130,2024-01-12 19:00:00,-3.62,-5.83,1023.0,58.0,1.47,25.0,14.0,Clouds,few clouds,0.0,0.0,0.0,Evening,Friday,0,22.259259,29.896607,Good,Winter,High,2024-01-12
5,departure,yul,21,2024-01-12 19:00:00,2024-01-12 19:00:00,2024-01-12 19:00:00,LAS,3,2024-01-12t22:25:00.000,2024-01-12t22:38:00.000,AIR CANADA ROUGE,1709,2024-01-12 19:00:00,-3.62,-5.83,1023.0,58.0,1.47,25.0,14.0,Clouds,few clouds,0.0,0.0,0.0,Evening,Friday,0,30.979713,34.004950,Good,Winter,High,2024-01-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30819,departure,yul,21,2024-01-20 18:00:00,2024-01-20 19:00:00,,LGA,,2024-01-20t19:55:00.000,,AIR CANADA,644,2024-01-20 18:00:00,-11.95,-18.95,1014.0,80.0,5.14,260.0,100.0,Clouds,overcast clouds,0.0,0.0,60.0,Evening,Saturday,1,27.743131,26.824561,Good,Winter,High,2024-01-20
30820,departure,yul,57,2024-01-19 22:00:00,2024-01-19 23:00:00,,MBJ,,2024-01-20t06:40:00.000,,AIR TRANSAT,150,2024-01-19 22:00:00,-16.53,-22.45,1014.0,92.0,2.41,236.0,60.0,Clouds,broken clouds,0.0,0.0,60.0,Evening,Friday,0,24.720446,27.986486,Good,Winter,High,2024-01-19
30821,departure,yul,90,2024-01-20 12:00:00,2024-01-20 14:00:00,,YYZ,,2024-01-20t13:33:00.000,,AIR CANADA,7053,2024-01-20 12:00:00,-14.75,-21.75,1014.0,66.0,5.14,270.0,100.0,Clouds,overcast clouds,0.0,0.0,120.0,Afternoon,Saturday,1,27.743131,29.896607,Good,Winter,High,2024-01-20
30823,departure,yul,41,2024-01-20 15:00:00,2024-01-20 15:00:00,,YQM,,2024-01-20t17:20:00.000,,AIR CANADA,7125,2024-01-20 15:00:00,-13.40,-20.40,1014.0,75.0,5.66,280.0,100.0,Clouds,overcast clouds,0.0,0.0,0.0,Afternoon,Saturday,1,27.743131,32.570213,Good,Winter,High,2024-01-20
