In [35]:
import os
import sys
sys.path.insert(0, '..')

from google.cloud import bigquery
from google.oauth2 import service_account

import pandas as pd
import matplotlib.pyplot as plt
from decouple import config
import json
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression

from authenticate_service_account import main
from utils import *

from sklearn.model_selection import train_test_split

from scipy import stats

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import r2_score

import category_encoders as ce
from scipy import stats

from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import KNeighborsRegressor

from keras.models import load_model

from keras.callbacks import EarlyStopping
from tensorflow import keras
from tensorflow.keras import layers, Sequential

from joblib import dump, load

from sklearn.ensemble import RandomForestRegressor

from scipy.stats import zscore


In [2]:
client = main()

query = """
SELECT *
FROM skyscanner-insights-343713.Itinerary_Scoring.training_set_80pct_ODs
"""

query_job = client.query(query)

results = query_job.result()

In [3]:
gcp_data = results.to_dataframe()

In [4]:
gcp_data.to_csv('../raw_data/skyscanner_data_171223', index=False)

In [5]:
raw_data = gcp_data.copy()

In [256]:
def all_preprocessing(raw_data, columns_to_process, target_creation_function, target,
                        box_cox_columns=False, yeo_johnson_columns=False, min_max_scaling=False, log_transform_columns=False,
                        od_encoding=False, operator_encoding=False,
                        target_func_param1=None, target_func_param2=None, target_func_param3=None):
    """
    This functions completes all feature engineering, target creation and scaling
    RETURNS: updated dataframe and a Class that holds all the scalers

    Notes:
    - It will only return columns in columns_to_process and the target
    """

    #DATA CLEANING

    # All int64 columns need to be float64, or some functions don't work. e.g zscore
    for column in raw_data.select_dtypes(include=['int64']).columns:
        raw_data[column] = raw_data[column].astype('float64')

    #FEATURE ENGINEERING SECTION

    # This creates a column to identify OD's
    raw_data['OD'] = raw_data['OriginCty'] + raw_data['DestinationCty']

    # This calculates the total layover time with ratio
    raw_data['total_layover_time'] = raw_data['DurationMin'] - raw_data['Total_Flight_Duration']
    raw_data['total_layover_time_ratio'] =raw_data['total_layover_time'] /raw_data['DurationMin']

    # This calculates the difference between total distance traveled and 'straight line' distance
    raw_data['extra_travel_distance'] = raw_data['Total_Flight_Distance'] - raw_data['TravelDistanceKm']
    raw_data['extra_travel_distance_ratio'] =  raw_data['Total_Flight_Distance'] / raw_data['TravelDistanceKm']

    # This drops all rows with neg layover time
    data_engineered = drop_neg_layover_time(raw_data)

    # Create the target
    processed_data = target_creation_function(data_engineered, target_func_param1, target_func_param2, target_func_param3)

    # Seperating target so encoders dont store a df shape that is larger than real-world data
    # This is so encoders do not expect the extra column when running on new data, which will not have a target
    y = processed_data[target]

    model_data = processed_data.drop(columns=[target])

    #BINARY ENCODING
    # Binary encoding origin and destination
    if od_encoding:
        o_encoder = ce.BinaryEncoder()
        origin_apt_encoded = o_encoder.fit_transform(model_data['OriginApt'])
        columns_to_process.extend(origin_apt_encoded.columns.to_list())

        d_encoder = ce.BinaryEncoder()
        destination_apt_encoded = d_encoder.fit_transform(model_data['DestinationApt'])
        columns_to_process.extend(destination_apt_encoded.columns.to_list())

        #Concatinating newly encoded columns
        origin_binary = pd.concat([model_data, origin_apt_encoded], axis=1)
        dest_binary = pd.concat([origin_binary, destination_apt_encoded], axis=1)
    else:
        o_encoder = None
        d_encoder = None
        dest_binary = model_data.copy()

    # Binary encoding Operator IATA'
    if operator_encoding:
        seg_0_encoder = ce.BinaryEncoder()
        seg_0_binary = seg_0_encoder.fit_transform(model_data['Seg_0_OperatingCarrierIATA'])
        columns_to_process.extend(seg_0_binary.columns.to_list())

        seg_1_encoder = ce.BinaryEncoder()
        seg_1_binary = seg_1_encoder.fit_transform(model_data['Seg_1_OperatingCarrierIATA'])
        columns_to_process.extend(seg_1_binary.columns.to_list())

        seg_2_encoder = ce.BinaryEncoder()
        seg_2_binary = seg_2_encoder.fit_transform(model_data['Seg_2_OperatingCarrierIATA'])
        columns_to_process.extend(seg_2_binary.columns.to_list())

        seg_3_encoder = ce.BinaryEncoder()
        seg_3_binary = seg_3_encoder.fit_transform(model_data['Seg_3_OperatingCarrierIATA'])
        columns_to_process.extend(seg_3_binary.columns.to_list())

        #Concatinating newly encoded columns
        seg0_bin = pd.concat([dest_binary, seg_0_binary], axis=1)
        seg1_bin = pd.concat([seg0_bin, seg_1_binary], axis=1)
        seg2_bin = pd.concat([seg1_bin, seg_2_binary], axis=1)
        all_binary = pd.concat([seg2_bin, seg_3_binary], axis=1)
    else:
        seg_0_encoder = None
        seg_1_encoder = None
        seg_2_encoder = None
        seg_3_encoder = None
        all_binary = dest_binary.copy()

    all_binary = all_binary[columns_to_process]

    #SCALING
    # Box cox

    # Dictionary to store best_lambda per column for new data processing
    box_lambdas = {}

    if box_cox_columns:
        for col in box_cox_columns:
            all_binary[col], box_lambda = stats.boxcox(all_binary[col])
            box_lambdas[col] = box_lambda

    # Yeo-johnson
    # Dictionary to store best_lambda per column for new data processing
    yeo_lambdas = {}

    if yeo_johnson_columns:
        for col in yeo_johnson_columns:
            all_binary[col], yeo_lambda = stats.yeojohnson(all_binary[col])
            yeo_lambdas[col] = yeo_lambda

    # Log transformations
    if log_transform_columns:
        for column in log_transform_columns:
            all_binary.loc[:, column] = np.log1p(model_data[column])

    #Min max scaling
    # Dictionary to store min max scaler per column for new data processing
    min_max_scalers = {}

    if min_max_scaling:
        for col in min_max_scaling:
            minmax_scaler = MinMaxScaler()
            all_binary[col] = minmax_scaler.fit_transform(all_binary[[col]])
            min_max_scalers[col] = minmax_scaler

    if 'dayofweek' in columns_to_process:
        # Cyclical encoding
        all_binary['sin_day'] = np.sin(2 * np.pi * all_binary['dayofweek'] / 7)
        all_binary['cos_day'] = np.cos(2 * np.pi * all_binary['dayofweek'] / 7)

        all_binary.drop(columns='dayofweek', inplace=True)

    if 'SelfTransfer' in columns_to_process:
        #Inversing the importance of SelfTransfer, so Non Self Transfer is seen as better by the model
        all_binary['SelfTransfer'] = all_binary['SelfTransfer'].apply(convert_bool_to_num)

    #STORING SCALERS
    class PreprocessScalers:
        def __init__(self, o_encoder, d_encoder, box_lambdas, yeo_lambdas, min_max_scalers,seg_0_encoder, seg_1_encoder, seg_2_encoder, seg_3_encoder):
                self.o_encoder = o_encoder
                self.d_encoder = d_encoder
                self.box_lambda = box_lambdas
                self.yeo_lambda = yeo_lambdas
                self.minmax_scaler = min_max_scalers
                self.seg_0_encoder = seg_0_encoder
                self.seg_1_encoder = seg_1_encoder
                self.seg_2_encoder = seg_2_encoder
                self.seg_3_encoder = seg_3_encoder

    scalers = PreprocessScalers(o_encoder, d_encoder, box_lambdas, yeo_lambdas, min_max_scalers,seg_0_encoder, seg_1_encoder, seg_2_encoder, seg_3_encoder)

    #Adding y into dataset
    all_binary[target] = y

    # Returning dataframe and scalers
    return all_binary, scalers

In [257]:
smaller_raw_data = raw_data[:5000].copy()

In [283]:
columns = ['Stops','DurationMin', 'total_layover_time_ratio', 'OriginApt', 'DestinationApt',
            'Total_Flight_Distance','extra_travel_distance_ratio', 'dayofweek',
            'TravelDistanceKm', 'PricePerPax', 'SelfTransfer']

box_cox_columns = ['DurationMin', 'TravelDistanceKm', 'PricePerPax']

yeo_johnson_columns = ['total_layover_time_ratio', 'Total_Flight_Distance', 'extra_travel_distance_ratio']

min_max_scaling = ['Stops']

log_transorm_cols = ['total_layover_time_ratio', 'Total_Flight_Distance', 'extra_travel_distance_ratio']

In [452]:
df, scal = all_preprocessing(smaller_raw_data, columns, scale_itin_redirects, 'Score_Z_score_0_50',
                  min_max_scaling=min_max_scaling, log_transform_columns=log_transorm_cols,
                  od_encoding=True, operator_encoding=True,
                  target_func_param1='ItineraryRedirects', target_func_param2=0, target_func_param3=50)

In [453]:
df

Unnamed: 0,Stops,DurationMin,total_layover_time_ratio,OriginApt,DestinationApt,Total_Flight_Distance,extra_travel_distance_ratio,TravelDistanceKm,PricePerPax,SelfTransfer,...,Seg_2_OperatingCarrierIATA_3,Seg_2_OperatingCarrierIATA_4,Seg_2_OperatingCarrierIATA_5,Seg_3_OperatingCarrierIATA_0,Seg_3_OperatingCarrierIATA_1,Seg_3_OperatingCarrierIATA_2,Seg_3_OperatingCarrierIATA_3,sin_day,cos_day,Score_Z_score_0_50
0,0.000000,240.0,0.000000,TLV,PRG,7.877018,0.693147,2635.0,183.202500,1,...,0,0,1,0,0,0,1,-0.974928,-0.222521,7.498985
1,0.000000,245.0,0.000000,TLV,PRG,7.877018,0.693147,2635.0,162.930000,1,...,0,0,1,0,0,0,1,-0.974928,-0.222521,6.400767
2,0.000000,250.0,0.000000,TLV,PRG,7.877018,0.693147,2635.0,136.458333,1,...,0,0,1,0,0,0,1,-0.433884,-0.900969,18.481165
3,0.000000,100.0,0.000000,TLV,RHO,6.678342,0.693147,794.0,168.915000,1,...,0,0,1,0,0,0,1,-0.433884,-0.900969,19.484019
4,0.000000,100.0,0.000000,TLV,RHO,6.678342,0.693147,794.0,172.500000,1,...,0,0,1,0,0,0,1,0.433884,-0.900969,6.133944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.333333,555.0,0.532217,TRN,AMS,7.034388,0.869747,818.0,97.290000,1,...,0,0,1,0,0,0,1,0.781831,0.623490,21.427186
4996,0.000000,85.0,0.000000,TRN,BCN,6.439350,0.693147,625.0,33.975000,1,...,0,0,1,0,0,0,1,-0.781831,0.623490,9.952808
4997,0.000000,90.0,0.000000,TRN,BCN,6.439350,0.693147,625.0,37.065000,1,...,0,0,1,0,0,0,1,-0.433884,-0.900969,5.217688
4998,0.000000,90.0,0.000000,TRN,BCN,6.439350,0.693147,625.0,50.275000,1,...,0,0,1,0,0,0,1,-0.974928,-0.222521,5.217688


# Processing new data

In [454]:
dohop_data = pd.read_csv('../raw_data/OptiFlyAi_testset.csv')

In [455]:
dohop_data

Unnamed: 0,flights,origin,destination,cnx_airport,flight_time,connection_time,dayofweek,bkgs_with_no_content,pax,bookings,booked_fare,total_distance,direct_distance,itinerary_fare
0,"U27151, U23903",BCN,PRG,MXP,190,145,7,False,6.0,6.0,47.681727,1419.022,1387.702,169.888494
1,"U24706, U24743",TLS,FCO,NTE,200,210,2,False,7.0,5.0,130.410688,1714.360,924.274,115.483231
2,"U23810, U23929",CDG,RAK,MXP,290,310,7,True,7.0,5.0,92.172184,2796.943,2134.007,114.013625
3,"TO3539, TO4630",CHQ,MAD,ORY,350,490,6,True,4.0,4.0,155.201097,3362.800,2476.722,
4,"W94498, U22325",KEF,BCN,LTN,310,175,2,True,4.0,4.0,81.447482,3050.737,2966.249,66.966718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018918,"AV4888, AV192",AXM,SJO,BOG,200,765,2,False,,,,1437.405,1115.362,
1018919,"AV61, AV5228",PTY,CUC,BOG,170,1075,2,False,,,,1156.722,771.364,
1018920,"HA5393, HA183",NRT,LIH,HNL,445,1400,5,False,,,,6314.110,5986.291,
1018921,"TO4613, TO4756",AGP,BCN,ORY,255,1010,5,False,,,,2359.038,732.107,


In [456]:
dohop_data['itinerary_fare'].fillna(dohop_data['booked_fare'], inplace=True)
dohop_data

Unnamed: 0,flights,origin,destination,cnx_airport,flight_time,connection_time,dayofweek,bkgs_with_no_content,pax,bookings,booked_fare,total_distance,direct_distance,itinerary_fare
0,"U27151, U23903",BCN,PRG,MXP,190,145,7,False,6.0,6.0,47.681727,1419.022,1387.702,169.888494
1,"U24706, U24743",TLS,FCO,NTE,200,210,2,False,7.0,5.0,130.410688,1714.360,924.274,115.483231
2,"U23810, U23929",CDG,RAK,MXP,290,310,7,True,7.0,5.0,92.172184,2796.943,2134.007,114.013625
3,"TO3539, TO4630",CHQ,MAD,ORY,350,490,6,True,4.0,4.0,155.201097,3362.800,2476.722,155.201097
4,"W94498, U22325",KEF,BCN,LTN,310,175,2,True,4.0,4.0,81.447482,3050.737,2966.249,66.966718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018918,"AV4888, AV192",AXM,SJO,BOG,200,765,2,False,,,,1437.405,1115.362,
1018919,"AV61, AV5228",PTY,CUC,BOG,170,1075,2,False,,,,1156.722,771.364,
1018920,"HA5393, HA183",NRT,LIH,HNL,445,1400,5,False,,,,6314.110,5986.291,
1018921,"TO4613, TO4756",AGP,BCN,ORY,255,1010,5,False,,,,2359.038,732.107,


In [457]:
dohop_data.isna().sum()

flights                       0
origin                        0
destination                   0
cnx_airport                3587
flight_time                   0
connection_time               0
dayofweek                     0
bkgs_with_no_content          0
pax                     1011761
bookings                1011761
booked_fare             1011761
total_distance             4179
direct_distance             592
itinerary_fare           423158
dtype: int64

In [458]:
def create_df_of_all_categories(raw_data, data_to_be_processed, column):
    """
    This creates a list of all categories, and appends it to existing data that needs processing
    it appends the data, processes it, and drops the unnecesary columns
    It then appends it back to the dataframe so it can be used in the next step
    """

    # Creating dummy data
    categories  = pd.DataFrame(raw_data[column].unique(), columns=[column])

    # data_to_be_processed = data_to_be_processed.reset_index(drop=True)
    # categories = categories.reset_index(drop=True)

    # Merged data for encoding
    merged_data = pd.concat([data_to_be_processed[[column]],categories], axis=0)

    return merged_data


In [459]:
def encoding_new_data(original_data, data_to_be_processed, column, encoder):
    """
    This function allows for accurate encoding, that matches the training data
    In order to do that, we need to give the encoder all the options it previously had, otherwise the binary encoding won't align
    RETURNS: Only the encoded columns
    """
    # Creating dummy data from the original data
    data_with_dummies = create_df_of_all_categories(original_data, data_to_be_processed, column)

    # Encoding the column
    encoded_columns = encoder.transform(data_with_dummies[column])

    # Removing dummy data
    real_data = encoded_columns[:len(data_to_be_processed)].copy()

    return real_data

In [477]:
def process_new_data(original_data, new_data, scalers, colums_to_keep,
                     box_cox_columns=False, yeo_johnson_columns=False, log_transform_columns=False, min_max_columns=False,
                     od_encoding=False, operator_encoding=False):
    """
    This function processes new data, using scalers and encoders from the training set
    It only returns the columns stated in columns_to_keep, and encoded columns if those options flipped to True
    """

    # DATA CLEANING

    # Filling the Null itinerary_fare data with booked_fare
    new_data['itinerary_fare'].fillna(new_data['booked_fare'], inplace=True)

    # Dropping data where itinerary_fare remains Null
    clean_data = new_data.dropna(subset=['itinerary_fare']).copy().reset_index()

    # FEATURE ENGINEERING
    clean_data['DurationMin'] = clean_data['flight_time'] + clean_data['connection_time']

    clean_data['total_layover_time'] = clean_data['DurationMin'] - clean_data['flight_time']
    clean_data['total_layover_time_ratio'] = clean_data['connection_time'] / clean_data['DurationMin']

    clean_data['extra_travel_distance'] = clean_data['total_distance'] - clean_data['direct_distance']
    clean_data['extra_travel_distance_ratio'] =  clean_data['total_distance'] / clean_data['direct_distance']

    if 'seg_0' not in clean_data.columns:
        clean_data['seg_0'] = 0
        clean_data['seg_1'] = 0

        for i in range(len(clean_data)):
            listtt = clean_data['flights'][i].split(',')
            clean_data['seg_0'][i] = listtt[0][:2]
            clean_data['seg_1'][i] = listtt[1].strip()[:2]

    # Renaming the columns
    col_rename_dict = {'origin': 'OriginApt', 'destination':'DestinationApt', 'days_to_travel':'TravelHorizonDays', 'total_distance':'Total_Flight_Distance',
                    'direct_distance':'TravelDistanceKm', 'connection_time':'total_layover_time', 'flight_time':'Total_Flight_Duration','itinerary_fare':'PricePerPax',
                    'seg_0':'Seg_0_OperatingCarrierIATA', 'seg_1':'Seg_1_OperatingCarrierIATA', 'seg_2':'Seg_2_OperatingCarrierIATA', 'seg_3':'Seg_3_OperatingCarrierIATA'}

    clean_data = clean_data.rename(columns=col_rename_dict).copy()

    #TEMP creating Stops and SelfTransfer data
    clean_data['Stops'] = 1
    clean_data['SelfTransfer'] = True

    #DATA CLEANING
    for column in clean_data.select_dtypes(include=['int64']).columns:
        clean_data[column] = clean_data[column].astype('float64')

    # ENCODING
    if od_encoding:
        #Binary encoding origin
        origin_encoded = encoding_new_data(original_data=original_data, data_to_be_processed=clean_data, column='OriginApt', encoder=scalers.o_encoder)

        # Binary encoding Destination
        destination_encoded = encoding_new_data(original_data, clean_data, 'DestinationApt', scalers.d_encoder)

        # Updating the dataset with the encoded columns
        clean_data = pd.concat([clean_data, origin_encoded, destination_encoded], axis=1)

        # Ensuring the columns are returned at the end of the function
        colums_to_keep.extend(origin_encoded.columns.to_list())
        colums_to_keep.extend(destination_encoded.columns.to_list())

    if operator_encoding:
        seg_0_op_iata = encoding_new_data(original_data, clean_data, 'Seg_0_OperatingCarrierIATA', scalers.seg_0_encoder)
        seg_1_op_iata = encoding_new_data(original_data, clean_data, 'Seg_1_OperatingCarrierIATA', scalers.seg_1_encoder)

        if 'Seg_2_OperatingCarrierIATA' in clean_data.columns:
            seg_2_op_iata = encoding_new_data(original_data, clean_data, 'Seg_2_OperatingCarrierIATA', scalers.seg_2_encoder)
        else:
            seg_2_op_iata = False

        if 'Seg_3_OperatingCarrierIATA' in clean_data.columns:
            seg_3_op_iata = encoding_new_data(original_data, clean_data, 'Seg_3_OperatingCarrierIATA', scalers.seg_3_encoder)
        else:
            seg_3_op_iata = False

        # Updating the dataset with the encoded columns

        dfs_to_concat = [clean_data, seg_0_op_iata, seg_1_op_iata]

        if seg_2_op_iata:
            dfs_to_concat.append(seg_2_op_iata)
        if seg_3_op_iata:
            dfs_to_concat.append(seg_3_op_iata)

        clean_data = pd.concat(dfs_to_concat, axis=1)

        # Ensuring the columns are returned at the end of the function
        if operator_encoding:
            colums_to_keep.extend(seg_0_op_iata.columns.to_list())
            colums_to_keep.extend(seg_1_op_iata.columns.to_list())
            if seg_2_op_iata:
                colums_to_keep.extend(seg_2_op_iata.columns.to_list())
            if seg_3_op_iata:
                colums_to_keep.extend(seg_3_op_iata.columns.to_list())


    # SCALING
    # Box cox
    if box_cox_columns:
        for col in box_cox_columns:
            clean_data.loc[:,col]  = stats.boxcox(clean_data[col], lmbda=scalers.box_lambda[col])

    # Yeo-johnson
    if yeo_johnson_columns:
        for col in yeo_johnson_columns:
            clean_data.loc[:,col] = stats.yeojohnson(clean_data[col], lmbda=scalers.yeo_lambda[col])

    # Log transformations
    if log_transform_columns:
        for col in log_transform_columns:
            clean_data.loc[:,col] = np.log1p(clean_data[col])

    #Min max scaling
    if min_max_columns:
        for col in min_max_columns:
            clean_data.loc[:,col] = scalers.minmax_scaler[col].transform(clean_data[[col]])

    if 'SelfTransfer' in colums_to_keep:
        #Inversing the importance of SelfTransfer, so Non Self Transfer is seen as better by the model
        clean_data['SelfTransfer'] = clean_data['SelfTransfer'].apply(convert_bool_to_num)

    data_to_return = clean_data[colums_to_keep].copy()

    if 'dayofweek' in colums_to_keep:
        # Cyclical encoding
        data_to_return['sin_day'] = np.sin(2 * np.pi * data_to_return['dayofweek'] / 7)
        data_to_return['cos_day'] = np.cos(2 * np.pi * data_to_return['dayofweek'] / 7)

        # Dropping day of week as it is no longer neccesary
        data_to_return = data_to_return.drop(columns=['dayofweek']).copy()

    return data_to_return

In [480]:
columns_2 = ['Stops','DurationMin', 'total_layover_time_ratio', 'OriginApt', 'DestinationApt',
            'Total_Flight_Distance','extra_travel_distance_ratio', 'dayofweek',
            'TravelDistanceKm', 'PricePerPax', 'SelfTransfer',
            'Seg_0_OperatingCarrierIATA', 'Seg_1_OperatingCarrierIATA']

box_cox_columns = ['DurationMin', 'TravelDistanceKm', 'PricePerPax']

yeo_johnson_columns = ['total_layover_time_ratio', 'Total_Flight_Distance', 'extra_travel_distance_ratio']

min_max_scaling = ['Stops']

log_transorm_cols = ['total_layover_time_ratio', 'Total_Flight_Distance', 'extra_travel_distance_ratio']

In [481]:
process_new_data(smaller_raw_data, dohop_data, scal, columns_2, min_max_columns=min_max_scaling,
                 log_transform_columns=log_transorm_cols, od_encoding=True, operator_encoding=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_data['seg_0'][i] = listtt[0][:2]
  clean_data['seg_0'][i] = listtt[0][:2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_data['seg_1'][i] = listtt[1].strip()[:2]
  clean_data['seg_1'][i] = listtt[1].strip()[:2]


Unnamed: 0,Stops,DurationMin,total_layover_time_ratio,OriginApt,DestinationApt,Total_Flight_Distance,extra_travel_distance_ratio,TravelDistanceKm,PricePerPax,SelfTransfer,...,Seg_1_OperatingCarrierIATA_0,Seg_1_OperatingCarrierIATA_1,Seg_1_OperatingCarrierIATA_2,Seg_1_OperatingCarrierIATA_3,Seg_1_OperatingCarrierIATA_4,Seg_1_OperatingCarrierIATA_5,Seg_1_OperatingCarrierIATA_6,Seg_1_OperatingCarrierIATA_7,sin_day,cos_day
0,0.333333,335.0,0.359656,BCN,PRG,7.258428,0.704369,1387.702,169.888494,0,...,0,0,1,1,1,1,1,1,-2.449294e-16,1.000000
1,0.333333,410.0,0.413562,TLS,FCO,7.447378,1.049008,924.274,115.483231,0,...,0,0,1,1,1,1,1,1,9.749279e-01,-0.222521
2,0.333333,600.0,0.416515,CDG,RAK,7.936640,0.837530,2134.007,114.013625,0,...,0,0,1,1,1,1,1,1,-2.449294e-16,1.000000
3,0.333333,840.0,0.459532,CHQ,MAD,8.120827,0.857713,2476.722,155.201097,0,...,0,0,0,0,0,0,0,0,-7.818315e-01,0.623490
4,0.333333,485.0,0.308091,KEF,BCN,8.023466,0.707288,2966.249,66.966718,0,...,0,0,1,1,1,1,1,1,9.749279e-01,-0.222521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595760,0.333333,638.0,0.302076,LGB,KOA,8.387140,0.735085,4042.931,276.785000,0,...,0,1,1,0,0,0,0,0,-4.338837e-01,-0.900969
595761,0.333333,1150.0,0.259014,QSF,RUN,9.286133,0.836533,8244.273,528.785000,0,...,0,0,1,1,0,0,0,0,4.338837e-01,-0.900969
595762,0.333333,780.0,0.552263,DUB,BIQ,7.311940,0.828742,1160.135,228.980000,0,...,0,0,0,0,0,0,0,0,-9.749279e-01,-0.222521
595763,0.333333,1380.0,0.340604,JNB,CTA,9.243756,0.890267,7200.826,649.425000,0,...,0,0,0,0,0,0,0,0,-7.818315e-01,0.623490
