In [35]:
import os
import sys
sys.path.insert(0, '..')

from google.cloud import bigquery
from google.oauth2 import service_account

import pandas as pd
import matplotlib.pyplot as plt
from decouple import config
import json
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression

from authenticate_service_account import main
from utils import *

from sklearn.model_selection import train_test_split

from scipy import stats

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import r2_score

import category_encoders as ce
from scipy import stats

from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import KNeighborsRegressor

from keras.models import load_model

from keras.callbacks import EarlyStopping
from tensorflow import keras
from tensorflow.keras import layers, Sequential

from joblib import dump, load

from sklearn.ensemble import RandomForestRegressor

from scipy.stats import zscore


In [2]:
# `main` is imported from the project's authenticate_service_account module;
# the object it returns is used below as the query client
# (presumably an authenticated BigQuery client - confirm in that module).
client = main()

# Select every column of the training-set table.
query = """
SELECT *
FROM skyscanner-insights-343713.Itinerary_Scoring.training_set_80pct_ODs
"""

# Submit the query ...
query_job = client.query(query)

# ... and fetch its result set (converted to a DataFrame in the next cell).
results = query_job.result()

In [3]:
# Materialise the query results as a DataFrame.
gcp_data = results.to_dataframe()

In [4]:
# Save a local snapshot of the download, without the index column.
# Note: the target path has no file extension.
gcp_data.to_csv('../raw_data/skyscanner_data_171223', index=False)

In [5]:
# Work on a copy so the downloaded frame itself is left untouched.
raw_data = gcp_data.copy()

In [8]:
# Inspect the dtype of every column.
raw_data.dtypes

OriginApt                      object
OriginCty                      object
OriginCtry                     object
DestinationApt                 object
DestinationCty                 object
DestinationCtry                object
TravelHorizonDays               Int64
TravelDistanceKm                Int64
SelfTransfer                  boolean
Stops                           Int64
DurationMin                     Int64
dayofweek                       Int64
Seg_0_OperatingCarrierIATA     object
Seg_1_OperatingCarrierIATA     object
Seg_2_OperatingCarrierIATA     object
Seg_3_OperatingCarrierIATA     object
Total_Flight_Distance           Int64
Total_Flight_Duration           Int64
passengers                      Int64
PricePerPax                   float64
ItineraryRedirects              Int64
ODRedirects                     Int64
dtype: object

In [171]:
class PreprocessScalers:
    """
    Holds every encoder / scaling parameter fitted by all_preprocessing so the
    same transformations can be replayed on new data.

    Defined at module level (not inside the function) so instances can be
    pickled, e.g. with joblib.dump.

    Attributes:
    - o_encoder, d_encoder: fitted binary encoders for 'OriginApt' / 'DestinationApt', or None
    - box_lambda: dict {column: fitted Box-Cox lambda}
    - yeo_lambda: dict {column: fitted Yeo-Johnson lambda}
    - minmax_scaler: dict {column: fitted MinMaxScaler}
    - seg_0_encoder ... seg_3_encoder: fitted binary encoders for the
      'Seg_<i>_OperatingCarrierIATA' columns, or None
    """

    def __init__(self, o_encoder, d_encoder, box_lambdas, yeo_lambdas, min_max_scalers,
                 seg_0_encoder, seg_1_encoder, seg_2_encoder, seg_3_encoder):
        self.o_encoder = o_encoder
        self.d_encoder = d_encoder
        self.box_lambda = box_lambdas
        self.yeo_lambda = yeo_lambdas
        self.minmax_scaler = min_max_scalers
        self.seg_0_encoder = seg_0_encoder
        self.seg_1_encoder = seg_1_encoder
        self.seg_2_encoder = seg_2_encoder
        self.seg_3_encoder = seg_3_encoder


def _fit_binary_encoder(frame, column):
    """Fit a BinaryEncoder on a single column; returns (encoder, encoded dataframe)."""
    encoder = ce.BinaryEncoder()
    encoded = encoder.fit_transform(frame[column])
    return encoder, encoded


def all_preprocessing(raw_data, columns_to_process, target_creation_function, target,
                        box_cox_columns=False, yeo_johnson_columns=False, min_max_scaling=False, log_transform_columns=False,
                        od_encoding=False, operator_encoding=False,
                        target_func_param1=None, target_func_param2=None, target_func_param3=None):
    """
    This function completes all feature engineering, target creation and scaling.

    ARGS:
    - raw_data: dataframe with the raw itinerary columns. It is copied, not modified.
    - columns_to_process: list of feature columns to keep. It is copied, not modified;
      the names of any binary-encoded columns are appended to the copy.
    - target_creation_function: callable invoked as
      f(dataframe, target_func_param1, target_func_param2, target_func_param3);
      it must return a dataframe containing the `target` column.
    - target: name of the target column created by target_creation_function.
    - box_cox_columns / yeo_johnson_columns / log_transform_columns / min_max_scaling:
      lists of columns to transform (any falsy value skips that step).
    - od_encoding: binary-encode 'OriginApt' and 'DestinationApt'.
    - operator_encoding: binary-encode the four 'Seg_<i>_OperatingCarrierIATA' columns.

    RETURNS: (dataframe, PreprocessScalers) - the processed dataframe and the object
    holding every fitted encoder / scaler.

    Notes:
    - It will only return columns in columns_to_process (plus encoded columns,
      'sin_day' / 'cos_day' when 'dayofweek' was selected) and the target
    - The log transform is computed from the un-scaled values, so for a column that is
      also listed for Box-Cox / Yeo-Johnson the log result replaces that transform
    """

    # Never mutate the caller's objects: the list is extended below and the
    # dataframe gets new columns, which would otherwise leak into the notebook state
    # and make a second call behave differently from the first.
    columns_to_process = list(columns_to_process)
    raw_data = raw_data.copy()

    #DATA CLEANING

    # All int64 columns need to be float64, or some functions don't work. e.g zscore
    for column in raw_data.select_dtypes(include=['int64']).columns:
        raw_data[column] = raw_data[column].astype('float64')

    #FEATURE ENGINEERING SECTION

    # This creates a column to identify OD's
    raw_data['OD'] = raw_data['OriginCty'] + raw_data['DestinationCty']

    # This calculates the total layover time with ratio
    raw_data['total_layover_time'] = raw_data['DurationMin'] - raw_data['Total_Flight_Duration']
    raw_data['total_layover_time_ratio'] = raw_data['total_layover_time'] / raw_data['DurationMin']

    # This calculates the difference between total distance traveled and 'straight line' distance
    raw_data['extra_travel_distance'] = raw_data['Total_Flight_Distance'] - raw_data['TravelDistanceKm']
    raw_data['extra_travel_distance_ratio'] = raw_data['Total_Flight_Distance'] / raw_data['TravelDistanceKm']

    # This drops all rows with neg layover time
    data_engineered = drop_neg_layover_time(raw_data)

    # Create the target
    processed_data = target_creation_function(data_engineered, target_func_param1, target_func_param2, target_func_param3)

    # Separating the target so encoders don't store a df shape that is larger than real-world data
    # This is so encoders do not expect the extra column when running on new data, which will not have a target
    y = processed_data[target]
    model_data = processed_data.drop(columns=[target])

    #BINARY ENCODING
    encoded_frames = []

    # Binary encoding origin and destination
    o_encoder = None
    d_encoder = None
    if od_encoding:
        o_encoder, origin_apt_encoded = _fit_binary_encoder(model_data, 'OriginApt')
        d_encoder, destination_apt_encoded = _fit_binary_encoder(model_data, 'DestinationApt')
        encoded_frames.extend([origin_apt_encoded, destination_apt_encoded])

    # Binary encoding Operator IATA (one encoder per segment)
    seg_encoders = [None, None, None, None]
    if operator_encoding:
        for segment in range(4):
            seg_column = 'Seg_{}_OperatingCarrierIATA'.format(segment)
            seg_encoders[segment], seg_encoded = _fit_binary_encoder(model_data, seg_column)
            encoded_frames.append(seg_encoded)

    # The encoded columns are always kept
    for encoded in encoded_frames:
        columns_to_process.extend(encoded.columns.to_list())

    # Concatenating newly encoded columns; .copy() so the assignments below act on
    # an independent frame rather than on a slice
    all_binary = pd.concat([model_data] + encoded_frames, axis=1)[columns_to_process].copy()

    #SCALING
    # Box cox
    # Dictionary to store best_lambda per column for new data processing
    box_lambdas = {}

    if box_cox_columns:
        for col in box_cox_columns:
            all_binary[col], box_lambda = stats.boxcox(all_binary[col])
            box_lambdas[col] = box_lambda

    # Yeo-johnson
    # Dictionary to store best_lambda per column for new data processing
    yeo_lambdas = {}

    if yeo_johnson_columns:
        for col in yeo_johnson_columns:
            all_binary[col], yeo_lambda = stats.yeojohnson(all_binary[col])
            yeo_lambdas[col] = yeo_lambda

    # Log transformations (taken from the un-scaled values in model_data)
    if log_transform_columns:
        for column in log_transform_columns:
            all_binary[column] = np.log1p(model_data[column])

    #Min max scaling
    # Dictionary to store min max scaler per column for new data processing
    min_max_scalers = {}

    if min_max_scaling:
        for col in min_max_scaling:
            minmax_scaler = MinMaxScaler()
            all_binary[col] = minmax_scaler.fit_transform(all_binary[[col]])
            min_max_scalers[col] = minmax_scaler

    # Cyclical encoding (only when 'dayofweek' was selected)
    if 'dayofweek' in all_binary.columns:
        all_binary['sin_day'] = np.sin(2 * np.pi * all_binary['dayofweek'] / 7)
        all_binary['cos_day'] = np.cos(2 * np.pi * all_binary['dayofweek'] / 7)
        all_binary = all_binary.drop(columns='dayofweek')

    # Converting SelfTransfer to a number with convert_bool_to_num (helper from utils).
    # Original intent: invert its importance so Non Self Transfer is seen as better by the model
    if 'SelfTransfer' in all_binary.columns:
        all_binary['SelfTransfer'] = all_binary['SelfTransfer'].apply(convert_bool_to_num)

    #STORING SCALERS
    # The whole {column: scaler} dictionary is stored, not just the last fitted scaler
    scalers = PreprocessScalers(o_encoder, d_encoder, box_lambdas, yeo_lambdas, min_max_scalers, *seg_encoders)

    #Adding y into dataset
    all_binary[target] = y

    # Returning dataframe and scalers
    return all_binary, scalers

In [172]:
# First 5000 rows, copied into an independent frame; this smaller sample is what
# gets passed to all_preprocessing further down.
smaller_raw_data = raw_data[:5000].copy()

In [173]:
# Feature columns handed to all_preprocessing as `columns_to_process`.
# Includes engineered columns (e.g. 'total_layover_time_ratio') that the function creates itself.
columns = ['Stops','DurationMin', 'total_layover_time_ratio', 'OriginApt', 'DestinationApt',
            'Total_Flight_Distance','extra_travel_distance_ratio', 'TravelHorizonDays', 'dayofweek',
            'TravelDistanceKm', 'PricePerPax', 'SelfTransfer']

# Candidate columns for the Box-Cox transform.
box_cox_columns = ['DurationMin', 'TravelDistanceKm', 'PricePerPax']

# Candidate columns for the Yeo-Johnson transform.
yeo_johnson_columns = ['total_layover_time_ratio', 'Total_Flight_Distance', 'extra_travel_distance_ratio']

# Columns to min-max scale.
min_max_scaling = ['TravelHorizonDays','Stops']

# Columns to log1p-transform (same columns as the Yeo-Johnson list).
# Note the variable name is spelled 'transorm'; the all_preprocessing call uses this exact name.
log_transorm_cols = ['total_layover_time_ratio', 'Total_Flight_Distance', 'extra_travel_distance_ratio']

In [174]:
# List the columns of the sample frame.
smaller_raw_data.columns

Index(['OriginApt', 'OriginCty', 'OriginCtry', 'DestinationApt',
       'DestinationCty', 'DestinationCtry', 'TravelHorizonDays',
       'TravelDistanceKm', 'SelfTransfer', 'Stops', 'DurationMin', 'dayofweek',
       'Seg_0_OperatingCarrierIATA', 'Seg_1_OperatingCarrierIATA',
       'Seg_2_OperatingCarrierIATA', 'Seg_3_OperatingCarrierIATA',
       'Total_Flight_Distance', 'Total_Flight_Duration', 'passengers',
       'PricePerPax', 'ItineraryRedirects', 'ODRedirects'],
      dtype='object')

In [175]:
# for column in smaller_raw_data.select_dtypes(include=['int64']).columns:
#     smaller_raw_data[column] = smaller_raw_data[column].astype('float64')

In [176]:
# Hand over a copy of `columns` so the list defined above stays exactly as written,
# whatever the function does with its list argument; re-running this cell then
# always starts from the same column selection.
df, scal = all_preprocessing(smaller_raw_data, list(columns), scale_itin_redirects, 'Score_Z_score_0_50',
                  min_max_scaling=min_max_scaling, log_transform_columns=log_transorm_cols,
                  od_encoding=True, operator_encoding=False,
                  target_func_param1='ItineraryRedirects', target_func_param2=0, target_func_param3=50)

In [177]:
# Display the preprocessed frame returned by all_preprocessing.
df

Unnamed: 0,Stops,DurationMin,total_layover_time_ratio,OriginApt,DestinationApt,Total_Flight_Distance,extra_travel_distance_ratio,TravelHorizonDays,TravelDistanceKm,PricePerPax,...,DestinationApt_2,DestinationApt_3,DestinationApt_4,DestinationApt_5,DestinationApt_6,DestinationApt_7,DestinationApt_8,sin_day,cos_day,Score_Z_score_0_50
0,0.000000,240.0,0.000000,TLV,PRG,7.877018,0.693147,0.744444,2635.0,183.202500,...,0,0,0,0,0,0,1,-0.974928,-0.222521,7.498985
1,0.000000,245.0,0.000000,TLV,PRG,7.877018,0.693147,0.744444,2635.0,162.930000,...,0,0,0,0,0,0,1,-0.974928,-0.222521,6.400767
2,0.000000,250.0,0.000000,TLV,PRG,7.877018,0.693147,0.744444,2635.0,136.458333,...,0,0,0,0,0,0,1,-0.433884,-0.900969,18.481165
3,0.000000,100.0,0.000000,TLV,RHO,6.678342,0.693147,0.755556,794.0,168.915000,...,0,0,0,0,0,1,0,-0.433884,-0.900969,19.484019
4,0.000000,100.0,0.000000,TLV,RHO,6.678342,0.693147,0.755556,794.0,172.500000,...,0,0,0,0,0,1,0,0.433884,-0.900969,6.133944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.333333,555.0,0.532217,TRN,AMS,7.034388,0.869747,0.166667,818.0,97.290000,...,0,0,0,1,1,1,0,0.781831,0.623490,21.427186
4996,0.000000,85.0,0.000000,TRN,BCN,6.439350,0.693147,0.755556,625.0,33.975000,...,0,1,0,1,0,0,1,-0.781831,0.623490,9.952808
4997,0.000000,90.0,0.000000,TRN,BCN,6.439350,0.693147,0.755556,625.0,37.065000,...,0,1,0,1,0,0,1,-0.433884,-0.900969,5.217688
4998,0.000000,90.0,0.000000,TRN,BCN,6.439350,0.693147,0.755556,625.0,50.275000,...,0,1,0,1,0,0,1,-0.974928,-0.222521,5.217688


In [81]:
def create_df_of_all_categories(raw_data, data_to_be_processed, column):
    """
    Appends one extra row per distinct value of `column` found in `raw_data`
    to the bottom of `data_to_be_processed`.

    The extra rows only carry a value in `column`; every other column is
    missing (NaN) for them. Row labels are not reset, so the labels of the
    extra rows (0, 1, 2, ...) may repeat labels already present.

    RETURNS: the concatenated dataframe (the inputs are left unchanged)
    """
    # One row per distinct category value
    distinct_values = raw_data[column].unique()
    category_rows = pd.DataFrame({column: distinct_values})

    # Existing rows first, category rows after them
    return pd.concat([data_to_be_processed, category_rows])


In [None]:
def

In [None]:
def process_new_data(original_data, new_data, PreprocessScalers, box_cox_columns, yeo_johnson_columns,
                     min_max_scaling=None, log_transform_columns=None):
    """
    Replays the transformations fitted by all_preprocessing on new data.

    ARGS:
    - original_data: unused; kept so existing calls keep working. The fitted encoders
      already hold their category mapping, so the training categories no longer need
      to be appended to the new data before transforming.
    - new_data: dataframe to transform. It is not modified.
      Assumes it already contains every column that is transformed below
      (including engineered ones) - TODO confirm against the caller.
    - PreprocessScalers: the scalers object returned by all_preprocessing
      (an instance; the parameter name is kept for backward compatibility).
    - box_cox_columns / yeo_johnson_columns: columns to transform with the stored lambdas.
    - min_max_scaling: columns to min-max scale. None (default) means every column that
      has a stored scaler. This replaces the previous reliance on a notebook-global list.
    - log_transform_columns: optional columns to log1p-transform, mirroring
      all_preprocessing (computed from the un-scaled values of new_data).

    RETURNS: a new dataframe holding new_data plus the encoded columns, with the
    scaling applied, 'dayofweek' replaced by 'sin_day' / 'cos_day' and
    'SelfTransfer' converted with convert_bool_to_num.

    Notes:
    - Rows with a missing 'Stops' value are no longer dropped; that drop only existed
      to remove the category rows that used to be appended before encoding.
    """
    scalers = PreprocessScalers

    def stored_value(params, col):
        # Lambdas are stored per column in a dict; a bare scalar is still accepted
        return params[col] if isinstance(params, dict) else params

    #BINARY ENCODING
    # Each encoder was fitted on a single column, so it is applied to that column only.
    # Encoders that were not fitted (None) are skipped.
    encoder_specs = [('OriginApt', 'o_encoder'), ('DestinationApt', 'd_encoder')]
    for segment in range(4):
        encoder_specs.append(('Seg_{}_OperatingCarrierIATA'.format(segment),
                              'seg_{}_encoder'.format(segment)))

    encoded_frames = []
    for column, attribute in encoder_specs:
        encoder = getattr(scalers, attribute, None)
        if encoder is not None:
            encoded_frames.append(encoder.transform(new_data[column]))

    # Concatenating newly encoded columns (pd.concat returns a new frame)
    all_binary = pd.concat([new_data] + encoded_frames, axis=1)

    #SCALING
    # Box cox, with the lambda fitted for each column
    for col in box_cox_columns or []:
        all_binary[col] = stats.boxcox(all_binary[col], lmbda=stored_value(scalers.box_lambda, col))

    # Yeo-johnson, with the lambda fitted for each column
    for col in yeo_johnson_columns or []:
        all_binary[col] = stats.yeojohnson(all_binary[col], lmbda=stored_value(scalers.yeo_lambda, col))

    # Log transformations (from the un-scaled values, as in all_preprocessing)
    for col in log_transform_columns or []:
        all_binary[col] = np.log1p(new_data[col])

    #Min max scaling
    minmax = scalers.minmax_scaler
    if isinstance(minmax, dict):
        # One fitted scaler per column
        scale_columns = list(minmax) if min_max_scaling is None else list(min_max_scaling or [])
        for col in scale_columns:
            all_binary[col] = minmax[col].transform(all_binary[[col]])
    elif minmax is not None:
        # A single scaler fitted on several columns at once
        if min_max_scaling is None:
            raise ValueError("min_max_scaling must list the columns the single min-max scaler was fitted on")
        scale_columns = list(min_max_scaling)
        all_binary[scale_columns] = minmax.transform(all_binary[scale_columns])

    # Cyclical encoding
    if 'dayofweek' in all_binary.columns:
        all_binary['sin_day'] = np.sin(2 * np.pi * all_binary['dayofweek'] / 7)
        all_binary['cos_day'] = np.cos(2 * np.pi * all_binary['dayofweek'] / 7)
        all_binary = all_binary.drop(columns=['dayofweek'])

    # Same SelfTransfer conversion as in training
    if 'SelfTransfer' in all_binary.columns:
        all_binary['SelfTransfer'] = all_binary['SelfTransfer'].apply(convert_bool_to_num)

    return all_binary