# Create features

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

from math import ceil
from pathlib import Path

import pandas as pd

## Load data

In [2]:
# Read the fare/search table from parquet and show its (rows, columns) shape.
data = pd.read_parquet("search_fare_largest_subsequent.parquet")
data.shape

(33848879, 23)

In [3]:
# Read the flight table from parquet and show its (rows, columns) shape.
flight = pd.read_parquet("flight_largest_subsequent.parquet")
flight.shape

(33848879, 23)

## Descriptive analysis

### Basic information

In [4]:
# List the column names of the fare/search table.
data.columns

Index(['searchId', 'legId', 'fareBasisCode', 'isBasicEconomy', 'isRefundable',
       'isFreeChangeAvailable', 'taxes', 'fees', 'showFees', 'currency',
       'baseFare', 'totalFare', 'numberOfTickets', 'freeCancellationBy',
       'hasSeatMap', 'providerCode', 'seatsRemaining', 'searchIdSearchTable',
       'searchTime', 'operationalSearchTime', 'flightDay', 'originCode',
       'destinationCode'],
      dtype='object')

In [5]:
# Preview the first two rows of the fare/search table.
data.head(2)

Unnamed: 0,searchId,legId,fareBasisCode,isBasicEconomy,isRefundable,isFreeChangeAvailable,taxes,fees,showFees,currency,...,freeCancellationBy,hasSeatMap,providerCode,seatsRemaining,searchIdSearchTable,searchTime,operationalSearchTime,flightDay,originCode,destinationCode
0,67435507,424cd2c79592022c6290a4dbf4bb7072,YJ0U0G1,False,False,False,40.6,0.0,True,USD,...,2023-06-12 23:59:00,False||False,Amadeus||Amadeus,7,67435507,2023-06-09 01:21:47.870182,2023-06-09 01:00:00,2023-07-11,GIG,BSB
1,67316111,08978a96520c5df3936c6bc02e4ce991,ANAQAG2G,False,False,False,6.0,0.0,True,USD,...,2023-06-12 23:59:00,False||False,Amadeus||Amadeus,4,67316111,2023-06-09 01:00:16.159655,2023-06-09 01:00:00,2023-06-10,GRU,POA


In [6]:
# List the column names of the flight table.
flight.columns

Index(['searchId', 'legId', 'travelDuration', 'duration', 'durationInSeconds',
       'elapsedDays', 'isNonStop', 'departureTimeRaw',
       'departureTimeZoneOffsetSeconds', 'arrivalTimeRaw',
       'arrivalTimeZoneOffsetSeconds', 'flightNumber', 'stops', 'airlineCode',
       'equipmentCode', 'arrivalAirportLatitude', 'arrivalAirportLongitude',
       'departureAirportLatitude', 'departureAirportLongitude',
       'arrivalAirportCode', 'departureAirportCode', 'insertionTime',
       'searchIdSearchTable'],
      dtype='object')

In [7]:
# Preview the first two rows of the flight table.
flight.head(2)

Unnamed: 0,searchId,legId,travelDuration,duration,durationInSeconds,elapsedDays,isNonStop,departureTimeRaw,departureTimeZoneOffsetSeconds,arrivalTimeRaw,...,airlineCode,equipmentCode,arrivalAirportLatitude,arrivalAirportLongitude,departureAirportLatitude,departureAirportLongitude,arrivalAirportCode,departureAirportCode,insertionTime,searchIdSearchTable
0,67315868,7f6dcd03b4d8e7a35fec9ea63e7ca7a4,PT19H55M,PT1H15M||PT2H10M||PT2H55M,4500||7800||10500,0||0||0,False,2023-06-10T16:20:00.000-03:00||2023-06-11T05:5...,-10800||-10800||-10800,2023-06-10T17:35:00.000-03:00||2023-06-11T08:0...,...,G3||G3||G3,738||738||7M8,-27.664536||-15.869962||-3.03173,-48.545072||-47.921743||-60.046094,-23.626789||-27.664536||-15.869962,-46.659618||-48.545072||-47.921743,FLN||BSB||MAO,CGH||FLN||BSB,2023-06-20 17:47:55.712789,67315868
1,59193725,d648e5b25c6533af9ead0fe2635f3000,PT11H20M,PT2H||PT1H10M,7200||4200,0||0,False,2023-06-26T08:30:00.000-03:00||2023-06-26T18:4...,-10800||-10800,2023-06-26T10:30:00.000-03:00||2023-06-26T19:5...,...,G3||G3,738||7M8,-25.536199||-23.425717,-49.173991||-46.481788,-15.869962||-25.536199,-47.921743||-49.173991,CWB||GRU,BSB||CWB,2023-06-20 13:06:44.540302,59193725


### Check if flight information changes over time
The flight information for a given leg is usually the same across searches, but it does occasionally change.

In [24]:
# Columns that identify a row or record when it was stored; they are left out
# so that only the descriptive flight attributes are compared for duplicates.
unique_columns = ["searchId", "legId", "insertionTime", "searchIdSearchTable"]
colunas = flight.columns[~flight.columns.isin(unique_columns)].tolist()

# Shape before de-duplication ...
df = flight[colunas]
display(df.shape)

# ... and after: the number of distinct attribute combinations.
df = df.drop_duplicates()
df.shape

(33848879, 19)

(433814, 19)

In [26]:
# Count the distinct legId values in the flight table. If the de-duplicated
# attribute rows outnumber the legs, some legs appear with more than one set
# of attributes.
flight["legId"].nunique()

396100

In [27]:
# Drop the reference to the temporary de-duplicated frame so its memory can be reclaimed.
del df

In [28]:
# Distinct-value count for every column of the flight table.
flight.nunique()

searchId                          33848879
legId                               396100
travelDuration                         500
duration                              3083
durationInSeconds                     3083
elapsedDays                             17
isNonStop                                2
departureTimeRaw                    339355
departureTimeZoneOffsetSeconds          20
arrivalTimeRaw                      341030
arrivalTimeZoneOffsetSeconds            20
flightNumber                         32320
stops                                    4
airlineCode                              8
equipmentCode                          138
arrivalAirportLatitude                 666
arrivalAirportLongitude                666
departureAirportLatitude               669
departureAirportLongitude              669
arrivalAirportCode                     666
departureAirportCode                   669
insertionTime                         1696
searchIdSearchTable               33848879
dtype: int64

## Delete unnecessary columns

### Fare and search table

In [8]:
# Distinct-value count per column of the fare/search table; the following
# cells use it to pick out columns that never vary.
nunique_data = data.nunique()
nunique_data

searchId                 33848879
legId                      396100
fareBasisCode                 848
isBasicEconomy                  1
isRefundable                    1
isFreeChangeAvailable           1
taxes                         161
fees                            1
showFees                        1
currency                        1
baseFare                     1229
totalFare                    9143
numberOfTickets                 1
freeCancellationBy              5
hasSeatMap                      4
providerCode                    5
seatsRemaining                  8
searchIdSearchTable      33848879
searchTime                 590858
operationalSearchTime         146
flightDay                      67
originCode                      9
destinationCode                 9
dtype: int64

In [9]:
# Columns holding a single distinct value carry no information.
constant_columns = nunique_data[nunique_data == 1].index.tolist()

# Start the drop list with the duplicated search id, then add every constant column.
columns_to_delete = ["searchIdSearchTable"] + constant_columns

In [10]:
# Look at the actual values of the low-cardinality columns (not exactly one,
# and at most ten, distinct values) to decide whether they are worth keeping.
is_low_cardinality = (nunique_data != 1) & (nunique_data <= 10)
columns_to_check_unique_values = nunique_data[is_low_cardinality].index
separator = "-" * 100
for column in columns_to_check_unique_values:
    print(column)
    display(data[column].unique())
    print(separator)

# Per the values printed by this cell, hasSeatMap only ever contains False and
# providerCode only ever contains Amadeus (or is missing), repeated once per
# segment, so both columns are dropped as well.
columns_to_delete.extend(["hasSeatMap", "providerCode"])

freeCancellationBy


array(['2023-06-12T23:59:00.000000000', '2023-06-08T23:59:00.000000000',
       '2023-06-09T23:59:00.000000000', '2023-06-06T23:59:00.000000000',
       '2023-06-07T23:59:00.000000000'], dtype='datetime64[ns]')

----------------------------------------------------------------------------------------------------
hasSeatMap


array(['False||False', 'False||False||False', 'False',
       'False||False||False||False'], dtype=object)

----------------------------------------------------------------------------------------------------
providerCode


array(['Amadeus||Amadeus', 'Amadeus||Amadeus||Amadeus', 'Amadeus',
       'Amadeus||', 'Amadeus||Amadeus||', None], dtype=object)

----------------------------------------------------------------------------------------------------
seatsRemaining


array([7, 4, 5, 3, 6, 1, 2, 9])

----------------------------------------------------------------------------------------------------
originCode


array(['GIG', 'GRU', 'CGH', 'POA', 'BSB', 'MAO', 'SSA', 'SDU', 'CNF'],
      dtype=object)

----------------------------------------------------------------------------------------------------
destinationCode


array(['BSB', 'POA', 'MAO', 'GIG', 'CNF', 'SDU', 'GRU', 'SSA', 'CGH'],
      dtype=object)

----------------------------------------------------------------------------------------------------


In [11]:
# Remove duplicate entries (the cell that appends to this list adds them again
# every time it is re-run) while keeping first-seen order. Going through set()
# produced an arbitrary order that could differ between kernel sessions.
columns_to_delete = list(dict.fromkeys(columns_to_delete))
columns_to_delete, len(columns_to_delete)

(['searchIdSearchTable',
  'showFees',
  'isBasicEconomy',
  'currency',
  'hasSeatMap',
  'numberOfTickets',
  'isRefundable',
  'isFreeChangeAvailable',
  'providerCode',
  'fees'],
 10)

In [12]:
# Drop the uninformative columns. Rebinding (instead of inplace=True) and
# ignoring columns that are already gone keeps this cell safe to re-run:
# the in-place version raised a KeyError on a second execution.
data = data.drop(columns=columns_to_delete, errors="ignore")
data.shape

(33848879, 13)

In [22]:
# Preview the fare/search table after the unneeded columns were dropped.
data.head(2)

Unnamed: 0,searchId,legId,fareBasisCode,taxes,baseFare,totalFare,freeCancellationBy,seatsRemaining,searchTime,operationalSearchTime,flightDay,originCode,destinationCode
0,67435507,424cd2c79592022c6290a4dbf4bb7072,YJ0U0G1,40.6,1399.0,1439.6,2023-06-12 23:59:00,7,2023-06-09 01:21:47.870182,2023-06-09 01:00:00,2023-07-11,GIG,BSB
1,67316111,08978a96520c5df3936c6bc02e4ce991,ANAQAG2G,6.0,600.0,606.0,2023-06-12 23:59:00,4,2023-06-09 01:00:16.159655,2023-06-09 01:00:00,2023-06-10,GRU,POA


## Create features

## Split into training, validation and test data

### Time series

#### Version 1 

Train percentage 0.65 <br>
Validation percentage 0.20 <br>
Test percentage 0.15 <br>

In [14]:
# Keep only what the time-series split needs: the leg identifier, the search
# time bucket and the total fare.
time_series_columns = ["legId", "operationalSearchTime", "totalFare"]

# Sort chronologically with a non-mutating call. Sorting a column subset of
# `data` with inplace=True can trigger pandas' SettingWithCopyWarning; the
# chained form yields the same sorted frame with a fresh 0..n-1 index.
time_series_data = (
    data[time_series_columns]
    .sort_values("operationalSearchTime", ignore_index=True)
)

In [18]:
# Earliest and latest search time, and how many distinct search times exist.
search_times = time_series_data["operationalSearchTime"]
search_times.min(), search_times.max(), search_times.nunique()

(Timestamp('2023-06-04 23:00:00'), Timestamp('2023-06-11 00:00:00'), 146)

In [19]:
# Distinct search times; the split cells slice this array by position.
# NOTE(review): positional slices are only chronological if this array is
# ordered, which relies on time_series_data being sorted by operationalSearchTime.
operationalSearchTime = time_series_data["operationalSearchTime"].unique()

In [34]:
# Positions in the array of distinct search times where the train and
# validation windows end (65% train, then 20% validation, remainder is test).
n_search_times = len(operationalSearchTime)
end_train = ceil(n_search_times * 0.65)
end_validation = end_train + ceil(n_search_times * 0.2)

In [34]:
# Boolean row masks: a row belongs to a split when its search time falls in
# that split's slice of the distinct search times.
search_time_column = time_series_data["operationalSearchTime"]
train_mask = search_time_column.isin(operationalSearchTime[:end_train])
validation_mask = search_time_column.isin(operationalSearchTime[end_train:end_validation])
test_mask = search_time_column.isin(operationalSearchTime[end_validation:])

In [37]:
# Materialise the three splits by applying each boolean mask to the sorted frame.
train, validation, test = (
    time_series_data[mask]
    for mask in (train_mask, validation_mask, test_mask)
)

In [39]:
# Relative output folder for the version 1 time-series split files.
# NOTE(review): "time_searies" looks like a typo for "time_series"; left as-is
# because changing the string changes where the files are written — confirm.
path_time_series_version_1 = "data/train_validade_teste/time_searies/version_1/"

In [40]:
# Create the output folder first: to_parquet does not create missing
# directories, so on a fresh checkout the writes would otherwise fail.
output_dir = Path(path_time_series_version_1)
output_dir.mkdir(parents=True, exist_ok=True)

# Persist each split next to the others under the version folder.
train.to_parquet(output_dir / "train.parquet")
validation.to_parquet(output_dir / "validation.parquet")
test.to_parquet(output_dir / "test.parquet")