# Create features

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from math import ceil

## Load data

In [2]:
# Load the search/fare table from parquet; the cell's last expression displays its (rows, columns) shape.
data = pd.read_parquet("data/raw_data/search_fare_largest_subsequent.parquet")
data.shape

(33848879, 23)

In [None]:
# Load the flight table from parquet; the cell's last expression displays its (rows, columns) shape.
flight = pd.read_parquet("data/raw_data/flight_largest_subsequent.parquet")
flight.shape

## Descriptive analysis

### Basic information

In [None]:
# Column names of the search/fare table.
data.columns

In [None]:
# Preview the first two rows of the search/fare table.
data.head(2)

In [None]:
# Column names of the flight table.
flight.columns

In [None]:
# Preview the first two rows of the flight table.
flight.head(2)

### Check if flight information changes over time
Most of the time, flight information doesn't change over time, but occasionally it does.

In [None]:
# Columns left out of the duplicate check; every other flight column is compared.
# NOTE(review): presumably these are per-search identifiers/timestamps that differ on every row — confirm.
unique_columns = ["searchId", "legId", "insertionTime", "searchIdSearchTable"]
colunas = [col for col in flight.columns if col not in unique_columns ]
df = flight[colunas]
# Shape before de-duplication (display() is provided by the IPython/Jupyter runtime).
display(df.shape)
df = df.drop_duplicates()
# Shape after de-duplication: the number of distinct combinations of the remaining columns.
df.shape

In [None]:
# Number of distinct legId values in the flight table.
flight["legId"].nunique()

In [None]:
# Remove the temporary de-duplicated frame from the notebook namespace.
del df

In [None]:
# Distinct-value count for every column of the flight table.
flight.nunique()

## Delete unnecessary columns

### Fare and search table

In [None]:
# Distinct-value count per column of the search/fare table; reused by the following cells
# to decide which columns to drop or inspect.
nunique_data = data.nunique()
nunique_data

In [None]:
# Start the drop list with "searchIdSearchTable", then add every column whose
# distinct-value count is exactly 1 (such a column carries no information).
columns_to_delete = [ "searchIdSearchTable" ]
columns_to_delete += list(nunique_data[nunique_data == 1].index)

In [None]:
# Low-cardinality columns: distinct-value count different from 1 and at most 10.
columns_to_check_unique_values = nunique_data[(nunique_data != 1) & (nunique_data <= 10)].index
# Print each column name, its distinct values, and a separator line for manual inspection.
for column in columns_to_check_unique_values:
    print(column)
    display(data[column].unique())
    print("-"*100)
    
# NOTE(review): presumably chosen after inspecting the output above — confirm.
columns_to_delete += ["hasSeatMap", "providerCode"]

In [None]:
# Remove duplicate names from the drop list (going through a set does not preserve order),
# then display the list together with its length.
columns_to_delete = list(set(columns_to_delete))
columns_to_delete, len(columns_to_delete)

In [None]:
# Drop the selected columns in place and display the resulting shape.
data.drop(columns=columns_to_delete, inplace=True)
data.shape

In [None]:
# Preview the first two rows after the column drop.
data.head(2)

## Splitting the data into training, validation, and test sets

The split is chronological, over the distinct `operationalSearchTime` values: <br>
Train fraction: 0.65 <br>
Validation fraction: 0.20 <br>
Test fraction: 0.15 (the remainder) <br>

In [3]:
# Earliest search timestamp, latest search timestamp, and number of distinct search timestamps.
data.operationalSearchTime.min(),\
data.operationalSearchTime.max(),\
data.operationalSearchTime.nunique()

(Timestamp('2023-06-04 23:00:00'), Timestamp('2023-06-11 00:00:00'), 146)

In [4]:
# Store legId as a categorical column.
# NOTE(review): presumably to reduce memory and speed up later group-bys — confirm.
data["legId"] = data["legId"].astype('category')

# Order rows chronologically; unique() returns values in order of appearance, so after the
# sort the array below holds the distinct search timestamps in ascending order.
data.sort_values("operationalSearchTime", inplace=True, ignore_index=True)
operationalSearchTime = data.operationalSearchTime.unique()

# Split points are positions in the array of distinct timestamps, not row counts:
# the first 65% (rounded up) go to train, the next 20% (rounded up) to validation,
# and whatever remains goes to test.
end_train = ceil(len(operationalSearchTime) * 0.65)
end_validation = end_train + ceil(len(operationalSearchTime) * 0.2)

# Boolean row masks: a row belongs to the split whose timestamp slice contains its search time.
train_mask = data.operationalSearchTime.isin(operationalSearchTime[:end_train])
validation_mask = data.operationalSearchTime.isin(operationalSearchTime[end_train:end_validation])
test_mask = data.operationalSearchTime.isin(operationalSearchTime[end_validation:])

train = data.loc[train_mask]
validation = data.loc[validation_mask]
test = data.loc[test_mask]

# Remove the full table from the namespace now that the three splits exist.
del data

## Create features

### Version 1

In [None]:
# Columns kept for feature-set version 1.
columns_version_1 = ["legId", "operationalSearchTime", "totalFare", "originCode", "destinationCode", "flightDay"]

# Keep only the version-1 columns and the first 1,000,000 rows of each split.
# NOTE(review): the row cap looks like a development sample — confirm before a full run.
# .copy() detaches each sample from its parent frame, so adding feature columns to it
# later is an ordinary in-place assignment and does not raise SettingWithCopyWarning.
train = train[columns_version_1].head(1_000_000).copy()
validation = validation[columns_version_1].head(1_000_000).copy()
test = test[columns_version_1].head(1_000_000).copy()

#### Days until flight

In [5]:
def compute_days_util_flight(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Add a ``daysUntilFlight`` column: whole days from the search date to the flight date.

    The time-of-day part of ``operationalSearchTime`` is discarded, so a search made
    at 23:00 on the day before the flight counts as 1 day, not 0.

    Args:
        dataframe: Frame with a ``flightDay`` column (dates, timestamps or anything
            ``pd.to_datetime`` can parse) and a datetime ``operationalSearchTime``
            column. It is modified in place.

    Returns:
        The same ``dataframe`` object, with the integer ``daysUntilFlight`` column added.

    Raises:
        ValueError: If either date is missing, since a missing value cannot be cast to int.
    """
    # Work on the function argument (not on a notebook-level global) and bring both
    # sides to midnight-normalised datetime64 so the subtraction is vectorised and
    # yields a timedelta64 column.
    flight_day = pd.to_datetime(dataframe["flightDay"]).dt.normalize()
    search_day = dataframe["operationalSearchTime"].dt.normalize()

    dataframe["daysUntilFlight"] = (flight_day - search_day).dt.days.astype(int)
    return dataframe


In [None]:
# columns_version_1 = ["legId", "operationalSearchTime", "totalFare", "originCode", "destinationCode", "flightDay"]
# data_version1 = data[columns_version_1].head(1_000_000).copy()
# data_version1["legId"] = data_version1["legId"].astype('category')
# data_version1.groupby("legId")[["totalFare"]].shift(1)

In [None]:
# lista = ["2ba63db834c6d18b6f25a716436fe1b9", "3eea1b8aecf1baef902bece02ea0b6b1", "2b27d6480302c8de9013e721eb203511"]
# a = data_version1[data_version1.legId.isin(lista)].copy()
# a = a.iloc[[1,2,0,3,4,5,6]]
# a

#### Lag features

In [6]:
# Beware of data leakage: a lag feature must only use fares from rows that come earlier in time.
def compute_lag_features(dataframe: pd.DataFrame, shift_list=None) -> pd.DataFrame:
    """Add per-leg lagged copies of ``totalFare`` as ``totalFareShift<k>`` columns.

    Lags are positional: ``totalFareShift<k>`` holds the fare found ``k`` rows earlier
    within the same ``legId`` group. The rows of each leg must therefore already be in
    chronological order, otherwise the "lag" can expose a later fare (data leakage).

    Args:
        dataframe: Frame with ``legId`` and ``totalFare`` columns. It is modified in place.
        shift_list: Iterable of row offsets to lag by. Defaults to
            ``[1, 12, 24, ..., 132]``.

    Returns:
        The same ``dataframe`` object, with one new column per requested lag. Rows
        that have fewer than ``k`` earlier rows in their leg get NaN.
    """
    if shift_list is None:
        shift_list = [1] + [12 * i for i in range(1, 12)]

    # Build the group-by once instead of once per lag. observed=True has no effect on
    # the result of a transform such as shift; it only avoids the categorical-grouper
    # FutureWarning when legId is stored as a category.
    fare_by_leg = dataframe.groupby("legId", observed=True)["totalFare"]

    for shift_value in shift_list:
        dataframe[f"totalFareShift{shift_value}"] = fare_by_leg.shift(shift_value)
    return dataframe

#### Diff features

In [7]:
def compute_diff_features(dataframe: pd.DataFrame, diff_list=None) -> pd.DataFrame:
    """Add per-leg differences of ``totalFare`` as ``totalFareDiff<k>`` columns.

    ``totalFareDiff<k>`` is the current fare minus the fare found ``k`` rows earlier
    within the same ``legId`` group. The offsets are positional, so the rows of each
    leg must already be in chronological order.

    Args:
        dataframe: Frame with ``legId`` and ``totalFare`` columns. It is modified in place.
        diff_list: Iterable of row offsets to difference over. Defaults to
            ``[1, 12, 24, ..., 132]``.

    Returns:
        The same ``dataframe`` object, with one new column per requested offset. Rows
        that have fewer than ``k`` earlier rows in their leg get NaN.
    """
    if diff_list is None:
        diff_list = [1] + [12 * i for i in range(1, 12)]

    # Build the group-by once instead of once per offset. observed=True has no effect
    # on the result of a transform such as diff; it only avoids the categorical-grouper
    # FutureWarning when legId is stored as a category.
    fare_by_leg = dataframe.groupby("legId", observed=True)["totalFare"]

    for diff_value in diff_list:
        dataframe[f"totalFareDiff{diff_value}"] = fare_by_leg.diff(diff_value)
    return dataframe

#### Percentage change features

#### Rolling features

#### Cumulative features

#### Save dataframes

In [None]:
import os

# Output directory for the version-1 time-series splits.
path_time_series_version_1 = "data/train_validade_teste/time_searies/version_1/"

# to_parquet writes to the given file path and fails if the parent directory is
# missing, so create it first (a no-op when it already exists).
os.makedirs(path_time_series_version_1, exist_ok=True)

train.to_parquet(path_time_series_version_1 + "train.parquet")
validation.to_parquet(path_time_series_version_1 + "validation.parquet")
test.to_parquet(path_time_series_version_1 + "test.parquet")