In [None]:
# Install libraries if necessary
!pip install lightgbm
!pip install matplotlib
!pip install numpy
!pip install pandas
!pip install IPython

In [None]:
!pip install graphviz
!pip install pygraphviz

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, Image

In [None]:
import graphviz
import pygraphviz

In [None]:
# Choose where df_flights.csv is read from; exactly one `load_file` should be active.
# Work GitHub -- this URL carries an access token as a query parameter. The token is a
# credential and must not be stored in the notebook; paste a fresh one in locally if needed.
# load_file = 'https://raw.github.expedia.biz/landrejek/flight_ranking_demo/main/df_flights.csv?token=<REDACTED>'
# Personal GitHub
# load_file = 'https://raw.githubusercontent.com/LukeAndrejek/flight_ranking_demo/main/df_flights.csv'
# Current folder
load_file = 'df_flights.csv'

# Read the CSV and store `carrier` as a pandas categorical in the same step, so
# `df_flights` is built once and never patched afterwards.
df_flights = pd.read_csv(load_file).astype({'carrier': 'category'})
df_flights.tail()

In [None]:
# Show every offer that one example user actually booked, ordered by day.
example_user_id = 336
is_example_user = df_flights['user_id'].eq(example_user_id)
was_booked = df_flights['booked'].eq(1)
df_flights.loc[is_example_user & was_booked].sort_values(by=['day'])

In [None]:
# Prepare training data
model_features = ['price', 'duration', 'layover_hours', 'legs', 'carrier', 'departure_hour', 'arrival_hour', 'recent_bookings_with_carrier']

# Only rows with day > 7 are used for training.
df_train = df_flights[df_flights['day'] > 7]
X_train = df_train[model_features]
y_train = df_train['booked']

# The group sizes below are listed in order of first appearance of each query_id and are
# consumed positionally by the ranker, so every query's rows must form one contiguous run.
# Count the runs of equal consecutive query_ids and compare with the number of distinct ids.
query_ids_train = df_train['query_id']
n_query_runs = int((query_ids_train != query_ids_train.shift()).sum())
assert n_query_runs == query_ids_train.nunique(), (
    'rows belonging to the same query_id are not contiguous; group sizes would not line up with the rows'
)

# Number of rows per query, in row order (sort=False keeps first-appearance order).
group_sizes_train = df_train.groupby('query_id', sort=False)['query_id'].count().to_numpy()
assert group_sizes_train.sum() == len(X_train), 'group sizes do not cover every training row'

In [None]:
# Build model
# Hyper-parameters for the gradient-boosted LambdaRank ranker, kept together in one place.
ranker_params = dict(
    boosting_type='gbdt',
    objective='lambdarank',
    num_leaves=10,
    seed=42,
)

# `model` is bound to whatever .fit(...) returns; the cells below call .predict on it.
model = lgb.LGBMRanker(**ranker_params).fit(
    X=X_train,
    y=y_train,
    group=group_sizes_train,
)

In [None]:
# Append model score to dataset
# Score every row of the full dataset (not just the training slice).
scores = model.predict(df_flights[model_features])
# .assign returns a new frame, so df_flights itself is left without a 'score' column.
df_flights_ranked = df_flights.assign(score=scores)

In [None]:
# Show the offers of one example search (query_id 1000), best model score first.
is_example_query = df_flights_ranked['query_id'].eq(1000)
df_flights_ranked.loc[is_example_query].sort_values(by='score', ascending=False)

In [None]:
# Plot feature importances using importance_type='split'.
# An explicit title distinguishes this chart from the 'gain' chart in the next cell;
# the trailing semicolon keeps the returned Axes repr out of the cell output.
lgb.plot_importance(model, importance_type='split', title='Feature importance (split)');

In [None]:
# Plot feature importances using importance_type='gain'.
# An explicit title distinguishes this chart from the 'split' chart in the previous cell;
# the trailing semicolon keeps the returned Axes repr out of the cell output.
lgb.plot_importance(model, importance_type='gain', title='Feature importance (gain)');

In [None]:
# Plot impact of a particular feature on model score:
# take one training offer, sweep a single numeric feature over a range of values while
# holding every other feature fixed, and plot the model score against the swept value.
plot_offer_row_index = 0
# Recall model_features = ['price', 'duration', 'layover_hours', 'legs', 'carrier', 'departure_hour', 'arrival_hour', 'recent_bookings_with_carrier']
plot_feature_index = 0

# The feature name is taken from model_features (the columns X_train was built from), so the
# column overridden below is exactly a column of the offer row. model_features is only read
# here, never rebound.
plot_feature = model_features[plot_feature_index]
assert pd.api.types.is_numeric_dtype(X_train[plot_feature]), 'This code only plots numeric features'

# Prepare feature data
plot_offer_row = X_train.iloc[[plot_offer_row_index]]
# Upper end of the sweep: 1 for feature indices 0, 1 and 3, otherwise 24.
# NOTE(review): presumably indices 0/1/3 hold values scaled to [0, 1] and the rest are
# hour-like quantities -- confirm against the data.
max_x_value = 1 if plot_feature_index in (0, 1, 3) else 24
domain = np.linspace(0, max_x_value, 20)

# One copy of the chosen offer per x value, differing only in the swept feature, scored
# with a single predict call instead of one call per point.
sweep_rows = (
    X_train.iloc[[plot_offer_row_index] * len(domain)]
    .reset_index(drop=True)
    .assign(**{plot_feature: domain})
)
sweep_scores = model.predict(sweep_rows)

# Plot result
fig, ax = plt.subplots()
ax.plot(domain, sweep_scores)
ax.set_xlabel(plot_feature)
ax.set_ylabel('model score')
ax.set_title(f'Model score vs. {plot_feature}')
ax.grid(True)

In [None]:
# Render one tree of the fitted model to a PNG file.
tree_index = 0  # Set value to 8 to view a tree featuring recent_bookings_with_carrier

# Build the graph object for the selected tree.
model_digraph = lgb.create_tree_digraph(booster=model, tree_index=tree_index)
# Ask for PNG output, then render to files named after 'model_digraph';
# render() hands back the path of the rendered file.
model_digraph.format = 'png'
png_path = model_digraph.render('model_digraph')
# Inline display is currently switched off; uncomment to show the PNG in the notebook.
# display(Image(png_path))