In [25]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.model_selection import train_test_split

In [26]:
# Load users with searches and products
df = pd.read_csv("./dataset/train_LFM.csv")
train, test = train_test_split(df, test_size=0.2)
train = train[:3000]
test = test[:3000]
#test = pd.read_csv("./dataset/test_LFM.csv")

# Warning, not using relevant information that is relevant only wrt search AND property together such as the distance user-property, or better price than competitor...
# Not using: position, price_usd, orig_destination_distance, random_bool, booking_bool, comp_inv, comp_rate_percent_diff, interaction
user_src_feats = [
    "site_id", "visitor_location_country_id", "srch_destination_id", 
    "srch_length_of_stay", "srch_adults_count", "srch_children_count", "srch_room_count", 
    "srch_saturday_night_bool", #"srch_query_affinity_score"
    ]
prop_feats = ["prop_country_id", "prop_starrating", "prop_review_score", 
              "prop_brand_bool", "prop_location_score1", "prop_location_score2", 
              "prop_log_historical_price", "price_usd", "promotion_flag"
              ]

In [28]:
def get_prop_features(df):
    train_prop_features = []
    train_prop_ids = df["prop_id"].unique()

    for idx, prop_id in enumerate(train_prop_ids):
        if idx%5000 == 0:
            print("Prop percentage: "+str(100*idx/len(train_prop_ids))+"%")

        current_prop = df[df["prop_id"] == prop_id]
        features = {
            "prop_country_id" : current_prop["prop_country_id"].mean(),
            "prop_starrating" : current_prop["prop_starrating"].mean(),
            "prop_review_score" : current_prop["prop_review_score"].mean() if pd.notnull(current_prop["prop_review_score"].mean()) else train["prop_review_score"].mean(),
            "prop_brand_bool" : current_prop["prop_brand_bool"].mean(), 
            "prop_location_score1" : current_prop["prop_location_score1"].mean(), 
            "prop_location_score2" : current_prop["prop_location_score2"].mean() if pd.notnull(current_prop["prop_location_score2"].mean()) else train["prop_location_score2"].mean(),
            "prop_log_historical_price" : current_prop["prop_log_historical_price"].mean(), 
            "price_usd" : current_prop["price_usd"].mean(), 
            "promotion_flag" : current_prop["promotion_flag"].mean()
        }
        train_prop_features.append((prop_id, features))
    return train_prop_features
    
def get_srch_features(df):
    train_src_features = []
    train_src_ids = df["srch_id"].unique()

    for idx, src_id in enumerate(train_src_ids):
        if idx%5000 == 0:
            print("Srch percentage: "+str(100*idx/len(train_src_ids))+"%")

        current_src = df[df["srch_id"] == src_id]
        features = {
            "visitor_location_country_id" : current_src["visitor_location_country_id"].iloc[0], 
            "srch_destination_id" : current_src["srch_destination_id"].iloc[0],
            "srch_length_of_stay" : current_src["srch_length_of_stay"].iloc[0], 
            "srch_adults_count" : current_src["srch_adults_count"].iloc[0], 
            "srch_children_count" : current_src["srch_children_count"].iloc[0],
            "srch_room_count" : current_src["srch_room_count"].iloc[0], 
            "srch_saturday_night_bool" : current_src["srch_saturday_night_bool"].iloc[0], 
            #"srch_query_affinity_score" : current_src["srch_query_affinity_score"].iloc[0] if pd.notnull(current_src["srch_query_affinity_score"].iloc[0]) else current_src["srch_query_affinity_score"].mean()
        }
        train_src_features.append((src_id, features))
    return train_src_features

# Of course cannot be called since computes the available interaction, which is what we have to predict in the test dataset
def get_interactions(df):
    interaction_list = []
    interaction_df = df.groupby(["srch_id", "prop_id"])["interaction"].first().reset_index()
    for tuple in interaction_df.itertuples():
        interaction_list.append((tuple.srch_id, tuple.prop_id, tuple.interaction))
    return interaction_list


In [29]:
# Pack train set for training
train_prop_features = get_prop_features(train)
print("Prop features computed")
train_srch_features = get_srch_features(train)
print("Srch features computed")
interaction_list = get_interactions(train)
print("Interactions list computed")

Prop percentage: 0.0%
Prop features computed
Srch percentage: 0.0%
Srch features computed
Interactions list computed


In [30]:
# Packing the test set for evaluation
test_prop_features = get_prop_features(test)
print("Prop features computed")
test_srch_features = get_srch_features(test)
print("Srch features computed")

Prop percentage: 0.0%
Prop features computed
Srch percentage: 0.0%
Srch features computed


In [36]:
# Initialize the dataset
dataset = Dataset()

# Fit the dataset with user and item ids and features
dataset.fit(
    users=df["srch_id"].unique(),
    items=df["prop_id"].unique(),
    user_features=[
        "visitor_location_country_id",
        "srch_destination_id",
        "srch_length_of_stay",
        "srch_adults_count",
        "srch_children_count",
        "srch_room_count",
        "srch_saturday_night_bool",
        #"srch_query_affinity_score",
    ],
    item_features=[
        "prop_country_id",
        "prop_starrating",
        "prop_review_score",
        "prop_brand_bool",
        "prop_location_score1",
        "prop_location_score2",
        "prop_log_historical_price",
        "price_usd",
        "promotion_flag",
    ],
)

(interactions, interactions_weights) = dataset.build_interactions(interaction_list)
train_src_feat_matrix = dataset.build_user_features(train_srch_features)
train_prop_feat_matrix = dataset.build_item_features(train_prop_features)


In [32]:
# Initialize and train the model
latent_feats = 50
training_epochs = 30

model = LightFM(loss='warp', no_components=latent_feats)
model.fit(interactions, user_features=train_src_feat_matrix, item_features=train_prop_feat_matrix, epochs=training_epochs, num_threads=8, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


<lightfm.lightfm.LightFM at 0x35266a3d0>

In [43]:
# Now to the prediction

# Ensure user_ids and item_ids for the test set are available
test_srch_ids = test["srch_id"].unique()
test_prop_ids = test["prop_id"].unique()

# test_src_feat_matrix = dataset.build_user_features(test_srch_features)
# test_prop_feat_matrix = dataset.build_item_features(test_prop_features)
# interactions, interaction_weights = dataset.build_interactions([])

# model.fit_partial(interactions, user_features=test_src_feat_matrix, item_features=test_prop_feat_matrix, epochs=training_epochs)

srch_to_predict = []
prop_to_predict = []
prediction_srch_features = []
prediction_prop_features = []

# Get all the tuples of interactions to predict
for idx, tup in enumerate(test.itertuples()):
    if idx%100 == 0: 
        print(str(idx/len(test))+"%")

    srch_to_predict.append(tup.srch_id)
    for id, feats in test_srch_features:
        if tup.srch_id == id:
            prediction_srch_features.append((id, feats))
            break

    prop_to_predict.append(tup.prop_id)
    for id, feats in test_prop_features:
        if tup.prop_id == id:
            prediction_prop_features.append((id, feats))
            break

print("Prediction features and ids ready")



################
# Initialize the dataset
tmp_dataset = Dataset()

# Fit the dataset with user and item ids and features
tmp_dataset.fit(
    users=test_srch_ids,
    items=test_prop_ids,
    user_features=[
        "visitor_location_country_id",
        "srch_destination_id",
        "srch_length_of_stay",
        "srch_adults_count",
        "srch_children_count",
        "srch_room_count",
        "srch_saturday_night_bool",
        #"srch_query_affinity_score",
    ],
    item_features=[
        "prop_country_id",
        "prop_starrating",
        "prop_review_score",
        "prop_brand_bool",
        "prop_location_score1",
        "prop_location_score2",
        "prop_log_historical_price",
        "price_usd",
        "promotion_flag",
    ],
)
################

test_src_feat_matrix = tmp_dataset.build_user_features(prediction_srch_features)
test_prop_feat_matrix = tmp_dataset.build_item_features(prediction_prop_features)

# Predict the score
predictions = model.predict(np.array(srch_to_predict), np.array(prop_to_predict), 
                    user_features=test_src_feat_matrix, item_features=test_prop_feat_matrix)

print(predictions)

# Loop through each user and item pair in the test set
#for srch_id, prop_id in to_predict:
#    # Ensure the user and item features are in the correct format
#    for id, features in test_srch_features:
#        if id == srch_id:
#            srch_feats = list(features.values())
#            break
#
#    for id, features in test_prop_features:
#        if id == prop_id:
#            prop_feats = list(features.values())
#            break
#    
#    test_src_feat_matrix = dataset.build_user_features(test_srch_features)
#    test_prop_feat_matrix = dataset.build_item_features(test_prop_features)
#
#    # Predict the score
#    score = model.predict(np.array([srch_id]), np.array([prop_id]), 
#                          user_features=srch_feats, item_features=prop_feats)
#    predictions.append((srch_id, prop_id, score[0]))
#
## Convert predictions to a DataFrame for easy viewing
#predictions_df = pd.DataFrame(predictions, columns=["srch_id", "prop_id", "score"])
#
## Save predictions to a CSV file
#predictions_df.to_csv("predictions.csv", index=False)
#
## Example: Display the first few predictions
#print(predictions_df.head())


0.0%
0.03333333333333333%
0.06666666666666667%
0.1%
0.13333333333333333%
0.16666666666666666%
0.2%
0.23333333333333334%
0.26666666666666666%
0.3%
0.3333333333333333%
0.36666666666666664%
0.4%
0.43333333333333335%
0.4666666666666667%
0.5%
0.5333333333333333%
0.5666666666666667%
0.6%
0.6333333333333333%
0.6666666666666666%
0.7%
0.7333333333333333%
0.7666666666666667%
0.8%
0.8333333333333334%
0.8666666666666667%
0.9%
0.9333333333333333%
0.9666666666666667%
Prediction features and ids ready


In [62]:
from scipy.sparse import csr_matrix

# Predicting the score for a user-item pair
#user_id = 1
#prop_id = 3309
#
#user_feats = None
#for key, val in train_src_features:
#    if key == user_id:
#        user_feats = list(val.values())
#        break
#
#prop_feats = None
#for key, val in train_prop_features:
#    if key == prop_id:
#        prop_feats = list(val.values())
#        break

# Convert feature lists to sparse matrices
# user_feats_matrix = csr_matrix(user_feats).reshape(1, -1)
# prop_feats_matrix = csr_matrix(prop_feats).reshape(1, -1)

#current_user_feats = [user_id, {
#        "visitor_location_country_id":user_feats[0],
#        "srch_destination_id":user_feats[1],
#        "srch_length_of_stay":user_feats[2],
#        "srch_adults_count":user_feats[3],
#        "srch_children_count":user_feats[4],
#        "srch_room_count":user_feats[5],
#        "srch_saturday_night_bool":user_feats[6],
#        #"srch_query_affinity_score",
#    }]
#current_user_feats = dataset.build_user_features([current_user_feats])

#score = model.predict(user_ids=np.array([user_id]), item_ids=np.array([prop_id]), user_features=current_user_feats)
#print(f"Predicted score for user {user_id} and item {user_id}: {score[0]}")


In [None]:

# Evaluating the model
from lightfm.evaluation import precision_at_k, auc_score

train_precision = precision_at_k(model, interactions, k=5, user_features=user_features_matrix, item_features=item_features_matrix).mean()
train_auc = auc_score(model, interactions, user_features=user_features_matrix, item_features=item_features_matrix).mean()

print(f'Train precision at k: {train_precision:.2f}')
print(f'Train AUC score: {train_auc:.2f}')
