In [29]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.model_selection import train_test_split

In [30]:
# Load users with searches and products
df = pd.read_csv("./dataset/train_LFM.csv")[:6000]
train, test = train_test_split(df, test_size=0.2)
train = train
test = test
#test = pd.read_csv("./dataset/test_LFM.csv")

# Warning, not using relevant information that is relevant only wrt search AND property together such as the distance user-property, or better price than competitor...
# Not using: position, price_usd, orig_destination_distance, random_bool, booking_bool, comp_inv, comp_rate_percent_diff, interaction
user_src_feats = [
    "site_id", "visitor_location_country_id", "srch_destination_id", 
    "srch_length_of_stay", "srch_adults_count", "srch_children_count", "srch_room_count", 
    "srch_saturday_night_bool", #"srch_query_affinity_score"
    ]
prop_feats = ["prop_country_id", "prop_starrating", "prop_review_score", 
              "prop_brand_bool", "prop_location_score1", "prop_location_score2", 
              "prop_log_historical_price", "price_usd", "promotion_flag"
              ]

In [31]:
def get_prop_features(df):
    train_prop_features = []
    train_prop_ids = df["prop_id"].unique()

    for idx, prop_id in enumerate(train_prop_ids):
        if idx%5000 == 0:
            print("Prop percentage: "+str(100*idx/len(train_prop_ids))+"%")

        current_prop = df[df["prop_id"] == prop_id]
        features = {
            "prop_country_id" : current_prop["prop_country_id"].mean(),
            "prop_starrating" : current_prop["prop_starrating"].mean(),
            "prop_review_score" : current_prop["prop_review_score"].mean() if pd.notnull(current_prop["prop_review_score"].mean()) else train["prop_review_score"].mean(),
            "prop_brand_bool" : current_prop["prop_brand_bool"].mean(), 
            "prop_location_score1" : current_prop["prop_location_score1"].mean(), 
            "prop_location_score2" : current_prop["prop_location_score2"].mean() if pd.notnull(current_prop["prop_location_score2"].mean()) else train["prop_location_score2"].mean(),
            "prop_log_historical_price" : current_prop["prop_log_historical_price"].mean(), 
            "price_usd" : current_prop["price_usd"].mean(), 
            "promotion_flag" : current_prop["promotion_flag"].mean()
        }
        train_prop_features.append((prop_id, features))
    return train_prop_features
    
def get_srch_features(df):
    train_src_features = []
    train_src_ids = df["srch_id"].unique()

    for idx, src_id in enumerate(train_src_ids):
        if idx%5000 == 0:
            print("Srch percentage: "+str(100*idx/len(train_src_ids))+"%")

        current_src = df[df["srch_id"] == src_id]
        features = {
            "visitor_location_country_id" : current_src["visitor_location_country_id"].iloc[0], 
            "srch_destination_id" : current_src["srch_destination_id"].iloc[0],
            "srch_length_of_stay" : current_src["srch_length_of_stay"].iloc[0], 
            "srch_adults_count" : current_src["srch_adults_count"].iloc[0], 
            "srch_children_count" : current_src["srch_children_count"].iloc[0],
            "srch_room_count" : current_src["srch_room_count"].iloc[0], 
            "srch_saturday_night_bool" : current_src["srch_saturday_night_bool"].iloc[0], 
            #"srch_query_affinity_score" : current_src["srch_query_affinity_score"].iloc[0] if pd.notnull(current_src["srch_query_affinity_score"].iloc[0]) else current_src["srch_query_affinity_score"].mean()
        }
        train_src_features.append((src_id, features))
    return train_src_features

# Of course cannot be called since computes the available interaction, which is what we have to predict in the test dataset
def get_interactions(df):
    interaction_list = []
    interaction_df = df.groupby(["srch_id", "prop_id"])["interaction"].first().reset_index()
    for tuple in interaction_df.itertuples():
        interaction_list.append((tuple.srch_id, tuple.prop_id, tuple.interaction))
    return interaction_list


In [49]:
# Pack train set for training
prop_features = get_prop_features(df)
print("Prop features computed")
srch_features = get_srch_features(df)
print("Srch features computed")
interaction_list = get_interactions(train)
print("Interactions list computed")

Prop percentage: 0.0%
Prop percentage: 93.00595238095238%
Prop features computed
Srch percentage: 0.0%
Srch features computed
Interactions list computed


In [51]:
# Extract feature names
srch_feature_names = set()
for _, features in srch_features:
    srch_feature_names.update(features.keys())

prop_feature_names = set()
for _, features in prop_features:
    prop_feature_names.update(features.keys())

KeyboardInterrupt: 

In [39]:
# Initialize the dataset
dataset = Dataset()

# Fit the dataset with user and item ids and features
dataset.fit(
    users=df["srch_id"].unique(),
    items=df["prop_id"].unique(),
    user_features=[
        "visitor_location_country_id",
        "srch_destination_id",
        "srch_length_of_stay",
        "srch_adults_count",
        "srch_children_count",
        "srch_room_count",
        "srch_saturday_night_bool",
        #"srch_query_affinity_score",
    ],
    item_features=[
        "prop_country_id",
        "prop_starrating",
        "prop_review_score",
        "prop_brand_bool",
        "prop_location_score1",
        "prop_location_score2",
        "prop_log_historical_price",
        "price_usd",
        "promotion_flag",
    ],
)

(interactions, interactions_weights) = dataset.build_interactions(interaction_list)
src_feat_matrix = dataset.build_user_features(srch_features)
prop_feat_matrix = dataset.build_item_features(prop_features)


KeyboardInterrupt: 

In [35]:
# Initialize and train the model
latent_feats = 50
training_epochs = 30

model = LightFM(loss='warp')
model.fit(interactions, user_features=src_feat_matrix, item_features=prop_feat_matrix, epochs=training_epochs, num_threads=2, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


<lightfm.lightfm.LightFM at 0x17fc07ac0>

In [40]:
# Now to the prediction

# Ensure user_ids and item_ids for the test set are available
test_srch_ids = test["srch_id"].unique()
test_prop_ids = test["prop_id"].unique()

# test_src_feat_matrix = dataset.build_user_features(test_srch_features)
# test_prop_feat_matrix = dataset.build_item_features(test_prop_features)
# interactions, interaction_weights = dataset.build_interactions([])

# model.fit_partial(interactions, user_features=test_src_feat_matrix, item_features=test_prop_feat_matrix, epochs=training_epochs)

srch_to_predict = []
prop_to_predict = []
# prediction_srch_features = []
# prediction_prop_features = []

# Get all the tuples of interactions to predict
for idx, tup in enumerate(test.itertuples()):
    if idx%100 == 0: 
        print(str(100*idx/len(test))+"%")

    srch_to_predict.append(tup.srch_id)
    # for id, feats in test_srch_features:
    #     if tup.srch_id == id:
    #         prediction_srch_features.append((id, feats))
    #         break

    prop_to_predict.append(tup.prop_id)
    # for id, feats in test_prop_features:
    #     if tup.prop_id == id:
    #         prediction_prop_features.append((id, feats))
    #         break

print("Prediction features and ids ready")

predictions = model.predict(np.array([158]), np.array([13878]), user_features=src_feat_matrix, item_features=prop_feat_matrix)

# Predict the score
#predictions = model.predict(np.array(srch_to_predict), np.array(prop_to_predict), 
#                   user_features=test_src_feat_matrix, item_features=test_prop_feat_matrix)

print(predictions)

# Loop through each user and item pair in the test set
#for srch_id, prop_id in to_predict:
#    # Ensure the user and item features are in the correct format
#    for id, features in test_srch_features:
#        if id == srch_id:
#            srch_feats = list(features.values())
#            break
#
#    for id, features in test_prop_features:
#        if id == prop_id:
#            prop_feats = list(features.values())
#            break
#    
#    test_src_feat_matrix = dataset.build_user_features(test_srch_features)
#    test_prop_feat_matrix = dataset.build_item_features(test_prop_features)
#
#    # Predict the score
#    score = model.predict(np.array([srch_id]), np.array([prop_id]), 
#                          user_features=srch_feats, item_features=prop_feats)
#    predictions.append((srch_id, prop_id, score[0]))
#
## Convert predictions to a DataFrame for easy viewing
#predictions_df = pd.DataFrame(predictions, columns=["srch_id", "prop_id", "score"])
#
## Save predictions to a CSV file
#predictions_df.to_csv("predictions.csv", index=False)
#
## Example: Display the first few predictions
#print(predictions_df.head())


0.0%
8.333333333333334%
16.666666666666668%
25.0%
33.333333333333336%
41.666666666666664%
50.0%
58.333333333333336%
66.66666666666667%
75.0%
83.33333333333333%
91.66666666666667%
Prediction features and ids ready


Exception: Number of item feature rows does not equal the number of items

In [None]:

# Evaluating the model
from lightfm.evaluation import precision_at_k, auc_score

train_precision = precision_at_k(model, interactions, k=5, user_features=user_features_matrix, item_features=item_features_matrix).mean()
train_auc = auc_score(model, interactions, user_features=user_features_matrix, item_features=item_features_matrix).mean()

print(f'Train precision at k: {train_precision:.2f}')
print(f'Train AUC score: {train_auc:.2f}')


In [52]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset

# Example data
users = [1, 2, 3, 4]
items = [1, 2, 3, 4, 5]
interactions = [
    (1, 1, 1.0),
    (1, 2, 1.0),
    (2, 2, 1.0),
    (2, 3, 1.0),
    (3, 3, 1.0),
    (3, 4, 1.0),
    (4, 4, 1.0),
    (4, 5, 1.0)
]

user_features = {
    1: {"age": 23, "income": 50000},
    2: {"age": 45, "income": 70000},
    3: {"age": 34, "income": 65000},
    4: {"age": 28, "income": 48000}
}

item_features = {
    1: {"price": 20.0, "popularity": 0.9},
    2: {"price": 35.0, "popularity": 0.7},
    3: {"price": 50.0, "popularity": 0.8},
    4: {"price": 15.0, "popularity": 0.6},
    5: {"price": 45.0, "popularity": 0.9}
}

# Convert user and item features to lists of tuples with proper formatting
user_features_list = [(user, [f"age:{feature_dict['age']}", f"income:{feature_dict['income']}"]) for user, feature_dict in user_features.items()]
item_features_list = [(item, [f"price:{feature_dict['price']}", f"popularity:{feature_dict['popularity']}"]) for item, feature_dict in item_features.items()]

# Extract feature names
user_feature_names = set()
for features in user_features.values():
    user_feature_names.update(features.keys())

item_feature_names = set()
for features in item_features.values():
    item_feature_names.update(features.keys())
# Initialize the dataset
dataset = Dataset()

# Fit the dataset with user and item ids and all possible feature names
dataset.fit(
    users=users,
    items=items,
    user_features=[f"age:{i}" for i in range(100)] + [f"income:{i}" for i in range(100000)],
    item_features=[f"price:{i:.1f}" for i in np.arange(0, 100, 0.1)] + [f"popularity:{i:.1f}" for i in np.arange(0, 1.1, 0.1)]
)

# Build the interactions matrix
(interactions, weights) = dataset.build_interactions([(x[0], x[1], x[2]) for x in interactions])

# Build the user and item features matrices
user_features_matrix = dataset.build_user_features(user_features_list)
item_features_matrix = dataset.build_item_features(item_features_list)

# Initialize and train the model
model = LightFM(loss='warp')
model.fit(interactions, user_features=user_features_matrix, item_features=item_features_matrix, epochs=30, num_threads=2)

# Predicting the score for a user-item pair
user_id = 1
item_id = 3
score = model.predict(np.array([user_id]), np.array([item_id]), user_features=user_features_matrix, item_features=item_features_matrix)
print(f"Predicted score for user {user_id} and item {item_id}: {score[0]}")

# Evaluation
train_precision = precision_at_k(model, interactions, k=5, user_features=user_features_matrix, item_features=item_features_matrix).mean()
train_auc = auc_score(model, interactions, user_features=user_features_matrix, item_features=item_features_matrix).mean()

print(f'Train precision at k: {train_precision:.2f}')
print(f'Train AUC score: {train_auc:.2f}')


KeyboardInterrupt: 