# Model Building using LightFM Library:

## Prepare Dataset:

### Load the dataset: 

In [4]:
from lightfm import LightFM

INTERIM_DATA_DIR = "../data/interim/"

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

merged_data = pd.read_csv(f"{INTERIM_DATA_DIR}merged.csv")

# Encode categorical features like 'gender' and 'occupation' using label encoding
gender_encoder = LabelEncoder()
occupation_encoder = LabelEncoder()

merged_data['gender'] = gender_encoder.fit_transform(merged_data['gender'])
merged_data['occupation'] = occupation_encoder.fit_transform(
    merged_data['occupation'])

# Create a dataset for LightFM
# For users and items, only their ids are needed
user_ids = merged_data['user_id'].unique()
item_ids = merged_data['movie_id'].unique()

# Create the feature lists for users and items
user_features = merged_data[['user_id', 'age',
                             'gender', 'occupation', 'zip_code']].drop_duplicates()
item_features = merged_data[['movie_id', 'movie_name', 'action', 'adventure', 'animation', "children's", 'comedy', 'crime', 'documentary', 'drama',
                             'fantasy', 'film-noir', 'horror', 'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western', 'release_year']].drop_duplicates()

# Convert features to a list of strings, as required by LightFM
user_features['features'] = user_features.apply(lambda x: [str(
    x['age']), str(x['gender']), str(x['occupation']), x['zip_code']], axis=1)
item_features['features'] = item_features.apply(
    lambda x: [x['movie_name']] + list(x['action':'release_year'].astype(str)), axis=1)

item_features = item_features[[
    'movie_id', 'movie_name', 'release_year', 'features']]
# Extract the interaction data
interaction_data = merged_data[['user_id', 'movie_id', 'rating']]

In [6]:
item_features.loc[0, 'features']

['Kolya',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1997']

In [7]:
interaction_data

Unnamed: 0,user_id,movie_id,rating
0,196,242,0.4
1,305,242,0.8
2,6,242,0.6
3,234,242,0.6
4,63,242,0.4
...,...,...,...
99985,863,1679,0.4
99986,863,1678,0.0
99987,863,1680,0.2
99988,896,1681,0.4


In [8]:
interaction_data.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,0.4
1,305,242,0.8
2,6,242,0.6
3,234,242,0.6
4,63,242,0.4


In [9]:
user_features.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code,features
0,196,49,1,20,55105,"[49, 1, 20, 55105]"
1,305,23,1,14,94086,"[23, 1, 14, 94086]"
2,6,42,1,6,98101,"[42, 1, 6, 98101]"
3,234,60,1,15,94702,"[60, 1, 15, 94702]"
4,63,31,1,11,75240,"[31, 1, 11, 75240]"


In [10]:
item_features.head()

Unnamed: 0,movie_id,movie_name,release_year,features
0,242,Kolya,1997,"[Kolya, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,..."
117,393,Mrs. Doubtfire,1993,"[Mrs. Doubtfire, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,..."
309,381,Muriel's Wedding,1994,"[Muriel's Wedding, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
409,251,Shall We Dance?,1997,"[Shall We Dance?, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0..."
455,655,Stand by Me,1986,"[Stand by Me, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,..."


### Fit a LightFM dataset into our dataset

In [11]:
from sklearn.model_selection import train_test_split
from lightfm import LightFM
from lightfm.data import Dataset
import numpy as np

# Initialize the LightFM dataset
dataset = Dataset()

# Fit the dataset to the users and items
dataset.fit(
    users=user_ids,
    items=item_ids,
    user_features=user_features['features'].explode(),
    item_features=item_features['features'].explode())

# Split the data
train_data, test_data = train_test_split(
    interaction_data, test_size=0.2, random_state=42)

# Build the interaction matrices for training and testing
(interactions_train, _) = dataset.build_interactions(
    [(row['user_id'], row['movie_id'], row['rating']) for idx, row in train_data.iterrows()])
(interactions_test, _) = dataset.build_interactions(
    [(row['user_id'], row['movie_id'], row['rating']) for idx, row in test_data.iterrows()])

# Build user and item feature matrices
user_features_matrix = dataset.build_user_features(
    [(row['user_id'], row['features']) for idx, row in user_features.iterrows()], normalize=False)
item_features_matrix = dataset.build_item_features(
    [(row['movie_id'], row['features']) for idx, row in item_features.iterrows()], normalize=False)

## Train LightFM Model:

Here, we're using Grid Search over these hyper-parameters:
- Number of Components.
- Learning Rate.
- Epochs.

Then we're calculating the F1-Score, and based on the best F1-Score, we pick the best combination for the model

In [9]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

# Define a range of hyperparameters for tuning
n_components_options = [5, 10, 20, 40]
learning_rate_options = [0.001, 0.01]
epochs_options = [5, 10, 50]

best_f1score = 0
best_recall = 0
best_precision = 0
best_params = {}

# Grid search
for n_components in n_components_options:
    for learning_rate in learning_rate_options:
        for epoch in epochs_options:
            model = LightFM(loss='warp', no_components=n_components,
                            learning_rate=learning_rate)
            model.fit(interactions_train, user_features=user_features_matrix,
                      item_features=item_features_matrix, epochs=epoch)
            
            # Calculate scores
            precision = precision_at_k(
                model, interactions_test, user_features=user_features_matrix, item_features=item_features_matrix, k=10).mean()
            recall = recall_at_k(model, interactions_test, user_features=user_features_matrix,
                                 item_features=item_features_matrix, k=10).mean()
            f1score = 2*precision*recall/(precision+recall)
            
            # Pick best score & update best parameters
            if f1score > best_f1score:
                best_f1score = f1score
                best_precision = precision
                best_recall = recall
                best_params = {'n_components': n_components,
                               'learning_rate': learning_rate, 'epochs': epoch}
print(f"Best Parameters: {best_params}")

### Create & fit the model with best parameters

In [10]:
model = LightFM(loss='warp', no_components=best_params['n_components'],
                learning_rate=best_params['learning_rate'])
model.fit(interactions_train, user_features=user_features_matrix,
          item_features=item_features_matrix, epochs=best_params['epochs'])

auc = auc_score(model, interactions_test, user_features=user_features_matrix,
                item_features=item_features_matrix).mean()

## Save & Load the model checkpoint:

#### Save:

In [13]:
import pickle

MODEL_CHECKPOINT_PATH = "../models/CheckpointLightFM.pickle"

with open(MODEL_CHECKPOINT_PATH, 'wb') as p:
    pickle.dump(model, p, protocol=pickle.HIGHEST_PROTOCOL)

#### Load:

In [1]:
import pickle
from lightfm import LightFM

MODEL_CHECKPOINT_PATH = "../models/CheckpointLightFM.pickle"

model = pickle.load(open(MODEL_CHECKPOINT_PATH, 'rb'))



## Evaluate:

In [15]:
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

auc = auc_score(model, interactions_test, user_features=user_features_matrix,
                item_features=item_features_matrix).mean()

precision = precision_at_k(
    model, interactions_test, user_features=user_features_matrix, item_features=item_features_matrix, k=10).mean()

recall = recall_at_k(model, interactions_test, user_features=user_features_matrix,
                        item_features=item_features_matrix, k=10).mean()

f1score = 2*precision*recall/(precision+recall)

print(f"F1-Score: {f1score:0.2}")
print(f"Area Under The Curve Score: {auc:0.2}")
print(f"Precision: {precision:0.2}")
print(f"Recall: {recall:0.2}")

F1-Score: 0.12
Area Under The Curve Score: 0.91
Precision: 0.13
Recall: 0.11


## Make recommendations:

In [2]:
def sample_recommendation(user_id, model, dataset, item_features, n_items=10):
    # Obtain the internal index for the user
    internal_user_id = dataset.mapping()[0].get(user_id, None)
    if internal_user_id is None:
        raise ValueError(f"User ID {user_id} not found in dataset")

    # Known positives: the items the user has already interacted with
    known_positives = interaction_data[interaction_data['user_id']
                                       == user_id]['movie_id'].values
    internal_known_positives = [dataset.mapping()[2].get(
        x) for x in known_positives if x in dataset.mapping()[2]]

    # Movies our model predicts they will like (filtering out the known positives)
    total_items = len(dataset.mapping()[2])  # Total number of items
    all_items = np.array([i for i in range(total_items)
                         if i not in internal_known_positives])
    scores = model.predict(internal_user_id, all_items,
                           user_features=user_features_matrix, item_features=item_features_matrix)

    # Rescale the scores to a 1-5 range
    min_score = scores.min()
    max_score = scores.max()
    rescaled_scores = 1 + (scores - min_score) * 4 / (max_score - min_score)

    # Rank items by rescaled predicted scores and select top N
    top_items_indices = np.argsort(-rescaled_scores)[:n_items]
    top_items_scores = rescaled_scores[top_items_indices]
    top_items = all_items[top_items_indices]

    # Convert internal item indices back to movie IDs
    external_top_items = [list(dataset.mapping()[2].keys())[list(
        dataset.mapping()[2].values()).index(i)] for i in top_items]

    # Map movie IDs to names and include predicted ratings
    top_movies = item_features[item_features['movie_id'].isin(
        external_top_items)][['movie_id', 'movie_name']]
    top_movies['predicted_rating'] = top_movies['movie_id'].map(
        dict(zip(external_top_items, top_items_scores))).round(2)

    top_movies_sorted = top_movies.sort_values(
        by='predicted_rating', ascending=False)

    return top_movies_sorted


In [12]:
sample_user_id = int(input('Please enter the user ID to recommend movies: '))

top_movies_with_ratings = sample_recommendation(
    sample_user_id, model, dataset, item_features, n_items=10)
print(top_movies_with_ratings)

       movie_id                   movie_name  predicted_rating
66135       137                    Big Night              5.00
52504       276            Leaving Las Vegas              4.95
9266        237                Jerry Maguire              4.91
51028         9             Dead Man Walking              4.89
65948       124                    Lone Star              4.89
68307        15           Mr. Holland's Opus              4.89
39353       508  People vs. Larry Flynt, The              4.83
40908       475                Trainspotting              4.82
50090       126          Spitfire Grill, The              4.68
18878       294                    Liar Liar              4.67
