# Introduction
LightFM is a Python library designed for building and training recommendation models that integrate both collaborative filtering and matrix factorization techniques. It's particularly useful for recommending items (such as movies, products, or articles) to users based on their past interactions or preferences.

In this notebook we'll focus on using the advanced model (Factorization Machines with LightFM) for the user recommender and the cosine similarity for item similarity.

# Setup

In [93]:
!pip install streamlit pyngrok

Collecting pyngrok
  Downloading pyngrok-7.1.6-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.6


In [1]:
!pip install lightfm streamlit

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/316.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m307.2/316.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [79]:
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset as LightFMDataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score
import joblib
from sklearn.metrics.pairwise import cosine_similarity

# Load Data

In [3]:
# Load the MovieLens dataset
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
dataset_path = '/content/ml-latest-small.zip'

# Download and unzip the dataset
!wget -nc $url -O $dataset_path
!unzip -n $dataset_path -d /content/



--2024-06-30 01:12:09--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘/content/ml-latest-small.zip’


2024-06-30 01:12:09 (6.65 MB/s) - ‘/content/ml-latest-small.zip’ saved [978202/978202]

Archive:  /content/ml-latest-small.zip
   creating: /content/ml-latest-small/
  inflating: /content/ml-latest-small/links.csv  
  inflating: /content/ml-latest-small/tags.csv  
  inflating: /content/ml-latest-small/ratings.csv  
  inflating: /content/ml-latest-small/README.txt  
  inflating: /content/ml-latest-small/movies.csv  


In [14]:
# Load data into pandas dataframes
ratings = pd.read_csv('/content/ml-latest-small/ratings.csv')
movies = pd.read_csv('/content/ml-latest-small/movies.csv')

# Exploring Data

In [26]:
# Display the first few rows of the ratings dataset
print("Ratings Data:")
print(ratings.head())

Ratings Data:
   userId  movieId  rating   timestamp             title  \
0       1        1     4.0   964982703  Toy Story (1995)   
1       5        1     4.0   847434962  Toy Story (1995)   
2       7        1     4.5  1106635946  Toy Story (1995)   
3      15        1     2.5  1510577970  Toy Story (1995)   
4      17        1     4.5  1305696483  Toy Story (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1  Adventure|Animation|Children|Comedy|Fantasy  
2  Adventure|Animation|Children|Comedy|Fantasy  
3  Adventure|Animation|Children|Comedy|Fantasy  
4  Adventure|Animation|Children|Comedy|Fantasy  


In [27]:
# Display the first few rows of the movies dataset
print("\nMovies Data:")
print(movies.head())



Movies Data:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [28]:
# Check for missing values
print("\nMissing Values in Ratings:")
print(ratings.isnull().sum())


Missing Values in Ratings:
userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64


In [29]:
print("\nMissing Values in Movies:")
print(movies.isnull().sum())


Missing Values in Movies:
movieId    0
title      0
genres     0
dtype: int64


In [30]:

# Basic statistics
print("\nBasic Statistics for Ratings:")
print(ratings.describe())


Basic Statistics for Ratings:
              userId        movieId         rating     timestamp
count  100836.000000  100836.000000  100836.000000  1.008360e+05
mean      326.127564   19435.295718       3.501557  1.205946e+09
std       182.618491   35530.987199       1.042529  2.162610e+08
min         1.000000       1.000000       0.500000  8.281246e+08
25%       177.000000    1199.000000       3.000000  1.019124e+09
50%       325.000000    2991.000000       3.500000  1.186087e+09
75%       477.000000    8122.000000       4.000000  1.435994e+09
max       610.000000  193609.000000       5.000000  1.537799e+09


In [15]:
# Merge the datasets to get movie titles in the ratings DataFrame
ratings = ratings.merge(movies, on='movieId')

In [5]:
# Create a user-item matrix
user_item_matrix = ratings.pivot_table(index='userId', columns='title', values='rating')

In [6]:
# Fill missing values with 0 for item similarity calculations
user_item_matrix_filled = user_item_matrix.fillna(0)

#  Prepare Data for LightFM

In [49]:
# Create a LightFM dataset
lfm_dataset = LightFMDataset()
lfm_dataset.fit(ratings['userId'], ratings['movieId'])

# Train the User Recommender Model (LightFM)

In [55]:
# Build interactions
interactions, _ = lfm_dataset.build_interactions(((row['userId'], row['movieId']) for index, row in ratings.iterrows()))

# Split interactions into train and test sets
train_interactions, test_interactions =  random_train_test_split(interactions, test_percentage=0.2, random_state=42)


In [66]:
# Initialize LightFM model
model = LightFM(loss='warp')  # WARP: Weighted Approximate-Rank Pairwise

# Train the model
model.fit(train_interactions, epochs=20, num_threads=4)

<lightfm.lightfm.LightFM at 0x7d5e0b070100>

In [67]:
# Evaluate the model (example: AUC score)
train_auc = auc_score(model, train_interactions).mean()
test_auc = auc_score(model, test_interactions).mean()

print(f'Train AUC Score: {train_auc}')
print(f'Test AUC Score: {test_auc}')

Train AUC Score: 0.9607337713241577
Test AUC Score: 0.921898603439331


## Hyperparameter Tuning

In [72]:
model_2 = LightFM(loss='warp', learning_rate=0.05, item_alpha=0.01)
model_2.fit(train_interactions, epochs=40, num_threads=4)

<lightfm.lightfm.LightFM at 0x7d5e0b0709d0>

In [73]:
# Evaluate the model (example: AUC score)
train_auc = auc_score(model_2, train_interactions).mean()
test_auc = auc_score(model_2, test_interactions).mean()

print(f'Train AUC Score: {train_auc}')
print(f'Test AUC Score: {test_auc}')

Train AUC Score: 0.9289810657501221
Test AUC Score: 0.8813544511795044


# Save The Model (LightFM)

In [78]:
# Save embeddings
joblib.dump(model.user_embeddings, 'user_embeddings.pkl')
joblib.dump(model.item_embeddings, 'item_embeddings.pkl')

# Load embeddings and deploy model
user_embeddings = joblib.load('user_embeddings.pkl')
item_embeddings = joblib.load('item_embeddings.pkl')

# Reinitialize model for deployment
deployed_model = LightFM(loss='warp')
deployed_model.user_embeddings = user_embeddings
deployed_model.item_embeddings = item_embeddings

# Function to calculate item similarities (cosine similarity)

In [80]:
def calculate_item_similarity(item_id, all_item_embeddings):
    item_embedding = all_item_embeddings[item_id]
    similarities = cosine_similarity([item_embedding], all_item_embeddings)[0]
    similar_item_ids = similarities.argsort()[::-1][1:11]  # Exclude the item itself
    return similar_item_ids

In [81]:
item_id = 1
similar_items = calculate_item_similarity(item_id, item_embeddings)
print(f'Top 10 similar items to item {item_id}: {similar_items}')


Top 10 similar items to item 1: [4054  703  524  625  707  687 2555 2913 2661  680]


-----------------------------------------




## Deployment

In [94]:
import streamlit as st
from pyngrok import ngrok

In [83]:
# Streamlit app
st.title('Movie Recommendation System')

2024-06-30 03:25:43.133 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [84]:
# Function to recommend movies for a user
def recommend_movies(user_id):
    recommended_movies = deployed_model.predict(user_id, range(deployed_model.item_embeddings.shape[0]))
    movie_ids = sorted(range(len(recommended_movies)), key=lambda i: recommended_movies[i], reverse=True)[:10]
    movie_titles = [movies.iloc[movie_id]['title'] for movie_id in movie_ids]
    return movie_titles


In [85]:
# Sidebar for user input
user_id = st.sidebar.number_input('Enter User ID', min_value=1, max_value=610, value=1)

2024-06-30 03:26:26.683 Session state does not function when running a script without `streamlit run`


In [86]:
# Display recommended movies for the user
if st.sidebar.button('Get Recommendations'):
    recommended_movies = recommend_movies(user_id)
    st.subheader('Recommended Movies:')
    for i, movie in enumerate(recommended_movies):
        st.write(f'{i+1}. {movie}')

In [87]:
# Function to show similar movies
def show_similar_movies(item_id):
    similar_items = calculate_item_similarity(item_id, item_embeddings)
    similar_movies = [movies.iloc[item_id]['title'] for item_id in similar_items]
    return similar_movies

In [88]:
# Display similar movies for a selected item
item_id = st.sidebar.number_input('Enter Item ID', min_value=1, max_value=193609, value=1)
if st.sidebar.button('Show Similar Movies'):
    similar_movies = show_similar_movies(item_id)
    st.subheader(f'Similar Movies to {movies.iloc[item_id]["title"]}:')
    for i, movie in enumerate(similar_movies):
        st.write(f'{i+1}. {movie}')

In [96]:
! ngrok config add-authtoken 2iaArv55s8RIHNm3hV5zjNRD7XQ_5kmDQYjDgwfCJaPsgnMUm

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [97]:
# Start a streamlit server on a background thread
public_url = ngrok.connect(port='8501')

# Display the link
public_url




PyngrokNgrokHTTPError: ngrok client exception, API returned 400: {"error_code":102,"status_code":400,"msg":"invalid tunnel configuration","details":{"err":"yaml: unmarshal errors:\n  line 1: field port not found in type config.HTTPv2Tunnel"}}
