# ETL

In [1]:
import sys
import os
import numpy as np

# Add the root folder to sys.path
sys.path.append(os.path.abspath(".."))
from app.etl import run_etl

In [3]:
#Paths to the dataset files
file_paths = {
    'ratings': '../data/ml_100k/u.data',
    'movies': '../data/ml_100k/u.item'
}

# Run the ETL pipeline
preprocessed_data = run_etl(file_paths, save_path='../app/preprocessed_movielens.csv')

# Check the resulting data
print(preprocessed_data.head())

Preprocessed data saved to ../app/preprocessed_movielens.csv
   user_id  movie_id  rating  timestamp                       title
0      195       241       3  881250949                Kolya (1996)
1      185       301       3  891717742    L.A. Confidential (1997)
2       21       376       1  878887116         Heavyweights (1994)
3      243        50       2  880606923  Legends of the Fall (1994)
4      165       345       1  886397596         Jackie Brown (1997)


# Content-Based Filtering

In [4]:
import pandas as pd

# Load the ratings dataset
ratings = pd.read_csv('../data/ml_100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])

# Load the movies dataset
movies = pd.read_csv('../data/ml_100k/u.item', sep='|', encoding='latin-1', header=None,
                     names=['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
                            'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
                            'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                            'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

In [5]:
columns_to_copy = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
genres_encoded = movies[columns_to_copy].copy()

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity
similarity_matrix = cosine_similarity(genres_encoded)

# Wrap it in a dataFrame for easy access
similarity_df = pd.DataFrame(similarity_matrix, index=movies['title'], columns=movies['title'])

# check the similarity matrix
print(similarity_df.head())

title              Toy Story (1995)  GoldenEye (1995)  Four Rooms (1995)  \
title                                                                      
Toy Story (1995)           1.000000          0.000000            0.00000   
GoldenEye (1995)           0.000000          1.000000            0.57735   
Four Rooms (1995)          0.000000          0.577350            1.00000   
Get Shorty (1995)          0.333333          0.333333            0.00000   
Copycat (1995)             0.000000          0.333333            0.57735   

title              Get Shorty (1995)  Copycat (1995)  \
title                                                  
Toy Story (1995)            0.333333        0.000000   
GoldenEye (1995)            0.333333        0.333333   
Four Rooms (1995)           0.000000        0.577350   
Get Shorty (1995)           1.000000        0.333333   
Copycat (1995)              0.333333        1.000000   

title              Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)  \

In [7]:
def recommend_movies(movie_title, similarity_df, top_n=5):
    # Get similarity scores for the given movie
    similar_movies = similarity_df[movie_title].sort_values(ascending=False)

    # Exclude the movie itself
    similar_movies = similar_movies.drop(movie_title)

    # Return the top N recommendations
    return similar_movies.head(top_n)


# example: Recommend similar movies to "Toy Story (1995)"
recommendations = recommend_movies("Toy Story (1995)", similarity_df)
print(recommendations)


title
Aladdin and the King of Thieves (1996)    1.000000
Aladdin (1992)                            0.866025
Goofy Movie, A (1995)                     0.866025
Jungle2Jungle (1997)                      0.816497
Angels in the Outfield (1994)             0.816497
Name: Toy Story (1995), dtype: float64


# 1. User-Based Collaborative Filtering

In [8]:
# Create a user-item matrix
user_item_matrix = ratings.pivot(index='user_id', columns='movie_id', values='rating')

# Fill NaN with 0 (users may not have rated all movies)
user_item_matrix = user_item_matrix.fillna(0)

np.save('user_item_matrix.npy', user_item_matrix)

# Check the user-item matrix
print("User-Item Matrix:")
print(user_item_matrix)

User-Item Matrix:
movie_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                               ...   
1          5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2          4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0  ...   
3          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5          4.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
...        ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
939        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   5.0   0.0  ...   
940        0.0   0.0   0.0   2.0   0.0   0.0   4.0   5.0   3.0   0.0  ...   
941        5.0   0.0   0.0   0.0   0.0   0.0   4.0   0.0   0.0   0.0  ...   
942        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
943        0.0   5.0   0.0   0.0   0.0   0.0   0.0   0.0  

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix)

# Wrap it in a dataFrame for easy access
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

# Check the user similarity matrix
print("User Similarity Matrix:")
print(user_similarity_df)

User Similarity Matrix:
user_id       1         2         3         4         5         6         7    \
user_id                                                                         
1        1.000000  0.166931  0.047460  0.064358  0.378475  0.430239  0.440367   
2        0.166931  1.000000  0.110591  0.178121  0.072979  0.245843  0.107328   
3        0.047460  0.110591  1.000000  0.344151  0.021245  0.072415  0.066137   
4        0.064358  0.178121  0.344151  1.000000  0.031804  0.068044  0.091230   
5        0.378475  0.072979  0.021245  0.031804  1.000000  0.237286  0.373600   
...           ...       ...       ...       ...       ...       ...       ...   
939      0.118095  0.228583  0.026271  0.030138  0.071459  0.111852  0.107027   
940      0.314072  0.226790  0.161890  0.196858  0.239955  0.352449  0.329925   
941      0.148617  0.161485  0.101243  0.152041  0.139595  0.144446  0.059993   
942      0.179508  0.172268  0.133416  0.170086  0.152497  0.317328  0.282003   
943 

In [10]:
import numpy as np

def recommend_user_based(user_id, user_item_matrix, user_similarity_df, top_n=5):
    # Get the similarity scores for the target user
    similar_users = user_similarity_df[user_id]

    # Calculate weighted ratings
    weighted_ratings = np.dot(similar_users, user_item_matrix) / similar_users.sum()

    # Create a dataFrame with weighted ratings
    recommendations = pd.DataFrame({
        'movie_id': user_item_matrix.columns,
        'weighted_rating': weighted_ratings
    }).sort_values(by='weighted_rating', ascending=False)

    # Exclude movies the user has already rated
    user_rated_movies = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index
    recommendations = recommendations[~recommendations['movie_id'].isin(user_rated_movies)]

    # Merge with movie titles for readability
    recommendations = recommendations.merge(movies, on='movie_id')

    return recommendations[['title', 'weighted_rating']].head(top_n)

# example: Recommend movies for userId = 1
user_recommendations = recommend_user_based(1, user_item_matrix, user_similarity_df)
print("User-Based Recommendations:")
print(user_recommendations)

User-Based Recommendations:
                                    title  weighted_rating
0                 Schindler's List (1993)         2.035432
1       E.T. the Extra-Terrestrial (1982)         1.871222
2  One Flew Over the Cuckoo's Nest (1975)         1.792122
3             English Patient, The (1996)         1.742431
4                           Scream (1996)         1.696068


#  Item-Based Collaborative Filtering

In [11]:
# Transpose user-item matrix
item_user_matrix = user_item_matrix.T

# Calculate cosine similarity between movies
item_similarity = cosine_similarity(item_user_matrix)

# Wrap it in a dataFrame for easy access
item_similarity_df = pd.DataFrame(item_similarity, index=item_user_matrix.index, columns=item_user_matrix.index)

# Check the item similarity matrix
print("Item Similarity Matrix:")
print(item_similarity_df.head())

Item Similarity Matrix:
movie_id      1         2         3         4         5         6     \
movie_id                                                               
1         1.000000  0.402382  0.330245  0.454938  0.286714  0.116344   
2         0.402382  1.000000  0.273069  0.502571  0.318836  0.083563   
3         0.330245  0.273069  1.000000  0.324866  0.212957  0.106722   
4         0.454938  0.502571  0.324866  1.000000  0.334239  0.090308   
5         0.286714  0.318836  0.212957  0.334239  1.000000  0.037299   

movie_id      7         8         9         10    ...      1673  1674  \
movie_id                                          ...                   
1         0.620979  0.481114  0.496288  0.273935  ...  0.035387   0.0   
2         0.383403  0.337002  0.255252  0.171082  ...  0.000000   0.0   
3         0.372921  0.200794  0.273669  0.158104  ...  0.000000   0.0   
4         0.489283  0.490236  0.419044  0.252561  ...  0.000000   0.0   
5         0.334769  0.259161  0.2

In [12]:
def recommend_item_based(user_id, user_item_matrix, item_similarity_df, top_n=5):
    # Get the user's ratings
    user_ratings = user_item_matrix.loc[user_id]

    # Calculate scores by multiplying user's ratings with item similarity
    scores = np.dot(user_ratings, item_similarity_df) / np.array([np.abs(item_similarity_df).sum(axis=1)])
    scores = scores.flatten()

    # Create a dataFrame with scores
    recommendations = pd.DataFrame({
        'movie_id': user_item_matrix.columns,
        'score': scores
    }).sort_values(by='score', ascending=False)

    # Exclude movies the user has already rated
    user_rated_movies = user_ratings[user_ratings > 0].index
    recommendations = recommendations[~recommendations['movie_id'].isin(user_rated_movies)]

    # Merge with movie titles
    recommendations = recommendations.merge(movies, on='movie_id')

    return recommendations[['title', 'score']].head(top_n)

# example: Recommend movies for userId = 1
item_recommendations = recommend_item_based(1, user_item_matrix, item_similarity_df)
print("Item-Based Recommendations:")
print(item_recommendations)

Item-Based Recommendations:
                                       title     score
0                    King of New York (1990)  1.528732
1  Scream of Stone (Schrei aus Stein) (1991)  1.503711
2                      Jupiter's Wife (1994)  1.488788
3                             Witness (1985)  1.462689
4                     All Things Fair (1996)  1.414305


# Matrix Factorization (e.g., SVD) Collaborative Filtering

In [13]:
# Fill missing values with 0
user_item_matrix = user_item_matrix.fillna(0)

print("User-Item Matrix:")
print(user_item_matrix.head())

User-Item Matrix:
movie_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                               ...   
1          5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2          4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0  ...   
3          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5          4.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

movie_id  1673  1674  1675  1676  1677  1678  1679  1680  1681  1682  
user_id                                                               
1          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
5          0.0  

In [14]:
from sklearn.decomposition import TruncatedSVD

# Apply Truncated SVD
n_components = 20  # Number of latent features
svd = TruncatedSVD(n_components=n_components)
user_features = svd.fit_transform(user_item_matrix)
item_features = svd.components_

# Check the shapes
print("Shape of user_features:", user_features.shape)  # (n_users, n_components)
print("Shape of item_features:", item_features.shape)  # (n_components, n_movies)


Shape of user_features: (943, 20)
Shape of item_features: (20, 1682)


In [15]:
import numpy as np

# Reconstruct the approximate user-item matrix
reconstructed_matrix = np.dot(user_features, item_features)

# Wrap it in a dataFrame for easy access
predicted_ratings = pd.DataFrame(reconstructed_matrix, index=user_item_matrix.index, columns=user_item_matrix.columns)

print("Predicted Ratings Matrix:")
print(predicted_ratings.head())

Predicted Ratings Matrix:
movie_id      1         2         3         4         5         6     \
user_id                                                                
1         4.168742  2.257471  1.597417  3.101478  0.865192  0.679666   
2         1.988409 -0.075958 -0.003907  0.487492 -0.044064  0.298416   
3        -0.152413 -0.024778  0.169515 -0.180050 -0.123459  0.055948   
4         0.541978 -0.162967  0.056590 -0.155267  0.078064  0.007859   
5         3.499255  1.191323  0.285047  1.923982  0.391172 -0.207930   

movie_id      7         8         9         10    ...      1673      1674  \
user_id                                           ...                       
1         4.833356  2.447116  3.640474  1.864810  ... -0.017046  0.015924   
2         1.624070  0.413328  2.497702  0.598847  ... -0.000148 -0.015717   
3        -0.151615  0.124535 -0.332404  0.020384  ...  0.004127 -0.014259   
4         0.349775 -0.086234 -0.133027 -0.196605  ...  0.002065 -0.009193   
5      

In [16]:
def recommend_svd(user_id, predicted_ratings, movies, top_n=5):
    # Get the user's predicted ratings
    user_predicted_ratings = predicted_ratings.loc[user_id]

    # Sort by predicted rating in descending order
    recommendations = user_predicted_ratings.sort_values(ascending=False)

    # Exclude movies the user has already rated
    user_rated_movies = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index
    recommendations = recommendations[~recommendations.index.isin(user_rated_movies)]

    # Reset index and rename predicted ratings
    recommendations = recommendations.reset_index()
    recommendations.columns = ['movie_id', 'predicted_rating']  # Rename columns for clarity

    # Merge with movie titles
    recommendations = recommendations.merge(movies, on='movie_id')

    return recommendations[['title', 'predicted_rating']].head(top_n)


svd_recommendations = recommend_svd(1, predicted_ratings, movies)
print("SVD Recommendations:")
print(svd_recommendations)

SVD Recommendations:
                               title  predicted_rating
0               Trainspotting (1996)          4.543125
1           Leaving Las Vegas (1995)          3.688342
2        English Patient, The (1996)          3.420426
3  E.T. the Extra-Terrestrial (1982)          3.213485
4                  Casablanca (1942)          3.179610


# Deep Learning-Based Recommendation Engines

## Autoencoders for Collaborative Filtering

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the user-item matrix (row-wise normalization)
normalized_matrix = scaler.fit_transform(user_item_matrix)

# Convert back to a dataFrame
normalized_matrix = pd.DataFrame(normalized_matrix, index=user_item_matrix.index, columns=user_item_matrix.columns)

# Split into training and test sets
train, test = train_test_split(normalized_matrix, test_size=0.2, random_state=42)

# Convert to NumPy arrays
train = train.values
test = test.values

print("Training Shape:", train.shape)
print("Testing Shape:", test.shape)


Training Shape: (754, 1682)
Testing Shape: (189, 1682)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import plot_model

# Define the autoencoder architecture
n_movies = user_item_matrix.shape[1]  # Number of movies (columns)

autoencoder = models.Sequential([
    layers.Input(shape=(n_movies,)),  # Input layer
    layers.Dense(128, activation='relu'),  # Hidden layer (encoding)
    layers.Dense(64, activation='relu'),   # Bottleneck (latent features)
    layers.Dense(128, activation='relu'),  # Hidden layer (decoding)
    layers.Dense(n_movies, activation='sigmoid')  # Output layer (reconstructed ratings)
])

autoencoder.compile(optimizer='adam', loss='mse')  # Mean Squared Error for reconstruction loss

# Summary
autoencoder.summary()

plot_model(autoencoder, to_file="autoencoder_architecture.png", show_shapes=True, show_layer_names=True)
print("Autoencoder architecture saved as 'autoencoder_architecture.png'")

In [None]:
# Train the autoencoder
history = autoencoder.fit(
    train, train,
    epochs=20,
    batch_size=32,
    validation_data=(test, test),
    verbose=1
)

In [22]:
# Predict the reconstructed user-item matrix
reconstructed_matrix = autoencoder.predict(normalized_matrix)

# Ensure the reconstructed matrix matches the original dataFrame's shape
predicted_ratings = pd.DataFrame(reconstructed_matrix, index=user_item_matrix.index, columns=user_item_matrix.columns)

print("Predicted Ratings (Autoencoder):")
print(predicted_ratings.head())

[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Predicted Ratings (Autoencoder):
movie_id      1             2             3         4         5     \
user_id                                                              
1         0.999964  9.999831e-01  9.971140e-01  1.000000  0.835094   
2         0.007410  1.314198e-09  2.309043e-05  0.000018  0.001771   
3         0.000002  2.123803e-07  4.557213e-06  0.007998  0.001848   
4         0.000088  7.756667e-09  7.194463e-06  0.000071  0.004572   
5         0.790437  8.885342e-01  5.293274e-07  0.008950  0.002607   

movie_id          6         7             8             9         10    ...  \
user_id                                                                 ...   
1         9.812189e-01  1.000000  9.700170e-01  9.556419e-01  0.907717  ...   
2         8.616745e-05  0.000644  1.161575e-03  2.806466e-01  0.007110  ...   
3         3.000294e-04  0.000844  5.888203e-07  9.520559e-07  0.000004  ...   
4      

In [25]:
def recommend_deep_learning(user_id, predicted_ratings, movies, top_n=5):
    # Get the user's predicted ratings
    user_predicted_ratings = predicted_ratings.loc[user_id].sort_values(ascending=False)

    # Exclude movies the user has already rated
    user_rated_movies = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index
    recommendations = user_predicted_ratings[~user_predicted_ratings.index.isin(user_rated_movies)]

    # Convert recommendations to dataFrame
    recommendations = recommendations.reset_index()
    recommendations.columns = ['movie_id', 'predicted_rating']  # Rename columns for clarity

    # Merge with movie titles
    recommendations = recommendations.merge(movies, on='movie_id')

    return recommendations[['title', 'predicted_rating']].head(top_n)

# example: Recommend movies for userId = 1
deep_learning_recommendations = recommend_deep_learning(1, predicted_ratings, movies)
print("Deep Learning Recommendations:")
print(deep_learning_recommendations)


Deep Learning Recommendations:
                                               title  predicted_rating
0  Dr. Strangelove or: How I Learned to Stop Worr...          1.000000
1                                 Stand by Me (1986)          1.000000
2                                      Batman (1989)          1.000000
3                  Jackie Chan's First Strike (1996)          1.000000
4      William Shakespeare's Romeo and Juliet (1996)          0.999999


## RNNs for recommendation

In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load the MovieLens dataset
df = ratings.copy()  
movies = movies.copy()  # Movie metadata

# Sort by timestamp to create sequences
df = df.sort_values(by=['user_id', 'timestamp'])

# Map users and movies to unique IDs
user_ids = df['user_id'].unique()
movie_ids = df['movie_id'].unique()

user_id_map = {user: idx for idx, user in enumerate(user_ids)}
movie_id_map = {movie: idx for idx, movie in enumerate(movie_ids)}

np.save('movie_id_map.npy', movie_id_map)

ratings['user_id'] = ratings['user_id'].map(user_id_map)
ratings['movie_id'] = ratings['movie_id'].map(movie_id_map)


In [62]:
# Group ratings by user and create sequences of movie IDs
user_sequences = ratings.groupby('user_id')['movie_id'].apply(list)

# Define sequence length
sequence_length = 5

# Create input (X) and output (y) sequences
X, y = [], []
for seq in user_sequences:
    for i in range(len(seq) - sequence_length):
        X.append(seq[i:i + sequence_length])  # Last `sequence_length` movies
        y.append(seq[i + sequence_length])   # Next movie

X = np.array(X)
y = np.array(y)

# Verify the maximum movie ID (should match input_dim for the embedding layer)
num_movies = len(movie_id_map)
print(f"Number of movies: {num_movies}, Max movie_id in X: {X.max()}")


# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pad sequences to ensure consistent input length
X_train = pad_sequences(X_train, maxlen=sequence_length, padding='pre')
X_test = pad_sequences(X_test, maxlen=sequence_length, padding='pre')

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")


# Model parameters
embedding_size = 50

# Define the RNN model
model = Sequential([
    Embedding(input_dim=num_movies, output_dim=embedding_size, input_length=sequence_length),
    LSTM(128, return_sequences=False),  # Use LSTM to process sequences
    Dense(num_movies, activation='softmax')  # Output layer for movie prediction
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()


# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,  # Adjust epochs as needed
    batch_size=64,
    verbose=1
)


Number of movies: 1682, Max movieId in X: 1681
X_train shape: (76228, 5), y_train shape: (76228,)


Epoch 1/10
[1m1192/1192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 13ms/step - accuracy: 0.0049 - loss: 6.9022 - val_accuracy: 0.0060 - val_loss: 6.7625
Epoch 2/10
[1m1192/1192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 14ms/step - accuracy: 0.0045 - loss: 6.7320 - val_accuracy: 0.0054 - val_loss: 6.6801
Epoch 3/10
[1m1192/1192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - accuracy: 0.0050 - loss: 6.6038 - val_accuracy: 0.0068 - val_loss: 6.6070
Epoch 4/10
[1m1192/1192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.0066 - loss: 6.5250 - val_accuracy: 0.0055 - val_loss: 6.5892
Epoch 5/10
[1m1192/1192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 14ms/step - accuracy: 0.0066 - loss: 6.4703 - val_accuracy: 0.0062 - val_loss: 6.5727
Epoch 6/10
[1m1192/1192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.0081 - loss: 6.4041 - val_accuracy: 0.0057 - val_loss: 6.5824
Epoc

In [64]:
def recommend_next_movies(user_sequence, model, top_n=5):
    """
    Recommend the next movies for a given sequence.
    
    Args:
    - user_sequence: List of movie IDs (most recent interactions).
    - model: Trained RNN model.
    - top_n: Number of recommendations to return.

    Returns:
    - List of recommended movie IDs.
    """
    # Pad the sequence to match the model's input length
    padded_sequence = pad_sequences([user_sequence], maxlen=sequence_length, padding='pre')

    # Predict the probabilities for the next movie
    predictions = model.predict(padded_sequence, verbose=0)
    top_movie_ids = np.argsort(predictions[0])[-top_n:][::-1]  # Top-N movie indices

    # Map back to original movie IDs
    return [list(movie_id_map.keys())[idx] for idx in top_movie_ids]

# example: Recommend the next movies for a user
sample_user_sequence = X_test[0]  # Use a sequence from the test set
recommended_movies = recommend_next_movies(sample_user_sequence, model)
print("Recommended Movies:", recommended_movies)

# Map movie IDs back to titles
recommended_titles = movies[movies['movie_id'].isin(recommended_movies)]['title']
print("Recommended Movie Titles:", recommended_titles.tolist())

Recommended Movies: [np.int64(173), np.int64(210), np.int64(679), np.int64(68), np.int64(385)]
Recommended Movie Titles: ['Crow, The (1994)', 'Princess Bride, The (1987)', 'Indiana Jones and the Last Crusade (1989)', 'True Lies (1994)', 'Conan the Barbarian (1981)']


In [65]:
# Save the trained model
model.save('autoencoder_model.h5')



In [70]:
import tensorflow as tf; 

print(tf.__version__)

import keras; 
print(keras.__version__)


2.18.0
3.8.0
