In [98]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [99]:
df=pd.read_csv('ratings.csv')

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028751 entries, 0 to 1028750
Data columns (total 5 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   user_id           1028751 non-null  object 
 1   movie_id          1028751 non-null  int64  
 2   rating            1028751 non-null  float64
 3   user_id_encoded   1028751 non-null  int32  
 4   movie_id_encoded  1028751 non-null  int64  
dtypes: float64(1), int32(1), int64(2), object(1)
memory usage: 35.3+ MB


In [32]:
df.sample(10)

Unnamed: 0,user_id,movie_id,rating
32022,0c6e74d0739fb94e2f3c8e5605497bbd,42163,4.0
518145,6ef5ac1764724bb3f766b13a8313d5a7,50,4.0
832771,192ff60bd8841ae6727ed1861e60156b,63082,4.5
603137,be855df6805c42f788b56e5878294a5d,52016,2.0
6684,77c7d756a093150d4377720abeaeef76,2429,2.0
784,8e7cebf9a234c064b75016249f2ac65e,1385,2.0
798809,b71ce593dfe831739fa40cc58af648a6,380,3.5
654402,dda12d420187937de1e6769080e777c7,95873,4.0
788825,075d07460cd55b9f0eb977b5b1929759,86347,4.5
602094,12f0fe1f25f4d80a23695949014e8954,103255,3.5


In [100]:
df=df.rename(columns={
    'useri':'user_id',
    ' movie_id':'movie_id',
    ' rating':'rating',

    
})

df.drop(' tstamp ',axis=1,inplace=True)

## Recommendation Algorithms

- Collaborative Filtering: Matrix factorization, KNN, ALS.
- Content-Based Filtering: TF-IDF, cosine similarity, Bayesian classifiers.
- Hybrid Approaches: Hybrid MF, Factorization Machines, RBM.
- Deep Learning-Based: Neural CF, Autoencoders, CNNs, RNNs, Two Tower Network
- Graph-Based: GNNs, Personalized PageRank.

## Two Tower Network using Tensorflow/Keras

## Data Preprocessing

In [117]:

user_encoder = LabelEncoder()
df['user_id_encoded'] = user_encoder.fit_transform(df['user_id'])

movie_encoder = LabelEncoder()
df['movie_id_encoded'] = movie_encoder.fit_transform(df['movie_id'])

In [118]:
df.head()

Unnamed: 0,user_id,movie_id,rating,user_id_encoded,movie_id_encoded
0,8e7cebf9a234c064b75016249f2ac65e,1,5.0,1006,0
1,8e7cebf9a234c064b75016249f2ac65e,2,4.0,1006,1
2,8e7cebf9a234c064b75016249f2ac65e,3,4.0,1006,2
3,8e7cebf9a234c064b75016249f2ac65e,5,5.0,1006,4
4,8e7cebf9a234c064b75016249f2ac65e,6,4.0,1006,5


In [85]:
df.columns

Index(['user_id', 'movie_id', 'rating', 'user_id_encoded', 'movie_id_encoded'], dtype='object')

In [122]:
# Train-test split
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Extract features and labels for training
train_user_ids = train['user_id_encoded'].values
train_movie_ids = train['movie_id_encoded'].values
train_ratings = train['rating'].values

test_user_ids = test['user_id_encoded'].values
test_movie_ids = test['movie_id_encoded'].values
test_ratings = test['rating'].values


## Two Tower Model

In [123]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
from tensorflow.keras.models import Model

In [125]:

# Number of unique users and movies
num_users = df['user_id'].nunique()
num_movies = df['movie_id'].nunique()
embedding_dim = 10  # Number of dimensions for the embedding

# User tower
# placeholder that expects that each input will be a single value (user ID)
user_input = Input(shape=(1,), name='user_input')
#convert the user ID into a dense embedding vector of size embedding_dim(50)
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(user_input)
#remove extra dimension so the embedding becomes a simple 1D vector of size embedding_dim for(50) each user 
user_embedding = Flatten()(user_embedding)

# Movie tower
# placeholder that expects that each input will be a single value (movie ID)
movie_input = Input(shape=(1,), name='movie_input')
#convert the movie ID into a dense embedding vector of size embedding_dim(50)
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_dim, name='movie_embedding')(movie_input)
#remove extra dimension so the embedding becomes a simple 1D vector of size embedding_dim for(50) each movie
movie_embedding = Flatten()(movie_embedding)

# Dot product of user and movie embeddings to predict rating
dot_product = Dot(axes=1)([user_embedding, movie_embedding])

# Output layer for predicting rating
output = Dense(1, activation='linear')(dot_product)




In [134]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Create the model
model = Model(inputs=[user_input, movie_input], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
history = model.fit([train_user_ids, train_movie_ids], train_ratings, 
                    epochs=5, 
                    batch_size=64, 
                    validation_data=([test_user_ids, test_movie_ids], test_ratings))

Epoch 1/5
[1m12860/12860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 5ms/step - loss: 0.5260 - val_loss: 0.6272
Epoch 2/5
[1m12860/12860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 6ms/step - loss: 0.4995 - val_loss: 0.6279
Epoch 3/5
[1m12860/12860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 5ms/step - loss: 0.4802 - val_loss: 0.6290
Epoch 4/5
[1m12860/12860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 5ms/step - loss: 0.4696 - val_loss: 0.6294
Epoch 5/5
[1m12860/12860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 6ms/step - loss: 0.4630 - val_loss: 0.6305


In [16]:
model.summary()

## Model Evaluation

In [135]:
# Evaluate on the test set
test_loss = model.evaluate([test_user_ids, test_movie_ids], test_ratings)
print(f"Test loss (MSE): {test_loss}")

[1m6430/6430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - loss: 0.6279
Test loss (MSE): 0.6305448412895203


In [136]:
# Make predictions on the full test set
test_predictions = model.predict([test_user_ids, test_movie_ids])

# Add the original user IDs, movie IDs, actual ratings, and predicted ratings to a DataFrame
test_results = pd.DataFrame({
    'original_user_id': user_encoder.inverse_transform(test_user_ids),  # Convert encoded user IDs back to original
    'original_movie_id': movie_encoder.inverse_transform(test_movie_ids),  # Convert encoded movie IDs back to original
    'actual_rating': test_ratings,
    'predicted_rating': test_predictions.flatten()  # Flatten to a 1D array
})



[1m6430/6430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 3ms/step


In [140]:
test_results.sample(10)

Unnamed: 0,original_user_id,original_movie_id,actual_rating,predicted_rating
126964,4404f0dcbaa48eb84d53abf85a6534d7,7203,4.0,3.018996
28106,dd9a68f52fd23e04592bdf08af807371,68347,4.5,4.263574
164168,7500db258d55789ef86cbb29ffcb2016,267,0.5,1.738303
111790,f6ce380bdf17ac1411ed7b13bee948e3,90403,3.0,2.404511
161510,7cfe973cfd3353ecacc3ec1e53a1c5ea,2827,1.5,2.737557
205350,2e5e394fabcf87b202187c5b8c404cd5,68157,4.0,4.314604
14597,cfdf6bca597d8e26773cbafce04368b0,5945,1.5,0.934399
180189,8da0624bf90b42c5036563806962fa0f,141456,4.0,3.125163
194976,2c88d52366195507be47f7fcf4dca4d1,114935,5.0,4.312302
42595,e46267204fe0da35710877834f57a3b4,45722,2.0,2.873377


In [141]:
# Calculate the difference between actual and predicted ratings
test_results['rating_difference'] = abs(test_results['actual_rating'] - test_results['predicted_rating'])

# Filter by a specific user ID, for example:
filter_user_id = '8e7cebf9a234c064b75016249f2ac65e'  # Replace with the desired user ID

# Filter the DataFrame by the original user ID
filtered_results = test_results[test_results['original_user_id'] == filter_user_id]

# Sort by the closest rating difference (smallest difference first)
filtered_results_sorted = filtered_results.sort_values(by='rating_difference')

# Display sorted results
print(f"Predictions for user {filter_user_id}, ordered by closest rating difference:")
filtered_results_sorted

Predictions for user 8e7cebf9a234c064b75016249f2ac65e, ordered by closest rating difference:


Unnamed: 0,original_user_id,original_movie_id,actual_rating,predicted_rating,rating_difference
89960,8e7cebf9a234c064b75016249f2ac65e,4014,4.0,4.000362,0.000362
40795,8e7cebf9a234c064b75016249f2ac65e,47610,3.5,3.502150,0.002150
199452,8e7cebf9a234c064b75016249f2ac65e,7980,3.5,3.502495,0.002495
26733,8e7cebf9a234c064b75016249f2ac65e,26689,3.0,3.004020,0.004020
15931,8e7cebf9a234c064b75016249f2ac65e,3114,5.0,5.004139,0.004139
...,...,...,...,...,...
88350,8e7cebf9a234c064b75016249f2ac65e,4574,1.0,3.528172,2.528172
198013,8e7cebf9a234c064b75016249f2ac65e,192391,4.0,1.339447,2.660553
155089,8e7cebf9a234c064b75016249f2ac65e,5004,5.0,2.134781,2.865219
194056,8e7cebf9a234c064b75016249f2ac65e,4308,1.0,4.020574,3.020574
