<a href="https://colab.research.google.com/github/IdajiliJohnOjochegbe/Movie-Recommendation/blob/main/collaborative_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset path used later lives under this mount.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Install the scikit-surprise package.

In [2]:
pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357256 sha256=c93a7598a6645f12003062a96103ad95454aeb39be7cb6ef79c7575376239652
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a

In [3]:
import os
import pandas as pd

# Folder on the mounted Drive that holds the unpacked ml-100k files.
directory_path = '/content/drive/MyDrive/ml-100k/ml-100k'

# Show what the folder contains before reading anything from it.
directory_listing = os.listdir(directory_path)
print("Contents of the directory:", directory_listing)

# u.data is read as tab-separated text; the column names are supplied here
# because they are not taken from the file.
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
file_path = os.path.join(directory_path, 'u.data')
data = pd.read_csv(file_path, sep='\t', names=column_names)

# Preview the first rows of the ratings table.
print(data.head())


Contents of the directory: ['allbut.pl', 'mku.sh', 'u1.test', 'u2.base', 'u1.base', 'u.genre', 'u.info', 'u.occupation', 'u.user', 'u.data', 'u2.test', 'README', 'u.item', 'u3.base', 'u3.test', 'u5.base', 'ua.test', 'u4.base', 'ub.base', 'u5.test', 'ub.test', 'ua.base', 'u4.test']
   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


In [4]:
!pip install tensorflow tensorflow-recommenders


Collecting tensorflow-recommenders
  Downloading tensorflow_recommenders-0.7.3-py3-none-any.whl (96 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-recommenders
Successfully installed tensorflow-recommenders-0.7.3


In [5]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
import numpy as np

def _version_tuple(version):
    """Turn a version string such as "2.15.0" or "v0.7.3" into a tuple of ints.

    A leading "v" is dropped, only the leading digits of each dot-separated
    piece are kept, parsing stops at the first piece without leading digits,
    and the result is zero-padded to at least three components.
    """
    numbers = []
    for piece in str(version).lstrip("vV").split("."):
        digits = piece[: len(piece) - len(piece.lstrip("0123456789"))]
        if not digits:
            break
        numbers.append(int(digits))
    numbers.extend([0] * (3 - len(numbers)))
    return tuple(numbers)


# Compare versions numerically. Comparing the raw strings is lexicographic,
# so e.g. "10.0" >= "2.0" would be False.
assert _version_tuple(tf.__version__) >= (2, 0, 0), tf.__version__
assert _version_tuple(tfrs.__version__) >= (0, 6, 0), tfrs.__version__

# `data` is the ratings DataFrame loaded by an earlier cell.
# The model defined below looks IDs up with StringLookup, so the dataset and the
# vocabularies are built from string IDs. `data` itself is left untouched.
user_id_strings = data["user_id"].astype(str)
item_id_strings = data["item_id"].astype(str)

# One dataset element per rating row.
ratings = tf.data.Dataset.from_tensor_slices({
    "user_id": user_id_strings.values,
    "item_id": item_id_strings.values,
    "rating": data["rating"].values
})

# Distinct user and movie IDs (the vocabularies).
user_ids = user_id_strings.unique()
movie_ids = item_id_strings.unique()

# Shuffle once with a fixed seed; reshuffle_each_iteration=False keeps the order
# fixed across iterations, which the take/skip split below relies on.
tf.random.set_seed(42)
shuffled = ratings.shuffle(len(data), seed=42, reshuffle_each_iteration=False)

# First 80% of the shuffled rows for training, the remainder for testing.
n_train = int(0.8 * len(data))
train = shuffled.take(n_train)
test = shuffled.skip(n_train)


In [6]:
# The model's StringLookup vocabularies are built from these IDs, so store both ID columns as strings.
for id_column in ("user_id", "item_id"):
    data[id_column] = data[id_column].astype(str)

# Distinct IDs as plain Python lists.
user_ids = data["user_id"].unique().tolist()
movie_ids = data["item_id"].unique().tolist()


In [7]:
class MovieLensModel(tfrs.Model):
  """Rating model: user and movie ID embeddings feeding a small dense network.

  NOTE(review): a class with this same name is defined again in a later cell;
  once that cell runs, the later definition replaces this one.
  """

  def __init__(self, user_vocab, movie_vocab):
    """Build the two embedding stacks, the rating network and the ranking task.

    Args:
      user_vocab: user IDs passed to StringLookup as its vocabulary.
      movie_vocab: movie IDs passed to StringLookup as its vocabulary.
    """
    super().__init__()
    embedding_dimension = 32

    def id_embedding(vocabulary):
      # String ID -> integer index -> dense vector. The table has one more row
      # than the vocabulary has entries.
      return tf.keras.Sequential([
        tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
        tf.keras.layers.Embedding(len(vocabulary) + 1, embedding_dimension)
      ])

    # User and item embeddings
    self.user_embeddings = id_embedding(user_vocab)
    self.movie_embeddings = id_embedding(movie_vocab)

    # Rating prediction layers: two ReLU layers, then a single linear output.
    rating_layers = [
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      tf.keras.layers.Dense(1)
    ]
    self.ratings = tf.keras.Sequential(rating_layers)

    # Task to optimize: mean squared error loss, RMSE reported as a metric.
    self.task = tfrs.tasks.Ranking(
      loss=tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def compute_loss(self, features, training=False):
    """Return the task's loss for one batch.

    Args:
      features: mapping indexed by "user_id", "item_id" and "rating".
      training: accepted for interface compatibility; not used here.
    """
    # Embed both IDs.
    user_vectors = self.user_embeddings(features["user_id"])
    movie_vectors = self.movie_embeddings(features["item_id"])

    # Join the two embeddings along axis 1 and predict the rating.
    joined = tf.concat([user_vectors, movie_vectors], axis=1)
    predicted = self.ratings(joined)

    # The task computes the loss and updates its metrics.
    return self.task(labels=features["rating"], predictions=predicted)

# Instantiate the model
model = MovieLensModel(user_vocab=user_ids, movie_vocab=movie_ids)


In [10]:
# The lookup layers take string IDs, so make sure both ID columns hold strings.
id_columns = ("user_id", "item_id")
for column in id_columns:
    data[column] = data[column].astype(str)

# Distinct user and movie IDs as Python lists.
user_ids, movie_ids = (data[column].unique().tolist() for column in id_columns)


In [11]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
import numpy as np

# IDs must be strings for the lookup layers; cast both ID columns in place.
for id_column in ("user_id", "item_id"):
    data[id_column] = data[id_column].astype(str)

# One dataset element per rating row, carrying the three columns the model reads.
feature_columns = ("user_id", "item_id", "rating")
ratings = tf.data.Dataset.from_tensor_slices(
    {column: data[column].values for column in feature_columns}
)

# Fixed-seed shuffle; reshuffle_each_iteration=False keeps the order fixed
# across iterations, which the take/skip split below relies on.
tf.random.set_seed(42)
n_ratings = len(data)
shuffled = ratings.shuffle(n_ratings, seed=42, reshuffle_each_iteration=False)

# First 80% of the shuffled rows for training, the remainder for testing.
n_train = int(0.8 * n_ratings)
train = shuffled.take(n_train)
test = shuffled.skip(n_train)

# Batch, then cache the batched datasets.
cached_train = train.batch(8192).cache()
cached_test = test.batch(4096).cache()


In [12]:
class MovieLensModel(tfrs.Model):
  """Predicts a rating from a (user ID, movie ID) pair.

  Each ID is looked up in its vocabulary and embedded; the two embeddings are
  concatenated and passed through a dense network with a single output.
  The task uses mean squared error as the loss and reports RMSE.
  """

  def __init__(self, user_vocab, movie_vocab, embedding_dimension=32):
    """Build the embedding stacks, the rating network and the ranking task.

    Args:
      user_vocab: user ID strings used as the StringLookup vocabulary.
      movie_vocab: movie ID strings used as the StringLookup vocabulary.
      embedding_dimension: width of each embedding vector. Defaults to 32,
        the value that was previously hard-coded.
    """
    super().__init__()

    # User and item embeddings. Each table has one more row than its
    # vocabulary has entries.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(vocabulary=user_vocab, mask_token=None),
      tf.keras.layers.Embedding(len(user_vocab) + 1, embedding_dimension)
    ])
    self.movie_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(vocabulary=movie_vocab, mask_token=None),
      tf.keras.layers.Embedding(len(movie_vocab) + 1, embedding_dimension)
    ])

    # Rating prediction layers: two ReLU layers, then a single linear output.
    self.ratings = tf.keras.Sequential([
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      tf.keras.layers.Dense(1)
    ])

    # Task to optimize
    self.task = tfrs.tasks.Ranking(
      loss=tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def compute_loss(self, features, training=False):
    """Return the task's loss for one batch.

    Args:
      features: mapping indexed by "user_id", "item_id" and "rating".
      training: forwarded to the rating network so that any training-dependent
        layer added to it later behaves correctly. The current Dense-only
        stack gives the same result either way.

    Returns:
      Whatever the ranking task returns for the given labels and predictions.
    """
    # We pick out the user IDs and movie IDs from the features.
    user_embeddings = self.user_embeddings(features["user_id"])
    movie_embeddings = self.movie_embeddings(features["item_id"])

    # We compute the predictions from the concatenated embeddings.
    ratings = self.ratings(
      tf.concat([user_embeddings, movie_embeddings], axis=1),
      training=training,
    )

    # We compute the loss and metrics.
    return self.task(
      labels=features["rating"],
      predictions=ratings,
    )

# Instantiate the model
model = MovieLensModel(user_ids, movie_ids)


In [13]:
# Compile with Adagrad at a learning rate of 0.1.
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=optimizer)

# Train for 100 epochs on the cached training batches; the value returned by fit is the cell's output.
model.fit(cached_train, epochs=100)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7b8cc625f940>

In [14]:
# Score the cached test batches; return_dict=True gives the metrics keyed by name.
test_metrics = model.evaluate(cached_test, return_dict=True)
test_metrics




{'root_mean_squared_error': 0.9449378848075867,
 'loss': 0.8783496022224426,
 'regularization_loss': 0,
 'total_loss': 0.8783496022224426}