# **Gemma as a Recommendation System**
# *Fine Tuning Gemma 2B Instruction Model for Recommendation Using LoRA*


## Setup to load the Model from Kaggle

In [None]:
import os
from google.colab import userdata

# Note: `userdata.get` is a Colab API. If you're not using Colab, set the env
# vars as appropriate for your system.

# Copy each Kaggle secret from the Colab secret store into the process
# environment under the same name.
for _kaggle_secret in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
    os.environ[_kaggle_secret] = userdata.get(_kaggle_secret)

In [None]:
# Install Keras 3 last. See https://keras.io/getting_started/ for more details.
!pip install -q -U keras-nlp
!pip install -q -U "keras>=3"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m792.1/792.1 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25h

## Setup for backend

In [None]:

# Backend configuration, set here before `keras` is imported further down
# (see https://keras.io/getting_started/ for how the backend is selected).
os.environ.update(
    {
        # Or "torch" or "tensorflow".
        "KERAS_BACKEND": "jax",
        # Avoid memory fragmentation on JAX backend.
        "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.00",
    }
)

## Loading the Dataset

In [None]:
import json

In [None]:
# Mount Google Drive at /content/drive; the dataset files and LoRA checkpoints
# used throughout this notebook are read from / written to paths under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Loading the preprocessed Dataset
# `prompts` is the JSON content of train_prompt_set.json (indexed and passed to
# `fit` below). The `with` block closes the file even if json.load raises.
with open("/content/drive/MyDrive/Capstone_project_data/MovieLens_Data/train_prompt_set.json", "r") as f:
    prompts = json.load(f)

In [None]:
print(prompts[0])

<bos>
<start_of_turn>user
Given below is a user's profile and a target movie. The profile contains up to 10 movies the user has rated.
Your task is to predict the user's rating for the target movie.

User Profile:
Movie name: "Kolya (1996)", Movie Genre: Comedy Rating: 3,
Movie name: "Mrs. Doubtfire (1993)", Movie Genre: Comedy Rating: 4,
Movie name: "Muriel's Wedding (1994)", Movie Genre: Comedy, Romance Rating: 4,
Movie name: "Shall We Dance? (1996)", Movie Genre: Comedy Rating: 3,
Movie name: "Stand by Me (1986)", Movie Genre: Adventure, Comedy, Drama Rating: 5,
Movie name: "Ace Ventura: Pet Detective (1994)", Movie Genre: Comedy Rating: 5,
Movie name: "Mrs. Brown (Her Majesty, Mrs. Brown) (1997)", Movie Genre: Drama, Romance Rating: 4,
Movie name: "Raising Arizona (1987)", Movie Genre: Comedy Rating: 4,
Movie name: "Being There (1979)", Movie Genre: Comedy Rating: 5,
Movie name: "Truth About Cats & Dogs, The (1996)", Movie Genre: Comedy, Romance Rating: 4

Target movie: Kolya (1996

In [None]:
len(prompts)

79619

## Loading the Model

In [None]:
import keras
import keras_nlp

In [None]:
# Alternative preset (currently disabled):
# gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma2_2b_en")

# Load the "gemma_instruct_2b_en" preset as a causal LM and print its summary.
# The preset files are downloaded from Kaggle on first use (see the log below).
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma_instruct_2b_en")
gemma_lm.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_instruct_2b_en/3/download/config.json...


100%|██████████| 785/785 [00:00<00:00, 1.00MB/s]


Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_instruct_2b_en/3/download/model.weights.h5...


100%|██████████| 4.67G/4.67G [02:15<00:00, 36.9MB/s]


Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_instruct_2b_en/3/download/tokenizer.json...


100%|██████████| 591/591 [00:00<00:00, 898kB/s]


Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_instruct_2b_en/3/download/assets/tokenizer/vocabulary.spm...


100%|██████████| 4.04M/4.04M [00:00<00:00, 5.24MB/s]


### Prompt for training the model.

In [None]:
print(prompts[0])

<bos>
<start_of_turn>user
Given below is a user's profile and a target movie. The profile contains up to 10 movies the user has rated.
Your task is to predict the user's rating for the target movie.

User Profile:
Movie name: "Kolya (1996)", Movie Genre: Comedy Rating: 3,
Movie name: "Mrs. Doubtfire (1993)", Movie Genre: Comedy Rating: 4,
Movie name: "Muriel's Wedding (1994)", Movie Genre: Comedy, Romance Rating: 4,
Movie name: "Shall We Dance? (1996)", Movie Genre: Comedy Rating: 3,
Movie name: "Stand by Me (1986)", Movie Genre: Adventure, Comedy, Drama Rating: 5,
Movie name: "Ace Ventura: Pet Detective (1994)", Movie Genre: Comedy Rating: 5,
Movie name: "Mrs. Brown (Her Majesty, Mrs. Brown) (1997)", Movie Genre: Drama, Romance Rating: 4,
Movie name: "Raising Arizona (1987)", Movie Genre: Comedy Rating: 4,
Movie name: "Being There (1979)", Movie Genre: Comedy Rating: 5,
Movie name: "Truth About Cats & Dogs, The (1996)", Movie Genre: Comedy, Romance Rating: 4

Target movie: Kolya (1996

In [None]:
# Model output before training
# Baseline: generate from the first training prompt with the weights as loaded
# from the preset (LoRA is only enabled in a later cell).

gemma_lm.generate(prompts[0])

'<bos>\n<start_of_turn>user\nGiven below is a user\'s profile and a target movie. The profile contains up to 10 movies the user has rated.\nYour task is to predict the user\'s rating for the target movie.\n\nUser Profile:\nMovie name: "Kolya (1996)", Movie Genre: Comedy Rating: 3,\nMovie name: "Mrs. Doubtfire (1993)", Movie Genre: Comedy Rating: 4,\nMovie name: "Muriel\'s Wedding (1994)", Movie Genre: Comedy, Romance Rating: 4,\nMovie name: "Shall We Dance? (1996)", Movie Genre: Comedy Rating: 3,\nMovie name: "Stand by Me (1986)", Movie Genre: Adventure, Comedy, Drama Rating: 5,\nMovie name: "Ace Ventura: Pet Detective (1994)", Movie Genre: Comedy Rating: 5,\nMovie name: "Mrs. Brown (Her Majesty, Mrs. Brown) (1997)", Movie Genre: Drama, Romance Rating: 4,\nMovie name: "Raising Arizona (1987)", Movie Genre: Comedy Rating: 4,\nMovie name: "Being There (1979)", Movie Genre: Comedy Rating: 5,\nMovie name: "Truth About Cats & Dogs, The (1996)", Movie Genre: Comedy, Romance Rating: 4\n\nTarg

In [None]:
# Enabling LoRA for the model (LoRA rank to 4)
gemma_lm.backbone.enable_lora(rank=4)
# Print the summary again so the parameter counts can be compared with the
# summary printed right after loading the preset.
gemma_lm.summary()

In [None]:
# Limit the input sequence length to 1000 (to control memory usage).
# This is the same limit that is passed as `max_length` when generating on the
# test prompts later in the notebook.
gemma_lm.preprocessor.sequence_length = 1000
# Use AdamW (a common optimizer for transformer models).
optimizer = keras.optimizers.AdamW(
    learning_rate=5e-5,
    weight_decay=0.01,
)
# Exclude layernorm and bias terms from decay.
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

# The loss is computed on raw logits (from_logits=True); accuracy is tracked
# as a weighted metric.
gemma_lm.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optimizer,
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

In [None]:
from keras.callbacks import Callback

# Callback that writes the LoRA weights to disk after each epoch

class SaveLoRAWeightsCallback(Callback):
    """Keras callback that saves the backbone's LoRA weights once per epoch.

    Files are named ``<base_path>_epoch_NN.lora.h5`` where ``NN`` is the
    1-based, zero-padded epoch number.
    """

    def __init__(self, base_path):
        """Remember the path prefix used for every checkpoint file.

        Args:
            base_path: Path prefix (directory plus file-name stem) to which
                the epoch suffix and ``.lora.h5`` extension are appended.
        """
        super().__init__()
        self.base_path = base_path

    def on_epoch_end(self, epoch, logs=None):
        """Save the LoRA weights of ``self.model.backbone`` for this epoch.

        Args:
            epoch: Epoch index passed by Keras; 1 is added for the file name.
            logs: Unused.
        """
        epoch_number = epoch + 1
        checkpoint_path = f"{self.base_path}_epoch_{epoch_number:02d}.lora.h5"
        self.model.backbone.save_lora_weights(checkpoint_path)
        print(f"\nLoRA weights saved to {checkpoint_path}")

In [None]:
# Per-epoch checkpoints go to Drive; the callback appends
# "_epoch_NN.lora.h5" to this path prefix.
save_callback = SaveLoRAWeightsCallback("/content/drive/MyDrive/Capstone_project_data/RecSys_Gemma_Inst_model_All_DP_2_epoch_plus")


In [None]:
# Fine-tune on the full prompt list with batch_size=1 (one prompt per step);
# save_callback writes the LoRA weights at the end of every epoch, so an
# interrupted run still leaves the checkpoints of the completed epochs.
gemma_lm.fit(prompts, epochs=10, batch_size=1, callbacks=[save_callback])

Epoch 1/10
[1m79586/79619[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m3s[0m 111ms/step - loss: 0.0211 - sparse_categorical_accuracy: 0.9907
LoRA weights saved to /content/drive/MyDrive/Capstone_project_data/RecSys_Gemma_Inst_model_All_DP_2_epoch_plus_epoch_01.lora.h5
[1m79619/79619[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8869s[0m 111ms/step - loss: 0.0211 - sparse_categorical_accuracy: 0.9907
Epoch 2/10
[1m79489/79619[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m14s[0m 111ms/step - loss: 0.0192 - sparse_categorical_accuracy: 0.9914
LoRA weights saved to /content/drive/MyDrive/Capstone_project_data/RecSys_Gemma_Inst_model_All_DP_2_epoch_plus_epoch_02.lora.h5
[1m79619/79619[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8844s[0m 111ms/step - loss: 0.0192 - sparse_categorical_accuracy: 0.9914
Epoch 3/10
[1m46958/79619[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m1:12:06[0m 132ms/step - loss: 0.0183 - sparse_categorical_accuracy: 0.9918

In [None]:
# Spot-check: generate a rating for one hand-written prompt after training.
# NOTE(review): this prompt ends with "<start_of_turn>model" without a trailing
# "\n", whereas the commented-out test-prompt builder further below ends with
# "<start_of_turn>model\n" — confirm which form matches the training prompts.
gemma_lm.generate("<bos>\n<start_of_turn>user\nGiven below is a user\'s profile and a target movie. The profile contains up to 10 movies the user has rated.\nYour task is to predict the user\'s rating for the target movie.\n\nUser Profile:\nMovie name: \"Broken Arrow (1996)\", Movie Genre: Action, Thriller Rating: 2,\nMovie name: \"Sword in the Stone, The (1963)\", Movie Genre: Animation, Children\'s Rating: 4,\nMovie name: \"Roommates (1995)\", Movie Genre: Comedy, Drama Rating: 2,\nMovie name: \"2001: A Space Odyssey (1968)\", Movie Genre: Drama, Mystery, Sci-Fi, Thriller Rating: 3,\nMovie name: \"Assassins (1995)\", Movie Genre: Thriller Rating: 3,\nMovie name: \"Under Siege (1992)\", Movie Genre: Action Rating: 4,\nMovie name: \"Crow, The (1994)\", Movie Genre: Action, Romance, Thriller Rating: 4,\nMovie name: \"Toy Story (1995)\", Movie Genre: Animation, Children\'s, Comedy Rating: 5,\nMovie name: \"Gandhi (1982)\", Movie Genre: Drama Rating: 4,\nMovie name: \"Empire Strikes Back, The (1980)\", Movie Genre: Action, Adventure, Drama, Romance, Sci-Fi, War Rating: 4\n\nTarget movie: Broken Arrow (1996), target movie genre: Action, Thriller\nPredict the rating the user will give the target movie by selecting a number between 0-5.\nWhat will the user rate the target movie \"Broken Arrow (1996)\"?\n<end_of_turn>\n<start_of_turn>model")

In [None]:

# Manually save the current LoRA weights to Drive.
# NOTE(review): the file name says "epoch_05", but the visible training log
# stops during epoch 3 — confirm which training state this file holds.
gemma_lm.backbone.save_lora_weights("/content/drive/MyDrive/Capstone_project_data/Test_RecSys_Gemma_Inst_model_All_DP_2_epoch_plus_epoch_05.lora.h5")

# Testing the model

## Preparing the test dataset

In [None]:
# test_df = pd.read_csv("/content/drive/MyDrive/Capstone_project_data/MovieLens_Data/test_data.csv")

In [None]:
# test_df

In [None]:
# user_profile_df = pd.read_csv("/content/drive/MyDrive/Capstone_project_data/MovieLens_Data/user_profile.csv")

In [None]:
# user_profile_df

In [None]:
# movies_data = pd.read_csv("/content/drive/MyDrive/Capstone_project_data/MovieLens_Data/movies_data.csv")

In [None]:
# movies_data

In [None]:
# test_prompt_set = []

# actual_ratings = []
# for _, row in test_df.iterrows():
#     prompt = "Given below is a user's profile and a target movie. The profile contains up to 10 movies the user has rated.\nYour task is to predict the user's rating for the target movie."
#     user_id = int(row['user_id'])
#     item_id = int(row['item_id'])
#     rating = int(row['rating'])

#     user_profile = user_profile_df[user_profile_df["user_id"] == user_id]["train_movies"].values[0]
#     target_movie = movies_data[movies_data["movie_id"] == item_id]["movie_title"].values[0]
#     target_movie_genre = movies_data[movies_data["movie_id"] == item_id]["genre_string"].values[0]

#     # Adding the tags that are required for fine-tuning gemma-2b-it
#     formatted_prompt = (
#         "<bos>\n"
#         "<start_of_turn>user\n"
#         f"{prompt}\n\n"
#         f"User Profile:\n{user_profile}\n\n"
#         f"Target movie: {target_movie}, target movie genre: {target_movie_genre}\n"
#         f"Predict the rating the user will give the target movie by selecting a number between 0-5.\n"
#         f"What will the user rate the target movie \"{target_movie}\"?\n"
#         "<end_of_turn>\n"
#         "<start_of_turn>model\n"
#     )

#     test_prompt_set.append(formatted_prompt)
#     actual_ratings.append(rating)
#     print(f"{formatted_prompt}\n__________________________________")

In [None]:
# with open("/content/drive/MyDrive/Capstone_project_data/MovieLens_Data/test_prompt_set.json", "w") as f:
#     json.dump(test_prompt_set, f)

In [None]:
# with open("/content/drive/MyDrive/Capstone_project_data/MovieLens_Data/actual_rating.json", "w") as f:
#     json.dump(actual_ratings, f)


## Loading the Test Dataset

In [None]:
import pandas as pd
import json

In [None]:
# Load the test prompts from test_prompt_set.json (the commented-out cells
# above show how that file was written).
with open("/content/drive/MyDrive/Capstone_project_data/MovieLens_Data/test_prompt_set.json", "r") as f:
    test_prompt_set_new = json.load(f)

In [None]:
print(test_prompt_set_new[0])

<bos>
<start_of_turn>user
Given below is a user's profile and a target movie. The profile contains up to 10 movies the user has rated.
Your task is to predict the user's rating for the target movie.

User Profile:
Movie name: "Cold Comfort Farm (1995)", Movie Genre: Comedy Rating: 5,
Movie name: "Rumble in the Bronx (1995)", Movie Genre: Action, Adventure, Crime Rating: 4,
Movie name: "Ponette (1996)", Movie Genre: Drama Rating: 4,
Movie name: "That Thing You Do! (1996)", Movie Genre: Comedy Rating: 5,
Movie name: "Contact (1997)", Movie Genre: Drama, Sci-Fi Rating: 3,
Movie name: "Bogus (1996)", Movie Genre: Children's, Drama, Fantasy Rating: 1,
Movie name: "Saint, The (1997)", Movie Genre: Action, Romance, Thriller Rating: 3,
Movie name: "Fly Away Home (1996)", Movie Genre: Adventure, Children's Rating: 5,
Movie name: "Men in Black (1997)", Movie Genre: Action, Adventure, Comedy, Sci-Fi Rating: 5,
Movie name: "Postino, Il (1994)", Movie Genre: Drama, Romance Rating: 5

Target movie: 

In [None]:
# Loading the trained model weights.
# This is the checkpoint written by the per-epoch callback after epoch 2.
gemma_lm.backbone.load_lora_weights("/content/drive/MyDrive/Capstone_project_data/RecSys_Gemma_Inst_model_All_DP_2_epoch_plus_epoch_02.lora.h5")

In [None]:
gemma_lm.generate(test_prompt_set_new[5])

In [None]:
# Prompting the model on test prompts
# Generates one completion per test prompt (max_length=1000) and collects the
# full returned text; every output is also printed as it is produced.
# NOTE(review): the variable name says "4_epochs", but the checkpoint loaded
# above is "..._epoch_02" — confirm which checkpoint these outputs belong to.

trained_model_outputs_4_epochs = []

for prompt in test_prompt_set_new:
  output = gemma_lm.generate(prompt, max_length=1000)
  print(f"Output: \n{output}\n___________________________________________")
  trained_model_outputs_4_epochs.append(output)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Target movie: Client, The (1994), target movie genre: Drama, Mystery, Thriller
Predict the rating the user will give the target movie by selecting a number between 0-5.
What will the user rate the target movie "Client, The (1994)"?
<end_of_turn>
<start_of_turn>model
3
<end_of_turn>
___________________________________________
Output: 
<bos>
<start_of_turn>user
Given below is a user's profile and a target movie. The profile contains up to 10 movies the user has rated.
Your task is to predict the user's rating for the target movie.

User Profile:
Movie name: "American President, The (1995)", Movie Genre: Comedy, Drama, Romance Rating: 5,
Movie name: "Sound of Music, The (1965)", Movie Genre: Musical Rating: 5,
Movie name: "Birds, The (1963)", Movie Genre: Horror Rating: 4,
Movie name: "Jerry Maguire (1996)", Movie Genre: Drama, Romance Rating: 3,
Movie name: "Terminator 2: Judgment Day (1991)", Movie Genre: Action, Sci-Fi, T

In [None]:
# Saving the model outputs
# Writes the list of generated texts to Drive as JSON.
with open("/content/drive/MyDrive/Capstone_project_data/MovieLens_Data/trained_model_outputs_4_epochs.json", "w") as f:
    json.dump(trained_model_outputs_4_epochs, f)

In [None]:
# Loading the model outputs
# Reads back the JSON file written by the previous cell; presumably this lets
# the evaluation cells run without repeating the generation loop.

with open("/content/drive/MyDrive/Capstone_project_data/MovieLens_Data/trained_model_outputs_4_epochs.json", "r") as f:
    trained_model_outputs_4_epochs = json.load(f)

In [None]:
# # Loading the model outputs

# with open("/content/drive/MyDrive/Capstone_project_data/MovieLens_Data/trained_model_outputs_4_epochs.json", "r") as f:
#     trained_model_outputs = json.load(f)

In [None]:
# Loading the actual rating for the test data
# NOTE(review): the evaluation below compares these element-wise with the
# parsed predictions, so this list is assumed to be in the same order as the
# test prompts — confirm.
with open("/content/drive/MyDrive/Capstone_project_data/MovieLens_Data/actual_rating.json", "r") as f:
    actual_ratings = json.load(f)

In [None]:
actual_ratings[5]

4

In [None]:
trained_model_outputs_4_epochs[0]

'<bos>\n<start_of_turn>user\nGiven below is a user\'s profile and a target movie. The profile contains up to 10 movies the user has rated.\nYour task is to predict the user\'s rating for the target movie.\n\nUser Profile:\nMovie name: "Cold Comfort Farm (1995)", Movie Genre: Comedy Rating: 5,\nMovie name: "Rumble in the Bronx (1995)", Movie Genre: Action, Adventure, Crime Rating: 4,\nMovie name: "Ponette (1996)", Movie Genre: Drama Rating: 4,\nMovie name: "That Thing You Do! (1996)", Movie Genre: Comedy Rating: 5,\nMovie name: "Contact (1997)", Movie Genre: Drama, Sci-Fi Rating: 3,\nMovie name: "Bogus (1996)", Movie Genre: Children\'s, Drama, Fantasy Rating: 1,\nMovie name: "Saint, The (1997)", Movie Genre: Action, Romance, Thriller Rating: 3,\nMovie name: "Fly Away Home (1996)", Movie Genre: Adventure, Children\'s Rating: 5,\nMovie name: "Men in Black (1997)", Movie Genre: Action, Adventure, Comedy, Sci-Fi Rating: 5,\nMovie name: "Postino, Il (1994)", Movie Genre: Drama, Romance Ratin

In [None]:
import re


# Extracting the predicted rating from the model output.
# Each output holds the prompt followed by the model turn; the rating is the
# integer between "<start_of_turn>model" and the closing "<end_of_turn>".
rating_pattern = re.compile(r"<start_of_turn>model\s*(\d+)\s*<end_of_turn>")

preds = []
missing_indices = []  # positions of outputs without a parsable rating
for index, output in enumerate(trained_model_outputs_4_epochs):
    pred_rating = rating_pattern.search(output)

    if pred_rating:
        predicted_rating = int(pred_rating.group(1))
    else:
        # Record the miss as NaN. Reusing the value left over from the previous
        # iteration would silently count a wrong prediction (and would raise
        # NameError if the very first output had no match).
        print("Predicted rating not found.")
        predicted_rating = float("nan")
        missing_indices.append(index)
    # Exactly one entry per output keeps `preds` index-aligned with the test set.
    preds.append(predicted_rating)

if missing_indices:
    print(
        f"{len(missing_indices)} of {len(preds)} outputs had no parsable rating; "
        f"first indices: {missing_indices[:10]}"
    )

In [None]:
preds

[5,
 1,
 4,
 5,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 1,
 4,
 5,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 3,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 3,
 5,
 4,
 4,
 4,
 5,
 5,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 5,
 5,
 4,
 4,
 1,
 4,
 4,
 3,
 4,
 4,
 3,
 3,
 5,
 4,
 4,
 5,
 4,
 5,
 4,
 3,
 4,
 4,
 1,
 4,
 4,
 4,
 4,
 5,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 3,
 5,
 4,
 3,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 5,
 4,
 4,
 4,
 5,
 4,
 4,
 3,
 4,
 4,
 5,
 4,
 5,
 4,
 1,
 3,
 4,
 4,
 4,
 1,
 3,


In [None]:
# # Load later
# with open("/content/drive/MyDrive/Capstone_project_data/MovieLens_Data/actual_rating.json", "r") as f:
#     act_ratings = json.load(f)

In [None]:
# actual_ratings

In [None]:
import numpy as np

# Compare the parsed predictions with the ground-truth ratings.
# dtype=float lets missing predictions (NaN or None) be represented as NaN.
actual = np.asarray(actual_ratings, dtype=float)
predicted = np.asarray(preds, dtype=float)

# The metrics are element-wise, so both arrays must describe the same prompts.
if actual.shape != predicted.shape:
    raise ValueError(
        f"Got {actual.shape[0] if actual.ndim else 0} actual ratings but "
        f"{predicted.shape[0] if predicted.ndim else 0} predictions."
    )

# Leave out prompts for which no rating could be parsed, and say so,
# instead of letting NaN propagate into the metrics.
has_prediction = ~np.isnan(predicted)
n_missing = int((~has_prediction).sum())
if n_missing:
    print(f"Ignoring {n_missing} prompt(s) without a parsed prediction.")
if not has_prediction.any():
    raise ValueError("No valid predictions to evaluate.")

errors = actual[has_prediction] - predicted[has_prediction]

# RMSE
rmse = np.sqrt(np.mean(errors ** 2))
print(f"RMSE: {rmse:.4f}")

# MAE
mae = np.mean(np.abs(errors))
print(f"MAE: {mae:.5f}")

RMSE: 1.1625
MAE: 0.81934
