**C23-PS046 - AdaJob Recommender System**

This Colab notebook trains and exports the recommender system model. Its output is a zipped recommender model file.

# Preparation

In [None]:
#install TensorFlow Recommenders (tfrs)

!pip install -q tensorflow-recommenders

In [None]:
#import the necessary modules

import tensorflow as tf                   #a library for building ML models
import tensorflow_recommenders as tfrs    #a library for building recommender system models
import numpy as np                        #used to count num of unique user and airdrop jobs in the dataset
import pandas as pd                       #to read csv file into pandas dataframe
import shutil                             #to save the exported model files into a zip
from typing import Dict, Text             #used in the loss computation of the combined model

In [None]:
#read both csv files into pandas dataframes
#the id columns are forced to str so they are read as strings, not numbers
#(the user tower later looks users up through a StringLookup layer)

df_airdrops = pd.read_csv('/content/Dataset Recommendation System - Task Dataset.csv', dtype={'task_id':'str'})
df_enrollments = pd.read_csv('/content/Dataset Recommendation System - Enrollment Dataset_enroll only.csv',  dtype={'user_id': 'str', 'task_id':'str'})

In [None]:
#check the enrollment dataframe: print its columns, non-null counts and dtypes

df_enrollments.info()

In [None]:
#convert both dataframes to tf.data datasets
#enrollments keeps only the task title and the user id of every enrollment row
#airdrops keeps only the task title of every task row

enrollments = tf.data.Dataset.from_tensor_slices(dict(df_enrollments)).map(lambda x: {
    "task_title": x["task_title"],
    "user_id": x["user_id"],
})

airdrops = tf.data.Dataset.from_tensor_slices(dict(df_airdrops)).map(lambda x: x["task_title"])

#sanity check: show the dataset classes that were produced
print(type(enrollments))
print(type(airdrops))

In [None]:
#shuffle the data once with a fixed seed so the train/test split is reproducible
#the shuffle buffer spans the whole dataset (621 rows at the time of writing);
#it is derived from the data instead of being hard-coded so that a refreshed
#csv is still shuffled across all of its rows

tf.random.set_seed(42)
num_enrollments = len(enrollments)
shuffled = enrollments.shuffle(num_enrollments, seed=42, reshuffle_each_iteration=False)
print(len(shuffled))

In [None]:
#divide the shuffled data into train and test set by a ratio of 80:20
#the sizes are computed from the dataset length (497 / 124 for 621 rows)
#so the split stays 80:20 and keeps every row when the dataset size changes

train_fraction = 0.8
num_examples = len(shuffled)
train_size = round(train_fraction * num_examples)
test_size = num_examples - train_size

train = shuffled.take(train_size)
test = shuffled.skip(train_size).take(test_size)

In [None]:
#sanity check: number of examples in the train and test sets
print(len(train))
print(len(test))

In [None]:
#create arrays of unique airdrop titles and unique user ids to be used as vocabulary for the model's embedding later
#the datasets are batched so their values can be pulled out batch by batch,
#np.concatenate joins the batches into one array and np.unique removes duplicates

airdrop_titles = airdrops.batch(100)
user_ids = enrollments.batch(1000).map(lambda x: x["user_id"])

unique_airdrop_titles = np.unique(np.concatenate(list(airdrop_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

#preview the first ten titles of the vocabulary
print(unique_airdrop_titles[:10])
#unique_user_ids[:10]

In [None]:
#sanity check: show the container type of both vocabularies
print(type(unique_airdrop_titles))
print(type(unique_user_ids))

In [None]:
#vocabulary sizes; the embedding layers of both towers are sized from these
print(len(unique_airdrop_titles))
print(len(unique_user_ids))

# Create the Model

In [None]:
#set the embedding dimension
#both the user tower and the airdrop tower produce vectors of this length

embedding_dimension = 32

In [None]:
#define the user model to embed the user ids as the query tower
#StringLookup turns a user id string into an integer index using the user id vocabulary,
#Embedding turns that index into a vector of length embedding_dimension

user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_user_ids, mask_token=None),
  # Additional +1 in embedding is for unknown tokens.
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

In [None]:
#define the airdrop model to embed the airdrop titles as the candidate tower
#StringLookup turns a title string into an integer index using the title vocabulary,
#Embedding turns that index into a vector of length embedding_dimension

airdrop_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_airdrop_titles, mask_token=None),
  # Additional +1 in embedding is for unknown tokens.
  tf.keras.layers.Embedding(len(unique_airdrop_titles) + 1, embedding_dimension)
])

In [None]:
#define the metrics as FactorizedTopK (Top1, Top5, Top10, Top50, Top100)
#the candidates are the embeddings of every airdrop title, computed with the
#candidate tower in batches of 128

metrics = tfrs.metrics.FactorizedTopK(
  candidates=airdrops.batch(128).map(airdrop_model)
)

In [None]:
#define the retrieval task, which provides the loss function
#and reports the FactorizedTopK metrics defined above

task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [None]:
#combine the query tower and candidate tower to build a full model

class AirdropsModel(tfrs.Model):
  """Two-tower retrieval model pairing a user tower with an airdrop tower.

  The user tower embeds `user_id` (the query) and the airdrop tower embeds
  `task_title` (the candidate); the retrieval task turns a batch of both
  embeddings into the training loss.
  """

  def __init__(self, user_model, airdrop_model, retrieval_task=None):
    """Store the two towers and the retrieval task.

    Args:
      user_model: model that maps user id strings to embeddings.
      airdrop_model: model that maps airdrop title strings to embeddings.
      retrieval_task: optional task layer used to compute the loss. When it
        is omitted the notebook-level `task` object is used, which keeps
        existing two-argument calls working.
    """
    super().__init__()
    self.airdrop_model: tf.keras.Model = airdrop_model
    self.user_model: tf.keras.Model = user_model
    # An explicit task makes the class usable without relying on a global
    # that only exists once the task cell has been run.
    self.task: tf.keras.layers.Layer = task if retrieval_task is None else retrieval_task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the retrieval loss for one batch of enrollments.

    Args:
      features: batch dictionary holding the "user_id" and "task_title"
        tensors.
      training: accepted for the Keras training loop; not used here.

    Returns:
      The value returned by the task for the two batches of embeddings.
    """
    user_embeddings = self.user_model(features["user_id"])
    positive_airdrop_embeddings = self.airdrop_model(features["task_title"])

    return self.task(user_embeddings, positive_airdrop_embeddings)

# Train the Model

In [None]:
#instantiate the combined model from the two towers
#and compile it with the Adagrad optimizer (learning rate 0.1)

model = AirdropsModel(user_model, airdrop_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [None]:
#cache both train and test dataset
#train is shuffled and split into batches of 128, test into batches of 64
#NOTE(review): cache() comes after shuffle(), so the batches produced by the first
#pass are presumably replayed in the same order on later epochs - confirm this is intended

cached_train = train.shuffle(621).batch(128).cache()
cached_test = test.batch(64).cache()

In [None]:
#train the model on the cached training batches for 100 epochs

model.fit(cached_train, epochs=100)

In [None]:
#evaluate the model on the test dataset
#return_dict=True returns the results as a dictionary keyed by metric name

model.evaluate(cached_test, return_dict=True)

# Get Recommendation Predictions

In [None]:
# Create a model that takes in user id and recommends airdrop jobs out of the entire airdrop jobs.
# The index wraps the trained user tower as its query model and is filled with
# (title, embedding) pairs: the batched airdrop titles zipped with their
# embeddings from the trained airdrop tower.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(
  tf.data.Dataset.zip((airdrops.batch(100), airdrops.batch(100).map(model.airdrop_model)))
)

In [None]:
# Get recommendations.
# The index is queried with the user id as a string. It returns a pair: the
# first element is discarded (presumably the scores - confirm against the
# TFRS docs) and the second element is kept as the recommended titles.
userid = 42
_, titles = index(tf.constant([str(userid)]))
# Starts out empty; nothing in this cell fills it.
recommended_titles = []

# Print each row of the result.
for airdrop in titles:
  print(airdrop)

In [None]:
#show the type of the object returned by the index
print(type(titles))

In [None]:
#convert the result to a numpy array and then to a plain (nested) python list
#NOTE: this rebinds titles, so cells run after this one see a list
titles = titles.numpy().tolist()
print(type(titles))

In [None]:
#show the recommended titles
print(titles)

In [None]:
#select the enrollment rows of the target user (user ids are stored as strings)
df_target_user = df_enrollments[df_enrollments['user_id']==str(userid)]
#last expression of the cell, so the notebook displays the dataframe
df_target_user

In [None]:
#turn the first recommended title from bytes into a plain string
#the bytes are decoded rather than editing their repr ("b'...'"), because editing
#the repr also removes apostrophes that belong to the title and leaves escape
#sequences in place of non-ASCII characters
print(titles[0][0])
first_title = titles[0][0]
new0 = first_title.decode('utf-8') if isinstance(first_title, bytes) else str(first_title)
print(new0)


In [None]:
#define a function to filter out recommended airdrop jobs that the user has already enrolled

def delete_enrolled_tasks(userid, titles, enrollments=None):
  """Return the recommended titles the user has not enrolled in yet.

  Each kept title is also printed, in ranking order.

  Args:
    userid: id of the target user; compared as a string with the `user_id`
      column.
    titles: recommendations with one row per query, of which only the first
      row is used. May be a tensor (anything with `.numpy()`), a numpy array
      or a nested list; the entries may be bytes or str.
    enrollments: dataframe with `user_id` and `task_title` columns. Defaults
      to the notebook-level `df_enrollments`.

  Returns:
    A list of title strings in their original order, without the titles the
    user is already enrolled in.
  """
  if enrollments is None:
    enrollments = df_enrollments

  # Normalise every supported container to a nested python list.
  if hasattr(titles, 'numpy'):
    titles = titles.numpy()
  if hasattr(titles, 'tolist'):
    titles = titles.tolist()
  if not titles:
    return []

  # Build the user's enrolled titles once so each check is a set lookup.
  user_rows = enrollments[enrollments['user_id'] == str(userid)]
  enrolled_titles = set(user_rows['task_title'])

  recommended_titles = []
  for raw_title in titles[0]:
    # Decode bytes properly; editing the "b'...'" repr would mangle titles
    # that contain apostrophes or non-ASCII characters.
    if isinstance(raw_title, bytes):
      cleaned_title = raw_title.decode('utf-8')
    else:
      cleaned_title = str(raw_title)
    if cleaned_title not in enrolled_titles:
      recommended_titles.append(cleaned_title)
      print(cleaned_title)
  return recommended_titles
        

#for task_finished in df_enrollments[df_enrollments['user_id']==str(userid)].task_id:
 # print(task_finished)

In [None]:
#number of titles recommended for the single queried user
print(len(titles[0]))

In [None]:
#check the title cleaning on the first recommended title
#the bytes are decoded rather than editing their repr ("b'...'"), because editing
#the repr also removes apostrophes that belong to the title and leaves escape
#sequences in place of non-ASCII characters
print(titles[0][0])
raw_title = titles[0][0]
cleaned_title = raw_title.decode('utf-8') if isinstance(raw_title, bytes) else str(raw_title)
print(cleaned_title)

In [None]:
#filter the recommendations of the target user and keep the returned list
#(the result used to be discarded, which left recommended_titles empty)
#titles is a plain list at this point; it is wrapped in an object array so the
#.tolist() call made on the argument works and the bytes entries stay untouched
recommended_titles = delete_enrolled_tasks(userid, np.asarray(titles, dtype=object))

In [None]:
#show the current contents of recommended_titles
print(recommended_titles)

In [None]:
#show the first three recommendations; the user id comes from the userid
#variable so the message always matches the user that was queried
print(f"Recommendations for user {userid}: {recommended_titles[:3]}")

# Export the Model with a SavedModel format

In [None]:
# Export the model.
# The brute-force index (which wraps the trained user tower and holds the
# airdrop candidates) is saved to content/test_model, then that directory is
# packed into content/modelsavedwithkerasmeta.zip.
# NOTE(review): these paths are relative ('content/...') while the csv files are
# read from the absolute '/content/...' - confirm the output location is intended.

tf.keras.models.save_model(index, 'content/test_model')
shutil.make_archive('content/modelsavedwithkerasmeta', 'zip', 'content/test_model')