In [None]:
# Data processing
import pandas as pd
import numpy as np
# import scipy.stats

# Visualization
import seaborn as sb

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

# Other
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [None]:
# Read the user/anime rating table and show it as the cell output.
ratings_path = "../archive/rating_complete.csv"
ratings = pd.read_csv(ratings_path)
ratings

In [None]:
# Read the anime metadata, copy MAL_ID into the join key `anime_id`,
# then drop MAL_ID together with the other listed columns.
anime_df = (
    pd.read_csv("../archive/anime.csv")
    .assign(anime_id=lambda frame: frame["MAL_ID"])
    .drop(columns=[
        "MAL_ID", "English name", "Japanese name", "Aired", "Premiered",
        "Producers", "Licensors", "Studios", "Source", "Rating", "Ranked",
    ])
)
anime_df

In [None]:
# Inner-join every rating with the metadata of the anime it refers to.
df_merged = ratings.merge(anime_df, on="anime_id", how="inner")
# `df` is the working alias that the following cells narrow down.
df = df_merged

In [None]:
# Work on a 0.1% random sample of the merged frame.
# Sampling from `df_merged` (not from `df` itself) keeps this cell idempotent:
# re-running it no longer shrinks the data again. The fixed `random_state`
# makes the sample, and everything trained on it, reproducible.
df = df_merged.sample(frac=0.001, random_state=42)
df

In [None]:
# Import required libraries

import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow_recommenders as tfrs


In [None]:
# Remove the columns listed below from the working frame.
text_columns = ["Name", "Genres", "Duration", "Type"]
df = df.drop(columns=text_columns)
# df_one_hot = pd.get_dummies(df)
# df_one_hot

In [None]:
df.dtypes
# df["Score"]
# Drop the aggregate score, the episode count and the per-score columns
# "Score-10" ... "Score-1".
per_score_columns = [f"Score-{value}" for value in range(10, 0, -1)]
df = df.drop(columns=["Score", "Episodes", *per_score_columns])
df

### Visualization

In [None]:
import seaborn as sns

# Annotated heatmap of the pairwise correlations between the columns of `df`.
heatmap_palette = ['#004346', '#036666', '#06837f', '#02cecb', '#b4ffff', '#f8e16c', '#fed811', '#fdc100']
fig, ax = plt.subplots(figsize=(12,10))
ax.set_title('Correlation of Movie Features\n', fontsize=18, weight=600, color='#333d29')
sns.heatmap(df.corr(), annot=True, cmap=heatmap_palette, ax=ax)

In [None]:
df['user_id'].unique().astype(str)
# map
# tf.convert_to_tensor(df['user_id'].unique())

# Frame used by the neural models: `df` without the columns listed below.
aggregate_columns = [
    "Popularity", "Members", "Favorites", "Watching",
    "Completed", "On-Hold", "Dropped", "Plan to Watch",
]
df_NN = df.drop(columns=aggregate_columns)

In [None]:
# Show the reduced frame as the cell output.
df_NN

### Predictions

In [None]:
# Target: the `rating` column. Features: every other column of `df_NN`.
target_column = "rating"
X = df_NN.drop(columns=[target_column])
y = df_NN[target_column]
# X

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed; the cell output is the three row counts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

tuple(len(part) for part in (X, X_train, X_test))

In [None]:
# `info` is a method: without the call parentheses the cell only displayed
# the bound-method repr instead of the column/dtype summary.
X_train.info()

In [None]:
# Interactions as a tf.data pipeline of {"user_id", "anime_id", "rating"}
# dictionaries, all cast to int32.
interactions = df_NN[['user_id', 'anime_id', 'rating']].values.astype('int32')
dataset = tf.data.Dataset.from_tensor_slices(interactions)
dataset = dataset.map(lambda x: {"user_id": x[0],"anime_id": x[1],"rating": x[2]})

# Sorted distinct ids, computed directly in NumPy (no round trip through
# tf.data batches is needed for this).
unique_anime_ids = np.unique(df_NN['anime_id'].values.astype('int32'))
unique_user_ids = np.unique(df_NN['user_id'].values.astype('int32'))

# Candidate set for retrieval: each anime id exactly once. Building it from
# the raw interaction column would index the same anime many times, so the
# top-K candidate lists used by the retrieval metrics would contain duplicates.
animes_ds = tf.data.Dataset.from_tensor_slices(unique_anime_ids)

# Sanity check: print the anime id of the first interaction.
for row in dataset.take(1):
    print(row["anime_id"])

In [None]:
# Model with embeddings
# Each tower maps a string id to an index (StringLookup) and then to a
# 32-dimensional embedding sized from the lookup's vocabulary.

user_vocab = tf.convert_to_tensor(df['user_id'].unique().astype(str))
user_lookup = tf.keras.layers.StringLookup(vocabulary=user_vocab, mask_token=None)
user_embedding = tf.keras.layers.Embedding(
    input_dim=user_lookup.vocabulary_size(),
    output_dim=32,
)
user_model = tf.keras.Sequential([user_lookup, user_embedding])

movie_vocab = tf.convert_to_tensor(df['anime_id'].unique().astype(str))
movie_lookup = tf.keras.layers.StringLookup(vocabulary=movie_vocab, mask_token=None)
movie_embedding = tf.keras.layers.Embedding(
    input_dim=movie_lookup.vocabulary_size(),
    output_dim=32,
)
movie_model = tf.keras.Sequential([movie_lookup, movie_embedding])


In [None]:
class UserModel(tf.keras.Model):
  """Query tower: embeds the integer ``user_id`` feature into 32 dimensions.

  Args:
    use_timestamps: stored on the instance; when truthy, ``call`` routes the
      user embedding through a single-element ``tf.concat`` (no timestamp
      feature is embedded here).
  """

  def __init__(self, use_timestamps):
    super().__init__()

    self._use_timestamps = use_timestamps

    # Vocabulary comes from the module-level `df`; the embedding gets one
    # extra row on top of the number of distinct ids.
    known_user_ids = df['user_id'].unique()
    id_lookup = tf.keras.layers.IntegerLookup(vocabulary=known_user_ids.astype('int32'))
    id_embedding = tf.keras.layers.Embedding(len(known_user_ids) + 1, 32)
    self.user_embedding = tf.keras.Sequential([id_lookup, id_embedding])

  def call(self, inputs):
    """Return the embedding of ``inputs["user_id"]``."""
    embedded_user = self.user_embedding(inputs["user_id"])
    if self._use_timestamps:
      return tf.concat([embedded_user], axis=1)
    return embedded_user

In [None]:
class MovieModel(tf.keras.Model):
  """Candidate tower: embeds integer anime ids into 32 dimensions."""

  def __init__(self):
    super().__init__()

    max_tokens = 10_000

    # Id tower: lookup over the distinct anime ids of the module-level `df`,
    # followed by an embedding with one extra row.
    known_anime_ids = df['anime_id'].unique()
    self.title_embedding = tf.keras.Sequential([
      tf.keras.layers.IntegerLookup(vocabulary=known_anime_ids.astype('int32')),
      tf.keras.layers.Embedding(len(known_anime_ids) + 1, 32),
    ])

    # Text tower: constructed here but not used by `call`.
    self.title_text_embedding = tf.keras.Sequential([
      tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
      tf.keras.layers.GlobalAveragePooling1D(),
    ])

  def call(self, titles):
    """Return the id embedding of ``titles`` (anime ids)."""
    id_embedding = self.title_embedding(titles)
    return tf.concat([id_embedding], axis=1)

In [None]:
import tensorflow_datasets as tfds

class MovielensModel(tfrs.models.Model):
  """Two-tower retrieval model over (user_id, anime_id) interactions.

  Args:
    use_timestamps: forwarded to ``UserModel``. It was previously ignored
      (the query tower was always built with ``False``).
  """

  def __init__(self, use_timestamps):
    super().__init__()
    # Query tower: user embedding projected to 64 dimensions.
    self.query_model = tf.keras.Sequential([
      UserModel(use_timestamps),
      tf.keras.layers.Dense(64)
    ])
    # Candidate tower: anime embedding projected to the same 64 dimensions.
    self.candidate_model = tf.keras.Sequential([
      MovieModel(),
      tf.keras.layers.Dense(64)
    ])
    # Retrieval task; its top-K metrics rank against every entry of the
    # module-level `animes_ds` embedded by the candidate tower.
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=animes_ds.batch(128).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    """Return the retrieval loss for one batch.

    Args:
      features: mapping that provides at least ``"user_id"`` and ``"anime_id"``.
      training: accepted for the Keras/TFRS interface; not used here.
    """
    # Only the user id is passed into the query model, so the training inputs
    # have the same keys as the inputs used when querying the model later.
    query_embeddings = self.query_model({
        "user_id": features["user_id"],
        # "timestamp": features["timestamp"],
    })
    movie_embeddings = self.candidate_model(features["anime_id"])

    return self.task(query_embeddings, movie_embeddings)

In [None]:
tf.random.set_seed(42)

# Derive the split sizes from the data actually present. The previous
# hard-coded 80_000 / 20_000 only fit a 100k-row dataset; with fewer rows,
# `take(80_000)` consumes everything and the test split ends up empty.
n_examples = len(df_NN)
n_train = int(0.8 * n_examples)
shuffle_buffer = max(n_examples, 1)  # buffer >= dataset size gives a full shuffle

shuffled = dataset.shuffle(shuffle_buffer, seed=42, reshuffle_each_iteration=False)
train = shuffled.take(n_train)
test = shuffled.skip(n_train)

cached_train = train.shuffle(shuffle_buffer).batch(2048)
cached_test = test.batch(4096).cache()

In [None]:
# UserModel.user_embedding("29806")

In [None]:
import logging
import os

# Silence TensorFlow logging.
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is normally read when TensorFlow is first
# imported; set here (after the import) it may have no effect. Confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # FATAL
logging.getLogger('tensorflow').setLevel(logging.FATAL)

model = MovielensModel(use_timestamps=False)

# model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
# Adam performed worse than Adagrad(0.1) here, possibly because Adam's
# default learning rate is 0.001.
model.compile(optimizer=tf.keras.optimizers.Adam())

# Keep the History object: the loss-curve cell further down reads
# `history.history`.
history = model.fit(cached_train, epochs=3)

top_100_key = "factorized_top_k/top_100_categorical_accuracy"
train_accuracy = model.evaluate(cached_train, return_dict=True)[top_100_key]
test_accuracy = model.evaluate(cached_test, return_dict=True)[top_100_key]

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

In [None]:
# Model

class AnimeModel(tfrs.models.Model):
    """Multi-task model: rating regression plus user -> anime retrieval.

    Both towers look ids up as strings. Non-string ids (for example the int32
    ids produced by the notebook's ``dataset``) are converted to strings in
    ``call`` so they match the string vocabularies.
    """

    def __init__(self) -> None:
        super().__init__()

        embedding_dimension = 64

        # String vocabularies taken from the module-level `df`.
        anime_vocab = df['anime_id'].unique().astype(str)
        user_vocab = df['user_id'].unique().astype(str)

        self.anime_layers: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=tf.convert_to_tensor(anime_vocab), mask_token=None),
            tf.keras.layers.Embedding(len(anime_vocab) + 1, embedding_dimension)
        ])

        self.user_layers: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=tf.convert_to_tensor(user_vocab), mask_token=None),
            tf.keras.layers.Embedding(len(user_vocab) + 1, embedding_dimension)
        ])

        # Maps the concatenated (user, anime) embeddings to one rating value.
        self.rating_layer = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(1)
        ])

        # Tasks

        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss = tf.keras.losses.MeanSquaredError(),
            metrics = [tf.keras.metrics.RootMeanSquaredError()],
        )

        # Retrieval candidates are the anime, embedded by the anime tower.
        # The previous code used the *user* ids here and called `.batch` on a
        # NumPy array, which has no such method; the ids must be wrapped in a
        # tf.data.Dataset first.
        anime_candidates = (
            tf.data.Dataset.from_tensor_slices(anime_vocab)
            .batch(128)
            .map(self.anime_layers)
        )
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics = tfrs.metrics.FactorizedTopK(candidates=anime_candidates)
        )

        # Loss weights
        self.rating_weight = 1
        self.retrieval_weight = 1

    @staticmethod
    def _as_string_ids(ids) -> tf.Tensor:
        """Return ``ids`` as a string tensor so it matches the StringLookup vocabularies."""
        ids = tf.convert_to_tensor(ids)
        if ids.dtype == tf.string:
            return ids
        return tf.strings.as_string(ids)

    def call(self, features) -> tuple:
        """Return ``(user_embeddings, anime_embeddings, rating_predictions)``.

        Args:
            features: mapping with ``"anime_id"`` and the user id under
                ``"user_id"`` (the key used by the notebook's dataset) or the
                legacy ``"userId"``.
        """
        user_key = "user_id" if "user_id" in features else "userId"

        user_embeddings = self.user_layers(self._as_string_ids(features[user_key]))

        anime_embeddings = self.anime_layers(self._as_string_ids(features["anime_id"]))

        return (
            user_embeddings,
            anime_embeddings,
            self.rating_layer(tf.concat([user_embeddings, anime_embeddings], axis=1)),
        )


    def compute_loss(self, features, training=False) -> tf.Tensor:
        """Return the weighted sum of the rating loss and the retrieval loss.

        Args:
            features: mapping with the model inputs plus the ``"rating"`` label.
            training: accepted for the Keras/TFRS interface; not used here.
        """
        # Split label and inputs without mutating the caller's dictionary.
        ratings = features["rating"]
        inputs = {key: value for key, value in features.items() if key != "rating"}

        user_embeddings, anime_embeddings, rating_predictions = self(inputs)

        # We compute the loss for each task.
        rating_loss = self.rating_task(
            labels=ratings,
            predictions=rating_predictions,
        )
        retrieval_loss = self.retrieval_task(user_embeddings, anime_embeddings)

        # And combine them using the loss weights.
        return (self.rating_weight * rating_loss
                + self.retrieval_weight * retrieval_loss)

In [None]:
# model = AnimeModel()
# model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# cached_train = X_train.shuffle(100_000).batch(1_000).cache()
# cached_test = X_test.batch(1_000).cache()

# model.fit(cached_train, epochs=3)

In [None]:
def predict_movie(user, top_n=3):
    """Print the ``top_n`` anime ids retrieved for ``user``.

    Uses the module-level ``model``, which must expose ``query_model`` and
    ``candidate_model`` (as ``MovielensModel`` does). The previous version
    referenced ``user_model`` / ``movie_model`` attributes that no model in
    this notebook defines, called ``.batch`` on NumPy arrays and tried to
    ``.decode`` the returned ids.

    Args:
        user: user id; converted with ``int()`` to match the integer lookup.
        top_n: number of recommendations to print.
    """
    # Brute-force index that embeds raw query features with the query tower.
    index = tfrs.layers.factorized_top_k.BruteForce(model.query_model, k=top_n)

    # Index every distinct anime id next to its candidate-tower embedding.
    anime_ids = tf.data.Dataset.from_tensor_slices(
        df['anime_id'].unique().astype('int32')
    ).batch(100)
    index.index_from_dataset(
      tf.data.Dataset.zip((anime_ids, anime_ids.map(model.candidate_model)))
    )

    # Get recommendations; the query tower expects a {"user_id": ...} mapping.
    _, titles = index({"user_id": tf.constant([int(user)], dtype=tf.int32)})

    print('Top {} recommendations for user {}:\n'.format(top_n, user))
    for i, title in enumerate(titles[0, :top_n].numpy()):
        print('{}. {}'.format(i+1, title))

def predict_rating(user, movie):
    """Print the rating predicted for ``user`` on anime ``movie``.

    Requires the module-level ``model`` to be callable like ``AnimeModel``:
    it takes a feature mapping and returns
    ``(user_embeddings, anime_embeddings, rating_predictions)``.

    Args:
        user: user id; passed to the model as a string.
        movie: anime id; passed to the model as a string.
    """
    # The model reads the item under "anime_id" (the old "original_title" key
    # was never read). Only the rating prediction is needed here.
    _, _, predicted_rating = model({
          "userId": np.array([str(user)]),
          "anime_id": np.array([str(movie)])
      })
    print("Predicted rating for {}: {}".format(movie, predicted_rating.numpy()[0][0]))

In [None]:
# y_pred = model_4.predict(X_test)
# print(history)
# model_4.evaluate(X_test,y_test)

In [None]:
# print(y_train.median(), y_train.mean())
# print(y_test.median(), y_test.mean())
# print(y_pred.median(), y_pred.mean())

In [None]:
# Plot history (also known as a loss curve or a training curve)
history_frame = pd.DataFrame(history.history)
history_axes = history_frame.plot()
history_axes.set_ylabel("loss")
history_axes.set_xlabel("epochs")

In [None]:
# Scatter of the "Completed" count against the rating for both splits.
# "Completed" was dropped when `df_NN` (and therefore X_train / X_test) was
# built, so indexing X_train["Completed"] raised KeyError. The values are
# read from `df`, aligned on the split indices.
completed_train = df.loc[X_train.index, "Completed"]
completed_test = df.loc[X_test.index, "Completed"]

plt.figure(figsize=(10,7))
plt.scatter(completed_train, y_train, c="b", label="train data")
plt.scatter(completed_test, y_test, c='g', label="test data")
# `y_pred` is only produced by a cell that is currently commented out; plot
# the predictions when they exist instead of failing with NameError.
if "y_pred" in globals():
    plt.scatter(completed_test, tf.squeeze(y_pred), c="r", label="predictions")
plt.ylim(-1, 15)

plt.legend()