# ANIME DATA ANALYSIS

In [None]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import defaultdict

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Activation, BatchNormalization, Input, Embedding, Dot, Dense, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping

warnings.filterwarnings("ignore")
%matplotlib inline

# Working on animelist.csv

In [None]:
# Load only the user_id, anime_id and rating columns of the raw interaction file.
animelist = pd.read_csv('../artifacts/raw/animelist.csv', low_memory=True, usecols = ["user_id","anime_id", "rating"])

In [None]:
# Column dtypes, non-null counts and memory usage of the loaded frame.
animelist.info()

In [None]:
# Preview the first rows.
animelist.head()

In [None]:
# Total number of rows loaded.
len(animelist)

# Data Processing

Keeping only the experienced users, i.e. those who have rated at least 50 animes

In [None]:
# Number of rows per user_id (value_counts orders users from most to fewest rows).
n_ratings = animelist["user_id"].value_counts()
n_ratings

In [None]:

# Keep only the rows of users that have at least MIN_RATINGS_PER_USER rows.
MIN_RATINGS_PER_USER = 50
active_users = n_ratings[n_ratings >= MIN_RATINGS_PER_USER].index
# .copy() makes ratings_df an independent frame: later cells add and overwrite
# columns on it, which on a filtered selection of `animelist` triggers
# SettingWithCopyWarning (hidden here by warnings.filterwarnings("ignore")).
ratings_df = animelist[animelist["user_id"].isin(active_users)].copy()
print(len(ratings_df))

In [None]:
# Series.min() is vectorised; the built-in min() walks the column element by
# element in Python, which is slow on a multi-million-row frame.
min_rating = ratings_df["rating"].min()
min_rating

In [None]:
# Series.max() is vectorised; the built-in max() walks the column element by
# element in Python, which is slow on a multi-million-row frame.
max_rating = ratings_df["rating"].max()
max_rating

In [None]:
# Mean of the (still unscaled) rating column, for reference only.
avg_rating = np.mean(ratings_df["rating"])
avg_rating

Min-Max Scaling the "rating" variable

In [None]:
# Min-max scale the ratings with vectorised column arithmetic instead of a
# per-element Python lambda, and build a new frame with assign() rather than
# assigning into a filtered selection of `animelist`.
# NOTE: run once per kernel - min_rating / max_rating were computed on the
# unscaled column, so re-running this cell alone would rescale scaled values.
ratings_df = ratings_df.assign(
    rating=((ratings_df["rating"] - min_rating) / (max_rating - min_rating)).astype(np.float64)
)


In [None]:
# Count fully duplicated rows.
ratings_df.duplicated().sum()

In [None]:
# Count missing values per column.
ratings_df.isnull().sum()

In [None]:
# Distinct user ids in order of first appearance, as a plain Python list.
user_ids = ratings_df["user_id"].unique().tolist()
len(user_ids)

Creating encoders and Decoders for Users and Animes

In [None]:
# raw user_id -> position in user_ids (0..N-1), and the inverse mapping.
user2user_encoded = dict(zip(user_ids, range(len(user_ids))))
user2user_decoded = dict(enumerate(user_ids))

In [None]:
# Sanity check: both mappings have the same size; show the first five entries of each.
print(len(user2user_encoded), len(user2user_decoded))
print(list(user2user_encoded.items())[:5])
print(list(user2user_decoded.items())[:5])

In [None]:
# Add the encoded user index as a new "user" column.
ratings_df["user"] = ratings_df["user_id"].map(user2user_encoded)
ratings_df.head()

In [None]:
# Distinct anime ids in order of first appearance.
anime_ids = ratings_df["anime_id"].unique().tolist()
# raw anime_id -> position in anime_ids (0..M-1), and the inverse mapping.
anime2anime_encoded = dict(zip(anime_ids, range(len(anime_ids))))
anime2anime_decoded = dict(enumerate(anime_ids))
# Add the encoded anime index as a new "anime" column.
ratings_df["anime"] = ratings_df["anime_id"].map(anime2anime_encoded)
ratings_df.head()

In [None]:
# Sanity check: both mappings have the same size; show the first five entries of each.
print(len(anime2anime_encoded), len(anime2anime_decoded))
print(list(anime2anime_encoded.items())[:5])
print(list(anime2anime_decoded.items())[:5])

Shuffling the data

In [None]:
# Shuffle all rows with a fixed seed (reproducible order) and renumber the index from 0.
ratings_df = ratings_df.sample(frac = 1, random_state=43).reset_index(drop=True)
ratings_df.head()

In [None]:
# X: NumPy array with two columns (encoded user, encoded anime); y: the rating column as a pandas Series.
X = ratings_df[["user", "anime"]].values
y = ratings_df["rating"]

In [None]:
# Number of rows held out for testing.
test_size = 1000
# Despite the name, this is a single row position: the index where the test split starts.
train_indices = ratings_df.shape[0] - test_size


We hold out only the last 1000 rows of the shuffled data as the test set; all remaining rows are used for training

In [None]:
# Positional split: everything before train_indices is training data,
# the last test_size rows are the test set.
X_train, X_test, y_train, y_test = (
    X[:train_indices],
    X[train_indices:],
    y[:train_indices],
    y[train_indices:]
)

In [None]:
# Report the shapes of the four split pieces.
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Container types of the split pieces (X_* come from .values, y_* from a DataFrame column).
type(X_train), type(X_test), type(y_train), type(y_test)

We will separate the Users and Animes as two separate arrays within a list

In [None]:
# One array per model input: column 0 is the encoded user, column 1 the encoded anime.
X_train_array = [ X_train[:,0], X_train[:,1]]
X_test_array = [ X_test[:,0], X_test[:,1]]

# Model Architecture

In [None]:
def RecommenderNet():
    """Build and compile the embedding-based rating model.

    The user index and the anime index are each looked up in their own
    128-dimensional embedding table (sized from the module-level
    ``user2user_encoded`` / ``anime2anime_encoded`` mappings). The normalised
    dot product of the two embeddings is flattened and passed through a
    single-unit dense layer, batch normalisation and a sigmoid.

    Returns:
        A Keras ``Model`` with inputs ``[user, anime]``, compiled with
        binary cross-entropy loss, the Adam optimizer and MAE / MSE metrics.
    """
    latent_dim = 128
    n_users = len(user2user_encoded)
    n_animes = len(anime2anime_encoded)

    user = Input(name="user", shape=[1])
    user_embedding = Embedding(name="user_embedding", input_dim=n_users, output_dim=latent_dim)(user)

    anime = Input(name="anime", shape=[1])
    anime_embedding = Embedding(name="anime_embedding", input_dim=n_animes, output_dim=latent_dim)(anime)

    # normalize=True L2-normalises both embeddings before the dot product.
    similarity = Dot(name="dot_product", normalize=True, axes=2)([user_embedding, anime_embedding])
    flat = Flatten()(similarity)
    scored = Dense(1, kernel_initializer="he_normal")(flat)
    normed = BatchNormalization()(scored)
    prediction = Activation("sigmoid")(normed)

    net = Model(inputs=[user, anime], outputs=prediction)
    net.compile(loss="binary_crossentropy", optimizer='Adam', metrics=["mae","mse"])
    return net

In [None]:
# Instantiate the compiled model and print its layer summary.
model = RecommenderNet()
model.summary()

In [None]:
# Learning-rate schedule: linear warm-up from start_lr to max_lr, an optional
# plateau at max_lr, then exponential decay from max_lr towards min_lr.
start_lr = 0.00001
# min_lr is the floor the decay converges to, so it must not exceed max_lr.
# It was 0.0001 (above max_lr), which made the "decay" phase *raise* the
# learning rate after warm-up instead of lowering it.
min_lr = 0.00001
max_lr = 0.00005
batch_size = 10000

ramup_epochs = 5      # warm-up length, in epochs
sustain_epochs = 0    # epochs held at max_lr after warm-up
exp_decay = 0.8       # per-epoch multiplicative decay factor

def lrfn(epoch):
    """Return the learning rate for a zero-based ``epoch``.

    Args:
        epoch: zero-based epoch index.

    Returns:
        A float: linearly interpolated between ``start_lr`` and ``max_lr``
        while ``epoch < ramup_epochs``; ``max_lr`` during the sustain phase;
        afterwards ``max_lr`` decayed exponentially towards ``min_lr``.
    """
    if epoch < ramup_epochs:
        return (max_lr-start_lr)/ramup_epochs*epoch + start_lr
    if epoch < ramup_epochs+sustain_epochs:
        return max_lr
    decay_steps = epoch-ramup_epochs-sustain_epochs
    return (max_lr-min_lr)*exp_decay**decay_steps + min_lr

In [None]:
# Per-epoch learning rate comes from lrfn.
lr_callback = LearningRateScheduler(lrfn, verbose=0)

# Weights file written by the checkpoint callback.
checkpoint_filepath = './weights.weights.h5'

# Save weights only, and only when val_loss improves.
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)
# Stop after 3 epochs without val_loss improvement and roll back to the best weights.
early_stopping = EarlyStopping(
    patience=3,
    monitor="val_loss",
    mode="min",
    restore_best_weights=True,
)

In [None]:
# Callbacks passed to model.fit: checkpointing, LR schedule, early stopping.
my_callbacks = [model_checkpoint, lr_callback, early_stopping]

In [None]:
# Train for up to 20 epochs. The 1000-row test split doubles as the validation
# set, so val_loss (used by the checkpoint and early-stopping callbacks) is
# measured on the same rows that are called "test" above.
history = model.fit(
    x = X_train_array,
    y = y_train,
    batch_size = batch_size,
    epochs = 20,
    verbose = 1,
    validation_data = (X_test_array, y_test),
    callbacks = my_callbacks
)

In [None]:
metrics = ["loss", "mae", "mse"]

def plot_history(history, metrics):
    """Plot training and validation curves, one panel per metric.

    Args:
        history: object whose ``history`` attribute maps metric names (and
            their ``"val_"``-prefixed counterparts) to per-epoch values.
        metrics: list of metric names to plot side by side.
    """
    plt.figure(figsize=(20, 5))
    n_panels = len(metrics)
    for position, metric in enumerate(metrics, start=1):
        val_key = "val_" + metric
        plt.subplot(1, n_panels, position)
        plt.plot(history.history[metric], label=metric)
        plt.plot(history.history[val_key], label=val_key)
        plt.title(metric)
        plt.xlabel("Epoch")
        plt.ylabel(metric)
        plt.legend()
    plt.show()

plot_history(history, metrics)

In [None]:
def extract_weights(name, model):
    """Return the first weight array of layer ``name`` with unit-length rows.

    Args:
        name: name of the layer to read, as passed to ``model.get_layer``.
        model: object exposing ``get_layer(name)``; the returned layer must
            expose ``get_weights()``.

    Returns:
        The first array of ``get_weights()``, each row divided by its L2 norm.
    """
    matrix = model.get_layer(name).get_weights()[0]
    # Column vector of per-row norms so the division broadcasts across each row.
    row_norms = np.linalg.norm(matrix, axis=1).reshape((-1, 1))
    return matrix / row_norms

In [None]:
# Row-normalised embedding matrices of the trained model, one row per encoded anime / user.
anime_weights = extract_weights("anime_embedding", model)
user_weights = extract_weights("user_embedding", model)

# Working on anime.csv

In [None]:
# Load the anime metadata table and preview two rows.
anime = pd.read_csv("../artifacts/raw/anime.csv")
anime.head(2)

In [None]:
# Treat the literal string "Unknown" as a missing value everywhere in the table.
anime = anime.replace("Unknown", np.nan)

In [None]:
def get_anime_name(df, anime_id):
    """Return the display name of an anime, preferring its English title.

    Args:
        df: DataFrame with ``anime_id``, ``eng_version`` and ``Name`` columns.
        anime_id: identifier compared against ``df.anime_id``.

    Returns:
        ``eng_version`` of the first matching row, or that row's ``Name`` when
        ``eng_version`` is missing. ``None`` (after printing a message) when
        no row matches ``anime_id``.
    """
    # Filter once and reuse the result for both column lookups.
    matches = df[df.anime_id == anime_id]
    if matches.empty:
        print(f"Anime id {anime_id} not found")
        return None

    name = matches.eng_version.values[0]
    # pd.isna recognises every missing-value marker; an identity check against
    # np.nan misses NaN objects that are not that exact singleton.
    if pd.isna(name):
        name = matches.Name.values[0]
    return name

In [None]:
anime["anime_id"] = anime["MAL_ID"]
# English title where available, otherwise the original "Name".
# fillna does this in one vectorised pass; calling get_anime_name for every row
# re-filtered the whole frame each time (quadratic in the number of animes).
# NOTE(review): equivalent to the per-id lookup as long as MAL_ID is unique per row - confirm.
anime["eng_version"] = anime["English name"].fillna(anime["Name"])
anime.head(2)

In [None]:
# Spot-check the name lookup for anime_id 6702.
anime67 = get_anime_name(anime, 6702)
anime67

In [None]:
# Sort the table in place by Score, highest first, with missing scores at the end.
anime.sort_values(by=["Score"], inplace=True, ascending=False, kind="quicksort", na_position="last")
anime.head()

In [None]:
# List the available columns before selecting a subset.
anime.columns

In [None]:
# Keep only the columns used by the cells below.
anime = anime[["anime_id", "eng_version", "Score", "Genres", "Episodes", "Type", "Premiered", "Members"]]

In [None]:
# Preview the trimmed table.
anime.head(2)

In [None]:
def getAnimeFrame(user_input, df):
    """Return the rows of ``df`` that match an anime id or an anime title.

    Args:
        user_input: an integer (Python ``int`` or NumPy integer) matched
            against ``df["anime_id"]``, or a ``str`` matched against
            ``df["eng_version"]``.
        df: DataFrame with ``anime_id`` and ``eng_version`` columns.

    Returns:
        The matching rows as a DataFrame (possibly empty), or ``None`` when
        ``user_input`` is neither an integer nor a string.
    """
    # Ids read back from arrays / DataFrames are NumPy integers, which are not
    # instances of the built-in int and would otherwise fall through to None.
    if isinstance(user_input, (int, np.integer)):
        return df[df["anime_id"] == user_input]
    if isinstance(user_input, str):
        return df[df["eng_version"] == user_input]
    return None

In [None]:
# Lookup by integer anime_id.
getAnimeFrame(9253, anime)

In [None]:
# Lookup by title (matched against the eng_version column).
getAnimeFrame("Steins;Gate", anime)

# Working on anime_with_synopsis.csv

In [None]:
# Columns to load. "sypnopsis" is spelled this way on purpose here: usecols must
# match the CSV header exactly (presumably the file itself carries this spelling - verify).
cols = ["MAL_ID","Name","Genres","sypnopsis"]
synopsis_df = pd.read_csv("../artifacts/raw/anime_with_synopsis.csv", usecols=cols)
synopsis_df.head()

In [None]:
# Spot-check: the row whose Name is "Steins;Gate".
synopsis_df[synopsis_df["Name"] == "Steins;Gate"]

In [None]:
def getSynopsis(user_input, df):
    """Return the synopsis text of an anime given its id or its name.

    Args:
        user_input: an integer (Python ``int`` or NumPy integer) matched
            against ``df["MAL_ID"]``, or a ``str`` matched against
            ``df["Name"]``.
        df: DataFrame with ``MAL_ID``, ``Name`` and ``sypnopsis`` columns.

    Returns:
        The ``sypnopsis`` value of the first matching row, or ``None`` when
        ``user_input`` is neither an integer nor a string.

    Raises:
        IndexError: if no row matches ``user_input``.
    """
    # Accept NumPy integers too: ids taken from arrays / DataFrames are not
    # instances of the built-in int and would otherwise silently return None.
    if isinstance(user_input, (int, np.integer)):
        return df[df["MAL_ID"] == user_input].sypnopsis.values[0]
    if isinstance(user_input, str):
        return df[df["Name"] == user_input].sypnopsis.values[0]
    return None

In [None]:
# Spot-check the synopsis lookup by name.
getSynopsis("Steins;Gate", synopsis_df)

# Building Content Based Recommendation

In [None]:
# Do not truncate long cell contents (e.g. synopsis text) when displaying frames.
pd.set_option("max_colwidth", None)

In [None]:
def find_similar_anime(name, anime_weights, anime2anime_encoded, anime2anime_decoded, anime_df, synopsis_df, n=5, return_dist=False, neg=False):
    """Rank animes by embedding similarity to a query anime.

    Args:
        name: query anime, in any form accepted by ``getAnimeFrame``.
        anime_weights: 2-D array of anime embeddings, one row per encoded
            anime (assumed row-normalised, so dot products are cosine
            similarities - confirm against the caller).
        anime2anime_encoded: mapping raw anime_id -> row index in
            ``anime_weights``.
        anime2anime_decoded: mapping row index -> raw anime_id.
        anime_df: anime metadata frame (``anime_id``, ``eng_version``,
            ``Genres`` columns are read).
        synopsis_df: frame passed to ``getSynopsis``.
        n: number of animes requested.
        return_dist: when True, return the raw scores and selected row
            indices instead of a DataFrame.
        neg: when True, select the *least* similar animes instead.

    Returns:
        ``(dists, closest)`` when ``return_dist`` is True; otherwise a
        DataFrame with columns ``name``, ``similarity``, ``genre``,
        ``synopsis`` sorted by descending similarity, with the query anime
        removed. ``None`` (after printing a message) when the query cannot be
        resolved, nothing could be assembled, or an unexpected error occurs.
    """
    try:
        query_frame = getAnimeFrame(name, anime_df)
        if query_frame is None or query_frame.empty:
            print(f"Error: anime {name!r} not found")
            return None
        index = query_frame.anime_id.values[0]

        encoded_index = anime2anime_encoded.get(index)
        if encoded_index is None:
            # Present in the metadata table but absent from the encoder.
            print(f"Error: anime {name!r} (id {index}) has no embedding")
            return None

        weights = anime_weights
        dists = np.dot(weights, weights[encoded_index])
        sorted_dists = np.argsort(dists)

        if neg:
            # The query is the most similar item to itself, so it is not among
            # the least similar ones: take exactly n.
            closest = sorted_dists[:n]
        else:
            # One extra slot because the query itself ranks first and is
            # dropped from the result below.
            closest = sorted_dists[-(n + 1):]

        print(f"Anime Closest to {name}")

        if return_dist:
            return dists, closest

        SimilarityArray = []

        for close in closest:
            try:
                decoded_id = anime2anime_decoded.get(close)

                synopsis = getSynopsis(decoded_id, synopsis_df)

                anime_frame = getAnimeFrame(decoded_id, anime_df)
                anime_name = anime_frame["eng_version"].values[0]
                genre = anime_frame["Genres"].values[0]

                similarity = dists[close]

                SimilarityArray.append({"name": anime_name,
                                        "anime_id": decoded_id,
                                        "similarity": similarity,
                                        "genre": genre,
                                        "synopsis": synopsis
                                        })
            except Exception:
                # Best effort: skip animes whose synopsis or metadata row is missing.
                continue

        if not SimilarityArray:
            # Without this guard the sort below fails with a bare "'similarity'" KeyError.
            print(f"Error: no similar anime with complete metadata found for {name!r}")
            return None

        Frame = pd.DataFrame(SimilarityArray).sort_values(by=["similarity"], ascending=False)
        return Frame[Frame.anime_id != index].drop(["anime_id"], axis=1)

    except Exception as e:
        print(f"Error: {e}")
        return None

 

In [None]:
# Animes whose learned embeddings are closest to "Steins;Gate" (default n=5).
find_similar_anime("Steins;Gate", anime_weights, anime2anime_encoded, anime2anime_decoded, anime, synopsis_df)

# User Based Recommendation

In [None]:
def find_similar_user(input_item, user_weights, user2user_encoded, user2user_decoded, n=5, return_dist=False, neg = False):
    """Rank users by embedding similarity to a query user.

    Args:
        input_item: raw user id (any key type of ``user2user_encoded``,
            including NumPy integers).
        user_weights: 2-D array of user embeddings, one row per encoded user
            (assumed row-normalised, so dot products are cosine
            similarities - confirm against the caller).
        user2user_encoded: mapping raw user id -> row index in
            ``user_weights``.
        user2user_decoded: mapping row index -> raw user id.
        n: number of users requested.
        return_dist: when True, return the raw scores and selected row
            indices instead of a DataFrame.
        neg: when True, select the *least* similar users instead.

    Returns:
        ``(dists, closest)`` when ``return_dist`` is True; otherwise a
        DataFrame with columns ``similar_users`` and ``similarity`` sorted by
        descending similarity, with the query user removed. ``None`` (after
        printing a message) when the user is unknown, nothing was selected,
        or an unexpected error occurs.
    """
    try:
        index = input_item
        encoded_index = user2user_encoded.get(index)
        if encoded_index is None:
            print(f"Error: user {input_item!r} not found")
            return None

        weights = user_weights
        dists = np.dot(weights, weights[encoded_index])
        sorted_dists = np.argsort(dists)

        if neg:
            # The query is the most similar item to itself, so it is not among
            # the least similar ones: take exactly n.
            closest = sorted_dists[:n]
        else:
            # One extra slot because the query itself ranks first and is
            # dropped from the result below.
            closest = sorted_dists[-(n + 1):]

        print(f"Users Closest to {input_item}")

        if return_dist:
            return dists, closest

        # No isinstance(input_item, int) filter here: it dropped every row for
        # NumPy integer ids, and the encoder lookup above already validated
        # the input.
        SimilarityArray = [
            {"similar_users": user2user_decoded.get(close), "similarity": dists[close]}
            for close in closest
        ]

        if not SimilarityArray:
            # Without this guard the sort below fails with a bare "'similarity'" KeyError.
            print(f"Error: no similar users found for {input_item!r}")
            return None

        similar_users = pd.DataFrame(SimilarityArray).sort_values(by=["similarity"], ascending=False)
        similar_users = similar_users[similar_users["similar_users"] != index]
        return similar_users

    except Exception as e:
        print(f"Error: {e}")
        return None

In [None]:
# Five users whose learned embeddings are closest to user 11880.
find_similar_user(int(11880), user_weights, user2user_encoded, user2user_decoded, n=5)

In [None]:
def showWordCloud(all_genres):
    """Display a word cloud built from a word -> frequency mapping.

    Args:
        all_genres: mapping passed to ``WordCloud.generate_from_frequencies``.
    """
    cloud_builder = WordCloud(width=800, height=400, background_color='white', colormap='gnuplot')
    genre_cloud = cloud_builder.generate_from_frequencies(all_genres)

    plt.figure(figsize=(15, 8))
    plt.imshow(genre_cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
def getFavGenre(frame, plot=False):
    """Collect the individual genres listed in a frame's ``Genres`` column.

    Args:
        frame: DataFrame with a ``Genres`` column of comma-separated strings.
            It is not modified.
        plot: when True, also display a word cloud of the genre counts.

    Returns:
        List of genre names with surrounding whitespace removed, one entry
        per occurrence, in row order.
    """
    # The former `frame.dropna(inplace=False)` discarded its result and so did
    # nothing; rows without genres are handled by the isinstance check below.
    all_genres = defaultdict(int)
    genres_list = []

    for genres in frame["Genres"]:
        # Missing genres are NaN (a float), not a string: skip them.
        if not isinstance(genres, str):
            continue
        for genre in genres.split(","):
            # Strip once so the returned list and the counter use the same
            # spelling ("Sci-Fi", not " Sci-Fi").
            genre = genre.strip()
            if not genre:
                continue
            genres_list.append(genre)
            all_genres[genre] += 1

    if plot:
        showWordCloud(all_genres)
    return genres_list

In [None]:
# Demo: genres of the anime with anime_id 1, shown as a word cloud.
x = getAnimeFrame(1,anime)
getFavGenre(x, plot=True)

In [None]:
# Reminder of the ratings frame layout before the user-preference helpers.
ratings_df.head()

In [None]:
def get_user_preferences(user_id, ratings_df, verbose=0, plot=False):

    animes_watched_by_user = ratings_df[ratings_df["user_id"] == user_id]