In [6]:
import numpy as np
import pandas as pd
import math
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from numpy import nan
import re
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from numpy import vstack
from sympy.physics.quantum.identitysearch import scipy
from sklearn.metrics.pairwise import linear_kernel

In [7]:
def extract_year(date_str):
    date_str = str(date_str)
    if 'Unknown' in date_str:
        return nan  
    years = re.findall(r'\b(19\d{2}|20\d{2})\b', date_str)
    if len(years) == 2:  
        return (int(years[0]) + int(years[1])) // 2  
    elif years:
        return int(years[0])  
    else:
        return nan  # In case of a parsing error



In [8]:
def preprocess(anime_list): #TODO : Gérer les cas limites 
    anime_list = anime_list.copy()

    ## Dropping columns
    columns_to_keep = ['anime_id', 'Name', 'Genres', 'Synopsis', 'Episodes', 'Aired', 'Studios', 'Duration', 'Rating']
    anime_list = anime_list[columns_to_keep]

    ## Dealing with Genres : use one-hot encoding
    all_genres = set()
    for genres in anime_list['Genres']:
        all_genres.update(genres.split(', '))
    for genre in all_genres:
        anime_list[genre] = anime_list['Genres'].apply(lambda x: 1 if genre in x.split(', ') else 0)
    anime_list.drop(columns=['Genres'], inplace=True)

    ## Dealing with Episodes and Duration : calculate total length
    anime_list['Episodes'] = pd.to_numeric(anime_list['Episodes'], errors='coerce').fillna(0) #case to deal with
    hours = anime_list['Duration'].str.extract(r'(\d+) hr', expand=False).astype(float)
    minutes = anime_list['Duration'].str.extract(r'(\d+) min', expand=False).astype(float)
    hours.fillna(0, inplace=True)
    minutes.fillna(0, inplace=True)
    anime_list['Duration'] = hours * 60 + minutes
    anime_list['Total_Duration'] = anime_list['Duration'] * anime_list['Episodes']
    anime_list.drop(columns=['Episodes'], inplace=True)
    anime_list.drop(columns=['Duration'], inplace=True)

    ## Dealing with Aired => get starting date
    anime_list['Start_Date'] = pd.to_datetime(anime_list['Aired'].str.split(' to ').str[0], errors='coerce')
    anime_list['Aired'] = anime_list['Aired'].apply(extract_year)
    middle_year = anime_list['Aired'].median()
    anime_list['Aired'].fillna(middle_year, inplace=True)
    anime_list['Aired'] = anime_list['Aired'].astype(int)
    
    
  

    ## Dealing with Studios => keep the first (for now)
    anime_list['Studios'] = anime_list['Studios'].str.split(',', expand=True)[0]
    anime_list['Studios'] = anime_list['Studios'].str.strip()

    ## Dealing with Rating => simplify
    anime_list.loc[anime_list['Rating'].str.contains('G'), 'Rating'] = 'All Ages'
    anime_list.loc[anime_list['Rating'].str.contains('PG-13'), 'Rating'] = 'Teen'
    anime_list.loc[anime_list['Rating'].str.contains('PG'), 'Rating'] = 'Children'
    anime_list.loc[anime_list['Rating'].str.contains('Rx'), 'Rating'] = 'Hentai'
    anime_list.loc[anime_list['Rating'].str.contains(r'R\+'), 'Rating'] = 'Adult'
    anime_list.loc[anime_list['Rating'].str.contains('R'), 'Rating'] = 'Young Adult'
    anime_list.loc[~anime_list['Rating'].isin(['All Ages', 'Children', 'Teen', 'Young Adult', 'Hentai', 'Adult']), 'Rating'] = 'Other'

    ## Dealing with synopsis
    #anime_list['Synopsis'] = anime_list['Synopsis'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)).lower())
    stop_words = set(stopwords.words('english'))
    #anime_list['Synopsis'] = anime_list['Synopsis'].apply(lambda x  : ' '.join([word for word in x.split() if word not in stop_words]))
    lemmatizer = WordNetLemmatizer()
    #anime_list['Synopsis'] = anime_list['Synopsis'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
    
    anime_list['Synopsis'] = anime_list['Synopsis'].fillna('')
    anime_list['Synopsis'] = anime_list['Synopsis'].str.lower()
    anime_list['Synopsis'] = anime_list['Synopsis'].str.replace(r'[^\w\s]+', '')
    
    #anime_list['Synopsis'] = anime_list['Synopsis'].apply(lambda x: ' '.join([word for word, pos in pos_tag(word_tokenize(x)) if pos.startswith(('JJ', 'NN', 'VB', 'RB'))]))

    return anime_list

In [10]:
def show(anime_list):
    nbl, nbc = anime_list.shape
    print("\nNombre de lignes :", nbl)
    print("\nNombre de colonnes :", nbc)
    print("\nInfos\n")
    print(anime_list.info())
    print("\nDescribe\n")
    print(anime_list.describe())
    print("\nHead\n")
    print(anime_list.head(40))

In [56]:
fav_anime_list = [21]
anime_list = pd.read_parquet('anime/anime.parquet')
anime_list = preprocess(anime_list)
show(anime_list)


Nombre de lignes : 24903

Nombre de colonnes : 30

Infos

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24903 entries, 0 to 24902
Data columns (total 30 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   anime_id        24903 non-null  int64         
 1   Name            24903 non-null  object        
 2   Synopsis        24903 non-null  object        
 3   Aired           24903 non-null  int32         
 4   Studios         24903 non-null  object        
 5   Rating          24903 non-null  object        
 6   Fantasy         24903 non-null  int64         
 7   Girls Love      24903 non-null  int64         
 8   Sci-Fi          24903 non-null  int64         
 9   Award Winning   24903 non-null  int64         
 10  Romance         24903 non-null  int64         
 11  Supernatural    24903 non-null  int64         
 12  Sports          24903 non-null  int64         
 13  Action          24903 non-null  int64         


In [36]:
anime_list[anime_list['Name'] == '']

Unnamed: 0,anime_id,Name,Synopsis,Aired,Studios,Rating,Boys Love,Slice of Life,Mystery,Girls Love,...,Award Winning,Hentai,Sci-Fi,Ecchi,Supernatural,Horror,Erotica,Adventure,Total_Duration,Start_Date
0,1,Cowboy Bebop,crime timeless year 2071 humanity expanded acr...,1998,Sunrise,Young Adult,0,0,0,0,...,1,0,1,0,0,0,0,0,624.0,1998-04-03
1,5,Cowboy Bebop: Tengoku no Tobira,another day another bounty—such life often unl...,2001,Bones,Young Adult,0,0,0,0,...,0,0,1,0,0,0,0,0,115.0,2001-09-01
2,6,Trigun,vash stampede man 60000000000 bounty head reas...,1998,Madhouse,All Ages,0,0,0,0,...,0,0,1,0,0,0,0,1,624.0,1998-04-01
3,7,Witch Hunter Robin,robin sena powerful craft user drafted stnj—a ...,2002,Sunrise,All Ages,0,0,1,0,...,0,0,0,0,1,0,0,0,650.0,2002-07-03
4,8,Bouken Ou Beet,dark century people suffering rule devil vande...,2004,Toei Animation,All Ages,0,0,0,0,...,0,0,0,0,1,0,0,1,1196.0,2004-09-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24898,55731,Wu Nao Monu,description available anime,2023,UNKNOWN,All Ages,0,1,0,0,...,0,0,0,0,0,0,0,0,0.0,2023-07-04
24899,55732,Bu Xing Si: Yuan Qi,description available anime,2023,UNKNOWN,All Ages,0,0,0,0,...,0,0,0,0,0,0,0,1,0.0,2023-07-27
24900,55733,Di Yi Xulie,description available anime,2023,UNKNOWN,All Ages,0,0,0,0,...,0,0,1,0,0,0,0,1,0.0,2023-07-19
24901,55734,Bokura no Saishuu Sensou,music video song bokura saishuu sensou shannon,2022,UNKNOWN,All Ages,0,0,0,0,...,0,0,0,0,0,0,0,0,3.0,2022-04-23


In [12]:
def recommend_anime(similarities_tab):
    sorted_df = similarities_tab.sort_values(by='similarity', ascending=False)
    #sorted_df = similarities_tab.sort_values(by='total_similarity', ascending=False)
    top_anime_ids = sorted_df.head(30)['anime_id'].tolist()
    recommended_animes = []
    for anime_id in top_anime_ids:
        anime_name = anime_list.loc[anime_list['anime_id'] == anime_id, 'Name'].iloc[0]
        recommended_animes.append({'anime_id': anime_id, 'Name': anime_name})
    return pd.DataFrame(recommended_animes)

In [13]:
### Synopsis

def extract_keywords(anime_ids, anime_list):
    # récupérer tous les synopsis des animes favoris
    fav_anime_synopsis = anime_list.loc[anime_list['anime_id'].isin(anime_ids), 'Synopsis'].tolist()
    # concaténer l'ensemble de ces synopsis
    fav_anime_synopsis = ' '.join(fav_anime_synopsis)
    # récupérer les mots clés
    fav_anime_keywords = fav_anime_synopsis.split()
    fav_anime_keywords = [word.translate(str.maketrans('', '', string.punctuation)).lower() for word in fav_anime_keywords]
    stop_words = set(stopwords.words('english'))
    fav_anime_keywords = [word for word in fav_anime_keywords if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    fav_anime_keywords = [lemmatizer.lemmatize(word) for word in fav_anime_keywords]
    ## add here more advanced analysis ?
    return ' '.join(fav_anime_keywords)


def recommendation_synopsis_based(fav_anime_list, anime_list):
    # Extraction de mots-clés des synopsis des animes favoris
    fav_anime_keywords = extract_keywords(fav_anime_list, anime_list)

    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values

    # Calcul de la similarité cosinus entre les mots-clés générés des animes favoris et les synopsis de tous les autres animes
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix_other_anime = tfidf_vectorizer.fit_transform(anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'Synopsis'])
    tfidf_matrix_fav_anime = tfidf_vectorizer.transform([fav_anime_keywords]) 
    cosine_similarities = cosine_similarity(tfidf_matrix_other_anime, tfidf_matrix_fav_anime)

    return pd.DataFrame({'anime_id': anime_ids, 'similarity': cosine_similarities.flatten()})


synopsis_cosine_similarities_tab = recommendation_synopsis_based(fav_anime_list, anime_list)
recommended_animes = recommend_anime(synopsis_cosine_similarities_tab)
print(recommended_animes[['anime_id', 'Name']])

    anime_id                                               Name
0      36215  One Piece: Episode of East Blue - Luffy to 4-n...
1       8740             One Piece Film: Strong World Episode 0
2      38234                       One Piece Movie 14: Stampede
3       5252                      One Piece: Romance Dawn Story
4      19505                                       Kaizoku Ouji
5      12859                                  One Piece Film: Z
6       4155                       One Piece Film: Strong World
7      18315                   Nareuneun Dwaeji - Haejeok Mateo
8      33165                                    Mashou no Nie 3
9       2699                    Uchuu Kaizoku Mito no Daibouken
10      1237  One Piece: Oounabara ni Hirake! Dekkai Dekkai ...
11      2618                                         Takarajima
12     15335  Gintama Movie 2: Kanketsu-hen - Yorozuya yo Ei...
13     25161  One Piece 3D2Y: Ace no shi wo Koete! Luffy Nak...
14     14817              Mouretsu Pirat

In [14]:
  
  tfidf = TfidfVectorizer(stop_words='english')
  tfidf_matrix = tfidf.fit_transform(anime_list['Synopsis'])
  cosine_synopsis = linear_kernel(tfidf_matrix, tfidf_matrix)
  


In [167]:
cosine_synopsis.shape

(24903, 24903)

In [15]:
indices = pd.Series(anime_list.index, index=anime_list['Name']).drop_duplicates()
# For one Anime in the list 
def get_recommendations(title, cosine_sim=cosine_synopsis, suggest_amount=15):
    try:
        idx = indices[title]
        print(idx)
    except KeyError:
        raise ValueError("Anime  not found in matrix.")


    return get_recommendations_by_id(idx,cosine_sim, suggest_amount)


def get_recommendations_by_id(idx, cosine_sim=cosine_synopsis, suggest_amount=15) : 
    sim_scores = list(enumerate(cosine_sim[idx])) 
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    max_amount = len(sim_scores)
    if suggest_amount > max_amount:
        suggest_amount = max_amount
    
    sim_scores = sim_scores[1:suggest_amount]
    
    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]
    
    # Map indices to anime_id
    anime_ids = anime_list.iloc[anime_indices]['anime_id'].values
    
    # Calculate the normalized scores (L2 normalization)
    scores = np.array([i[1] for i in sim_scores])
    norm_scores = scores / np.linalg.norm(scores)
    
    ret_df = pd.DataFrame({
        'Anime' : anime_list.iloc[anime_indices]['Name'] , 
        'Score' : norm_scores , 
        'anime_id': anime_ids , 
        'Position' : range(1 ,  suggest_amount)
    }
    )
    return ret_df

In [28]:
get_recommendations('demon slayer')

24872


Unnamed: 0,Anime,Score,anime_id,Position
2413,"The☆Doraemons: Strange, Sweets, Strange?",0.267261,2628,1
2414,Doraemon and Itchy the Stray,0.267261,2629,2
2417,Doraemon: Time Machine de Oshougatsu,0.267261,2632,3
2418,The☆Doraemons: Dokidoki Kikansha Daibakusou!,0.267261,2633,4
2419,Doraemon: Doraemon Comes Back,0.267261,2634,5
2420,"Dorami-chan: Wow, The Kid Gang of Bandits",0.267261,2636,6
2422,Dorami-chan: A Blue Straw Hat,0.267261,2642,7
2423,Dorami & Doraemons: Space Land's Critical Event,0.267261,2645,8
2430,Doraemon: The Day When I Was Born,0.267261,2652,9
2431,Doraemon: Ganbare! Gian!!,0.267261,2653,10


In [192]:
#Synopis but with another method

#def recommendation_synopsis_based(fav_anime_list, anime_list , cosine_sim=cosine_synopsis, ssuggest_amount = 15):
    # Extraction de mots-clés des synopsis des animes favoris

fav_indices = anime_list[anime_list['anime_id'].isin(fav_anime_list)].index
all_sim_scores = []

# Compute average similarity scores from all provided indices
for idx in fav_indices:
    sim_scores = list(enumerate(cosine_synopsis[idx]))
    print(len(sim_scores))
    all_sim_scores.append(sim_scores)
    
# Calculate the mean of the similarity scores across all provided indices
mean_sim_scores = np.mean(np.array([[score for _, score in item] for item in all_sim_scores]), axis=0)

#print(len(mean_sim_scores))
# Create a list of tuples (index, mean score)
#mean_sim_scores = list(enumerate(mean_sim_scores))

# Sort the scores in descending order
#mean_sim_scores = sorted(mean_sim_scores, key=lambda x: x[1], reverse=True)

#filtered_scores = [(idx, score) for idx, score in mean_sim_scores if idx not in id_list][:]


24903
24903


In [34]:
def get_recommendations_Synopsis_based(fav_anime_list, cosine_sim=cosine_synopsis, suggest_amount=15):
    # Initialize a list to collect all similarity scores
    all_sim_scores = []
    id_list = anime_list[anime_list['anime_id'].isin(fav_anime_list)].index
    # Compute average similarity scores from all provided indices
    for idx in id_list:
        sim_scores = list(enumerate(cosine_sim[idx]))
        all_sim_scores.append(sim_scores)

    # Calculate the mean of the similarity scores across all provided indices
    mean_sim_scores = np.mean(np.array([[score for _, score in item] for item in all_sim_scores]), axis=0)

    # Create a list of tuples (index, mean score)
    mean_sim_scores = list(enumerate(mean_sim_scores))

    # Sort the scores in descending order
    mean_sim_scores = sorted(mean_sim_scores, key=lambda x: x[1], reverse=True)
    
    
    '''
    # Adjust the number of suggestions based on the length of scores
    max_amount = len(mean_sim_scores)
    if suggest_amount > max_amount:
        suggest_amount = max_amount
    '''

    # Exclude the indices that were part of the input to avoid self-recommendation
    filtered_scores = [(idx, score) for idx, score in mean_sim_scores if idx not in id_list]
    

    # Get the anime indices
    anime_indices = [i[0] for i in filtered_scores]

    # Map indices to anime_id
    anime_ids = anime_list.iloc[anime_indices]['anime_id'].values
    
    # Calculate the normalized scores (L2 normalization)
    scores = np.array([i[1] for i in filtered_scores])
    norm_scores = scores / np.linalg.norm(scores)
    
    ret_df = pd.DataFrame({
        'Anime' : anime_list.iloc[anime_indices]['Name'],
        'Score' : norm_scores,
        'anime_id' : anime_ids,
    
    })

    return ret_df
get_recommendations_Synopsis_based([38000])

Unnamed: 0,Anime,Score,anime_id
22047,Kimetsu no Yaiba: Katanakaji no Sato-hen,0.129918,51019
16059,Kimetsu no Yaiba Movie: Mugen Ressha-hen,0.107979,40456
20429,Kimetsu no Yaiba: Yuukaku-hen,0.081337,47778
21532,Kimetsu no Yaiba: Mugen Ressha-hen,0.079096,49926
20152,Chuukou Ikkan!! Kimetsu Gakuen Monogatari: Val...,0.063208,47398
...,...,...,...
24898,Wu Nao Monu,0.000000,55731
24899,Bu Xing Si: Yuan Qi,0.000000,55732
24900,Di Yi Xulie,0.000000,55733
24901,Bokura no Saishuu Sensou,0.000000,55734


In [2]:
get_recommendations('One Piece')

NameError: name 'get_recommendations' is not defined

In [149]:
anime_list[anime_list['Name'] == 'Tokyo Ghoul']

Unnamed: 0,anime_id,Name,Synopsis,Aired,Studios,Rating,Boys Love,Slice of Life,Mystery,Girls Love,...,Award Winning,Hentai,Sci-Fi,Ecchi,Supernatural,Horror,Erotica,Adventure,Total_Duration,Start_Date
8619,22319,Tokyo Ghoul,sinister threat invading tokyo flesheating gho...,2014,Pierrot,Young Adult,0,0,0,0,...,0,0,0,0,0,1,0,0,288.0,2014-07-04


In [101]:


# This code is for debug
fav_anime_list = [1  ]

anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values
similarities = []

Fav_Date =  anime_list.loc[anime_list['anime_id'].isin(fav_anime_list)  , 'Aired' ].mean()
Dates  = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list)  , 'Aired']

    

for Date in Dates :
    similarities.append( abs(Date -  Fav_Date)) 

similarities_array = np.array(similarities).reshape(-1, 1)

 # Créer l'instance de MinMaxScaler
scaler = MinMaxScaler()
    
# Normaliser les similarités
normalized_similarities = scaler.fit_transform(similarities_array)

normalized_similarities

#max(normalized_similarities)


array([[-1.42364847],
       [-1.77263284],
       [-1.30732034],
       ...,
       [ 1.13557029],
       [ 1.01924216],
       [ 1.01924216]])

In [77]:

anime_list[anime_list['anime_id'] == 53164]
    

Unnamed: 0,anime_id,Name,Synopsis,Aired,Studios,Rating,Fantasy,Girls Love,Sci-Fi,Award Winning,...,UNKNOWN,Adventure,Gourmet,Avant Garde,Boys Love,Suspense,Comedy,Slice of Life,Total_Duration,Start_Date
23277,53164,Come On Tonight,music video for the song come on tonight by wa...,2019,UNKNOWN,All Ages,0,0,0,0,...,1,0,0,0,0,0,0,0,2.0,2019-10-04


In [70]:

fav_anime_list = [38000]
anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values
similarities = []
Fav_Date =  anime_list.loc[anime_list['anime_id'].isin(fav_anime_list)  , 'Aired' ].mean()
Dates  = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list)  , 'Aired']

for Date in Dates :
    similarities.append(abs(Date -  Fav_Date)) 

pd1 = pd.DataFrame({'anime_id': anime_ids, 'similarity': similarities})
sorted_df = pd1.sort_values(by='similarity', ascending=False)
#sorted_df = similarities_tab.sort_values(by='total_similarity', ascending=False)
top_anime_ids = sorted_df.head(30)['anime_id'].tolist()
sorted_df

#recommended_animes = []
#for anime_id in top_anime_ids:
    #anime_name = anime_list.loc[anime_list['anime_id'] == anime_id, 'Name'].iloc[0]
    #recommended_animes.append({'anime_id': anime_id, 'Name': anime_name})



Unnamed: 0,anime_id,similarity
15889,40152,0.0
22776,52335,0.0
20526,47908,0.0
22781,52342,0.0
22782,52343,0.0
...,...,...
7626,17387,102.0
9127,24575,102.0
9128,24577,102.0
8833,23183,102.0


In [75]:
## Date (Aired column)
def recommendation_date_based(fav_anime_list, anime_list):
    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values
    similarities = []

    Fav_Date =  anime_list.loc[anime_list['anime_id'].isin(fav_anime_list)  , 'Aired' ].mean()
    Dates  = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list)  , 'Aired']

    for Date in Dates :
        similarities.append(abs(Date -  Fav_Date)) 
    similarities_array = np.array(similarities).reshape(-1, 1)

    # Créer l'instance de MinMaxScaler
    scaler = MinMaxScaler()
    
    # Normaliser les similarités
    normalized_similarities = scaler.fit_transform(similarities_array)
    
    normalized_similarities  = 1 - normalized_similarities

# Redimensionner pour revenir à une liste simple si nécessaire
    normalized_similarities = normalized_similarities.flatten().tolist()
    return pd.DataFrame({'anime_id': anime_ids, 'similarity': normalized_similarities})

Date_Similarities = recommendation_date_based([38000], anime_list)
recommended_animes = recommend_anime(Date_Similarities)
print(recommended_animes[['anime_id', 'Name']])

    anime_id                                               Name
0      53164                                    Come On Tonight
1      40031                             Mugyutto! Black Clover
2      40033                                          Animagear
3      37509                                                W'z
4      38935                                        Miru Tights
5      38572                                    Assassins Pride
6      37510                                  Mob Psycho 100 II
7      40239         Arifureta Shokugyou de Sekai Saikyou Recap
8      40030                       Tayo-ui Jangnangam Adventure
9      38562  The iDOLM@STER SideM: Wake Atte Mini! - Shoshi...
10     37514          Made in Abyss Movie 1: Tabidachi no Yoake
11     40029                                 Broadcast by Bibby
12     52453                     Koumori Bat wa Good na Shinshi
13     42988                                          New Tribe
14     42985              Ling Long: Inc

In [43]:
### Genre

def recommendation_genre_based(fav_anime_list, anime_list):
    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values
    similarities = []

    fav_genres = anime_list.loc[anime_list['anime_id'].isin(fav_anime_list), ~anime_list.columns.isin(['anime_id', 'Name', 'Total_Duration', 'Start_Date', 'Studios', 'Rating', 'Synopsis'])].sum()
    fav_genres_prop = fav_genres / fav_genres.sum()

    other_anime_genres = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), ~anime_list.columns.isin(['Name', 'Total_Duration', 'Start_Date', 'Studios', 'Rating', 'Synopsis'])]
    for _, row in other_anime_genres.drop(columns=['anime_id']).iterrows():
        genre_similarity = sum(row[genre] * fav_genres_prop[genre] for genre in fav_genres_prop.index)
        similarities.append(genre_similarity)
       
    return pd.DataFrame({'anime_id': anime_ids, 'similarity': similarities})

genre_cosine_similarities_tab = recommendation_genre_based(fav_anime_list, anime_list)
recommended_animes = recommend_anime(genre_cosine_similarities_tab)
print(genre_cosine_similarities_tab)

       anime_id   similarity
0             1  1995.505121
1             6  1995.505121
2             7  1999.499875
3             8  2001.496877
4            15  2003.494379
...         ...          ...
24896     55731  2020.473145
24897     55732  2020.473395
24898     55733  2020.473895
24899     55734  2019.474394
24900     55735  2019.474394

[24901 rows x 2 columns]


In [8]:
### Duration

def recommendation_duration_based(fav_anime_list, anime_list):
    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values
    similarities = []
    
    
    
    avg_fav_duration = anime_list.loc[anime_list['anime_id'].isin(fav_anime_list), 'Total_Duration'].mean()
    print(avg_fav_duration)

    other_anime_durations = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'Total_Duration']

    for duration in other_anime_durations:
        relative_difference = abs(duration - avg_fav_duration) / max(duration, avg_fav_duration)
        duration_similarity = 1 - relative_difference
        similarities.append(duration_similarity)

    return pd.DataFrame({'anime_id': anime_ids, 'similarity': similarities})


duration_cosine_similarities_tab = recommendation_duration_based(fav_anime_list, anime_list)
recommended_animes = recommend_anime(duration_cosine_similarities_tab)
print(recommended_animes[['anime_id', 'Name']])

0.0
    anime_id                                               Name
0          1                                       Cowboy Bebop
1      40665   Shoujo☆Kageki Revue Starlight: Rondo Rondo Rondo
2      40661                 Jashin-chan Dropkick': Chitose-hen
3      40660                                              Start
4      40659  Senki Zesshou Symphogear XV: Senki Zesshou Shi...
5      40658                                          ** Kouhai
6      40656        Girls & Panzer: Saishuushou Part 2 Specials
7      40655                                    Melon no Kirime
8      40651                          Kandagawa Jet Girls Recap
9      40646                             Yes ka No ka Hanbun ka
10     40645                                     Kaigo to Ikiru
11     40644                              Miki no Mikoto (2020)
12     40642                                    Yakusoku (2011)
13     40641                             Little Wonders: Sneeze
14     40639           Tonari no Ie 

  relative_difference = abs(duration - avg_fav_duration) / max(duration, avg_fav_duration)


In [10]:
## Combination

print(genre_cosine_similarities_tab)
print(duration_cosine_similarities_tab)
print(synopsis_cosine_similarities_tab)

combined_tab = pd.merge(genre_cosine_similarities_tab, duration_cosine_similarities_tab, on='anime_id', suffixes=('_genre', '_duration'))
combined_tab = pd.merge(combined_tab, synopsis_cosine_similarities_tab, on='anime_id', suffixes=('', '_synopsis'))
combined_tab['total_similarity'] = 0.2*combined_tab['similarity_genre'] + 0.2*combined_tab['similarity_duration'] + 0.6*combined_tab['similarity']

print(combined_tab)

recommended_animes = recommend_anime(combined_tab)
print(recommended_animes[['anime_id', 'Name']])

       anime_id  similarity
0             1    0.333333
1             5    0.333333
2             6    0.666667
3             7    0.333333
4             8    0.666667
...         ...         ...
24897     55731    0.333333
24898     55732    1.000000
24899     55733    1.000000
24900     55734    0.000000
24901     55735    0.000000

[24902 rows x 2 columns]
       anime_id  similarity
0             1         0.0
1             5         0.0
2             6         0.0
3             7         0.0
4             8         0.0
...         ...         ...
24897     55731         NaN
24898     55732         NaN
24899     55733         NaN
24900     55734         0.0
24901     55735         0.0

[24902 rows x 2 columns]
       anime_id  similarity
0             1    0.013469
1             5    0.026405
2             6    0.027187
3             7    0.005047
4             8    0.031917
...         ...         ...
24897     55731    0.000000
24898     55732    0.000000
24899     55733    0.000