In [1]:
import numpy as np
import pandas as pd
import math
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from numpy import nan
import re
from numpy import vstack
from sympy.physics.quantum.identitysearch import scipy
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import words

In [2]:
def extract_year(date_str):
    date_str = str(date_str)
    if 'Unknown' in date_str:
        return nan  
    years = re.findall(r'\b(19\d{2}|20\d{2})\b', date_str)
    if len(years) == 2:  
        return (int(years[0]) + int(years[1])) // 2  
    elif years:
        return int(years[0])  
    else:
        return nan  # In case of a parsing error

In [3]:
def preprocess(anime_list): 
    anime_list = anime_list.copy()

    ## Dropping columns
    columns_to_keep = ['anime_id', 'Name', 'Genres', 'Synopsis', 'Episodes', 'Aired', 'Studios', 'Duration', 'Rating', 'Type', 'Source']
    anime_list = anime_list[columns_to_keep]

    ## Dealing with Genres : use one-hot encoding 
    all_genres = set()
    for genres in anime_list['Genres']:
        all_genres.update(genres.split(', '))
    for genre in all_genres:
        anime_list["Genre " +genre] = anime_list['Genres'].apply(lambda x: 1 if genre in x.split(', ') else 0)
    anime_list.drop(columns=['Genres'], inplace=True)

    ## Dealing with Episodes and Duration : calculate total length
    anime_list['Episodes'] = pd.to_numeric(anime_list['Episodes'], errors='coerce').fillna(0) #0 if UNKNOWN episodes
    hours = anime_list['Duration'].str.extract(r'(\d+) hr', expand=False).astype(float)
    minutes = anime_list['Duration'].str.extract(r'(\d+) min', expand=False).astype(float)
    hours.fillna(0, inplace=True)
    minutes.fillna(0, inplace=True)
    anime_list['Duration'] = hours * 60 + minutes #0 if UNKNOWN duration
    anime_list['Total_Duration'] = anime_list['Duration'] * anime_list['Episodes']

    ## Dealing with Aired
    anime_list['Aired'] = anime_list['Aired'].apply(extract_year)
    middle_year = anime_list['Aired'].median()
    anime_list['Aired'] = anime_list['Aired'].fillna(middle_year)
    anime_list['Aired'] = anime_list['Aired'].astype(int)

    ## Dealing with Rating => use one-hot encoding
    all_ratings = set()
    for rating in anime_list['Rating']:
        all_ratings.update(rating.split(', '))
    for rating in all_ratings:
        anime_list["Rating " + rating] = anime_list['Rating'].apply(lambda x: 1 if rating in x.split(', ') else 0)
    anime_list.drop(columns=['Rating'], inplace=True)

    ## Dealing with Type => use one-hot encoding
    all_types = set()
    for type in anime_list['Type']:
        all_types.update(type.split(', '))
    for type in all_types:
        anime_list["Type " + type] = anime_list['Type'].apply(lambda x: 1 if type in x.split(', ') else 0)
    anime_list.drop(columns=['Type'], inplace=True)

    ## Dealing with Source => use one-hot encoding
    all_sources = set()
    for source in anime_list['Source']:
        all_sources.update(source.split(', '))
    for source in all_sources:
        anime_list["Source " + source] = anime_list['Source'].apply(lambda x: 1 if source in x.split(', ') else 0)
    anime_list.drop(columns=['Source'], inplace=True)

    ## Dealing with synopsis
    anime_list['Synopsis'] = anime_list['Synopsis'].str.replace(r'[^\w\s]+', '')
    anime_list['Synopsis'] = anime_list['Synopsis'].str.replace('No description available for this anime.', '')
    anime_list['Synopsis'] = anime_list['Synopsis'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)).lower())
    stop_words = set(stopwords.words('english'))
    anime_list['Synopsis'] = anime_list['Synopsis'].apply(lambda x  : ' '.join([word for word in x.split() if word not in stop_words]))
    lemmatizer = WordNetLemmatizer()
    anime_list['Synopsis'] = anime_list['Synopsis'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

    return anime_list

In [4]:
def show(anime_list):
    nbl, nbc = anime_list.shape
    print("\nNombre de lignes :", nbl)
    print("\nNombre de colonnes :", nbc)
    print("\nInfos\n")
    print(anime_list.info())
    print("\nDescribe\n")
    print(anime_list.describe())
    print("\nHead\n")
    print(anime_list.head(40))

In [5]:
#fav_anime_list = [21, 16498, 31964, 38000, 136]
fav_anime_list = [38000]
anime_list = pd.read_parquet('anime/anime.parquet')
anime_list = preprocess(anime_list)
show(anime_list)


Nombre de lignes : 24903

Nombre de colonnes : 61

Infos

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24903 entries, 0 to 24902
Data columns (total 61 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   anime_id                               24903 non-null  int64  
 1   Name                                   24903 non-null  object 
 2   Synopsis                               24903 non-null  object 
 3   Episodes                               24903 non-null  float64
 4   Aired                                  24903 non-null  int32  
 5   Studios                                24903 non-null  object 
 6   Duration                               24903 non-null  float64
 7   Genre Erotica                          24903 non-null  int64  
 8   Genre Award Winning                    24903 non-null  int64  
 9   Genre Boys Love                        24903 non-null  int64  
 10  Genre Girls

In [6]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(anime_list['Synopsis'])
cosine_synopsis = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
def adjust_dispersion(df, factor=0.25):
    ## Update df, which have values between 0 and 1, to adjust dispersion relatively to 0.5 to a fixed factor, while keeping the values between 0 and 1

    # Calculate the current mean absolute deviation from 0.5
    current_mad = np.abs(df['similarity'] - 0.5).mean()
    
    # Scale the values to achieve the desired dispersion relative to 0.5
    scaled_values = df['similarity'] + (0.5 - df['similarity']) * (factor / current_mad)
    
    # Ensure values are between 0 and 1
    scaled_values = np.clip(scaled_values, 0, 1)
    
    df['similarity'] = scaled_values
    
    return df

In [8]:
def recommend_anime(similarities_tab):
    sorted_df = similarities_tab.sort_values(by='similarity', ascending=False)
    top_anime_ids = sorted_df.head(30)['anime_id'].tolist()
    recommended_animes = []
    for anime_id in top_anime_ids:
        anime_name = anime_list.loc[anime_list['anime_id'] == anime_id, 'Name'].iloc[0]
        recommended_animes.append({'anime_id': anime_id, 'Name': anime_name})
    return pd.DataFrame(recommended_animes)

In [9]:
def recommend_anime_global(similarities_tab):
    sorted_df = similarities_tab.sort_values(by='total_similarity', ascending=False)
    top_anime_ids = sorted_df.head(30)['anime_id'].tolist()
    recommended_animes = []
    for anime_id in top_anime_ids:
        anime_name = anime_list.loc[anime_list['anime_id'] == anime_id, 'Name'].iloc[0]
        recommended_animes.append({'anime_id': anime_id, 'Name': anime_name})
    return pd.DataFrame(recommended_animes)

In [10]:
def recommendation_synopsis_based(fav_anime_list, anime_list, cosine_sim=cosine_synopsis):
    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values
    
    if not fav_anime_list:
        return pd.DataFrame({'anime_id': anime_ids, 'similarity': 0})

    # Initialize a list to collect all similarity scores
    all_sim_scores = []
    id_list = anime_list[anime_list['anime_id'].isin(fav_anime_list)].index
    # Compute average similarity scores from all provided indices
    for idx in id_list:
        sim_scores = list(enumerate(cosine_sim[idx]))
        all_sim_scores.append(sim_scores)

    # Calculate the mean of the similarity scores across all provided indices
    mean_sim_scores = np.mean(np.array([[score for _, score in item] for item in all_sim_scores]), axis=0)

    # Create a list of tuples (index, mean score)
    mean_sim_scores = list(enumerate(mean_sim_scores))

    # Exclude the indices that were part of the input to avoid self-recommendation
    filtered_scores = [(idx, score) for idx, score in mean_sim_scores if idx not in id_list][:]

    # Get the anime indices
    anime_indices = [i[0] for i in filtered_scores]

    # Map indices to anime_id
    anime_ids = anime_list.iloc[anime_indices]['anime_id'].values
    
    # Calculate the normalized scores (L2 normalization)
    scores = np.array([i[1] for i in filtered_scores])
    norm_scores = scores / np.linalg.norm(scores)

    return pd.DataFrame({'anime_id': anime_ids, 'similarity': norm_scores})

synopsis_cosine_similarities_tab = recommendation_synopsis_based(fav_anime_list, anime_list)
print(synopsis_cosine_similarities_tab)

synopsis_cosine_similarities_tab = adjust_dispersion(synopsis_cosine_similarities_tab)

print(synopsis_cosine_similarities_tab)
recommended_animes = recommend_anime(synopsis_cosine_similarities_tab)
print(recommended_animes[['anime_id', 'Name']])

       anime_id  similarity
0             1    0.008726
1             5    0.009855
2             6    0.005594
3             7    0.002020
4             8    0.004063
...         ...         ...
24897     55731    0.000000
24898     55732    0.000000
24899     55733    0.000000
24900     55734    0.000000
24901     55735    0.000000

[24902 rows x 2 columns]
       anime_id  similarity
0             1    0.255881
1             5    0.256442
2             6    0.254324
3             7    0.252549
4             8    0.253564
...         ...         ...
24897     55731    0.251545
24898     55732    0.251545
24899     55733    0.251545
24900     55734    0.251545
24901     55735    0.251545

[24902 rows x 2 columns]
    anime_id                                               Name
0      51019           Kimetsu no Yaiba: Katanakaji no Sato-hen
1      22137         Toukiden: The Age of Demons - Introduction
2      49926                 Kimetsu no Yaiba: Mugen Ressha-hen
3      47778        

In [11]:
def recommendation_date_based(fav_anime_list, anime_list):
    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values
    similarities = []

    fav_date =  anime_list.loc[anime_list['anime_id'].isin(fav_anime_list)  , 'Aired' ].mean()
    dates  = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list)  , 'Aired']

    for date in dates :
        similarities.append(abs(date -  fav_date))
    similarities_array = np.array(similarities).reshape(-1, 1)

    # Créer l'instance de MinMaxScaler
    scaler = MinMaxScaler()
    
    # Normaliser les similarités
    normalized_similarities = scaler.fit_transform(similarities_array)
    
    normalized_similarities  = 1 - normalized_similarities

    # Redimensionner pour revenir à une liste simple si nécessaire
    normalized_similarities = normalized_similarities.flatten().tolist()
    return pd.DataFrame({'anime_id': anime_ids, 'similarity': normalized_similarities})

date_cosine_similarities_tab = recommendation_date_based(fav_anime_list, anime_list)
date_cosine_similarities_tab = adjust_dispersion(date_cosine_similarities_tab)

print(date_cosine_similarities_tab)

recommended_animes = recommend_anime(date_cosine_similarities_tab)
print(recommended_animes[['anime_id', 'Name']])

       anime_id  similarity
0             1    0.606187
1             5    0.616806
2             6    0.606187
3             7    0.620346
4             8    0.627425
...         ...         ...
24897     55731    0.666360
24898     55732    0.666360
24899     55733    0.666360
24900     55734    0.669900
24901     55735    0.669900

[24902 rows x 2 columns]
    anime_id                                               Name
0      53164                                    Come On Tonight
1      40031                             Mugyutto! Black Clover
2      40033                                          Animagear
3      37509                                                W'z
4      38935                                        Miru Tights
5      38572                                    Assassins Pride
6      37510                                  Mob Psycho 100 II
7      40239         Arifureta Shokugyou de Sekai Saikyou Recap
8      40030                       Tayo-ui Jangnangam Adventur

In [12]:
### Genre
##Limit case : check that every anime in fav_anime_list has not Genre_UNKNOWN, if not delete this anime from the list to build fav_genres_prop

def recommendation_genre_based(fav_anime_list, anime_list):
    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values

    if not fav_anime_list:
        return pd.DataFrame({'anime_id': anime_ids, 'similarity': 0})
    
    similarities = []

    fav_genres = anime_list.loc[anime_list['anime_id'].isin(fav_anime_list), anime_list.filter(regex='^Genre').columns].sum()
    fav_genres_prop = fav_genres / fav_genres.sum()

    other_anime_genres = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), anime_list.filter(regex='^Genre').columns]
    for _, row in other_anime_genres.iterrows():
        genre_similarity = sum(row[genre] * fav_genres_prop[genre] for genre in fav_genres_prop.index)
        similarities.append(genre_similarity)
       
    return pd.DataFrame({'anime_id': anime_ids, 'similarity': similarities})

fav_anime_list = [anime_id for anime_id in fav_anime_list if anime_list.loc[anime_list['anime_id'] == anime_id, 'Genre UNKNOWN'].values[0] == 0]

genre_cosine_similarities_tab = recommendation_genre_based(fav_anime_list, anime_list)
genre_cosine_similarities_tab = adjust_dispersion(genre_cosine_similarities_tab)

print(genre_cosine_similarities_tab)

recommended_animes = recommend_anime(genre_cosine_similarities_tab)
print(recommended_animes[['anime_id', 'Name']])

       anime_id  similarity
0             1    0.558664
1             5    0.441336
2             6    0.441336
3             7    0.441336
4             8    0.441336
...         ...         ...
24897     55731    0.441336
24898     55732    0.558664
24899     55733    0.558664
24900     55734    0.324009
24901     55735    0.324009

[24902 rows x 2 columns]
    anime_id                                               Name
0        121                                Fullmetal Alchemist
1      40748                                     Jujutsu Kaisen
2      37623    Chiisana Eiyuu: Kani to Tamago to Toumei Ningen
3      50410                                One Piece Film: Red
4        164                                      Mononoke Hime
5        528              Pokemon Movie 01: Mewtwo no Gyakushuu
6       1033                                       Sennen Joyuu
7       2782                                        Fire Emblem
8       2786                                         Bakegyamo

In [13]:
### Rating
##Limit case : check that every anime in fav_anime_list has not Rating UNKNOWN, if not delete this anime from the list to build fav_ratings_prop

def recommendation_rating_based(fav_anime_list, anime_list):
    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values

    if not fav_anime_list:
        return pd.DataFrame({'anime_id': anime_ids, 'similarity': 0})
    
    similarities = []

    fav_ratings = anime_list.loc[anime_list['anime_id'].isin(fav_anime_list), anime_list.filter(regex='^Rating').columns].sum()
    fav_ratings_prop = fav_ratings / fav_ratings.sum()

    print(fav_ratings_prop)

    other_anime_ratings = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), anime_list.filter(regex='^Rating').columns]
    for _, row in other_anime_ratings.iterrows():
        rating_similarity = sum(row[rate] * fav_ratings_prop[rate] for rate in fav_ratings_prop.index)
        similarities.append(rating_similarity)
       
    return pd.DataFrame({'anime_id': anime_ids, 'similarity': similarities})

fav_anime_list = [anime_id for anime_id in fav_anime_list if anime_list.loc[anime_list['anime_id'] == anime_id, 'Rating UNKNOWN'].values[0] == 0]

rating_cosine_similarities_tab = recommendation_rating_based(fav_anime_list, anime_list)
rating_cosine_similarities_tab = adjust_dispersion(rating_cosine_similarities_tab)

print(rating_cosine_similarities_tab)

recommended_animes = recommend_anime(rating_cosine_similarities_tab)
print(recommended_animes[['anime_id', 'Name']])

Rating Rx - Hentai                       0.0
Rating R+ - Mild Nudity                  0.0
Rating PG-13 - Teens 13 or older         0.0
Rating PG - Children                     0.0
Rating G - All Ages                      0.0
Rating UNKNOWN                           0.0
Rating R - 17+ (violence & profanity)    1.0
dtype: float64
       anime_id  similarity
0             1        0.75
1             5        0.75
2             6        0.25
3             7        0.25
4             8        0.25
...         ...         ...
24897     55731        0.25
24898     55732        0.25
24899     55733        0.25
24900     55734        0.25
24901     55735        0.25

[24902 rows x 2 columns]
    anime_id                                               Name
0          1                                       Cowboy Bebop
1      47194                                  Summertime Render
2      52941                                        Gigi Goegoe
3      20187         Genei wo Kakeru Taiyou: Fumikom

In [14]:
### Type 
##Limit case : check that every anime in fav_anime_list has not "Type UNKNOWN", if not delete this anime from the list to build fav_types_prop

def recommendation_type_based(fav_anime_list, anime_list):
    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values

    if not fav_anime_list:
        return pd.DataFrame({'anime_id': anime_ids, 'similarity': 0})
    
    similarities = []

    fav_types = anime_list.loc[anime_list['anime_id'].isin(fav_anime_list), anime_list.filter(regex='^Type').columns].sum()
    fav_types_prop = fav_types / fav_types.sum()

    other_anime_types = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), anime_list.filter(regex='^Type').columns]
    for _, row in other_anime_types.iterrows():
        type_similarity = sum(row[type] * fav_types_prop[type] for type in fav_types_prop.index)
        similarities.append(type_similarity)
       
    return pd.DataFrame({'anime_id': anime_ids, 'similarity': similarities})

fav_anime_list = [anime_id for anime_id in fav_anime_list if anime_list.loc[anime_list['anime_id'] == anime_id, 'Type UNKNOWN'].values[0] == 0]

print(fav_anime_list)

type_cosine_similarities_tab = recommendation_type_based(fav_anime_list, anime_list)
type_cosine_similarities_tab = adjust_dispersion(type_cosine_similarities_tab)

print(type_cosine_similarities_tab)

recommended_animes = recommend_anime(type_cosine_similarities_tab)
print(recommended_animes[['anime_id', 'Name']])

[38000]
       anime_id  similarity
0             1        0.75
1             5        0.25
2             6        0.75
3             7        0.75
4             8        0.75
...         ...         ...
24897     55731        0.25
24898     55732        0.25
24899     55733        0.25
24900     55734        0.25
24901     55735        0.25

[24902 rows x 2 columns]
    anime_id                                               Name
0          1                                       Cowboy Bebop
1      19111          Love Live! School Idol Project 2nd Season
2      46576                Kuaile Baobei: Duo Mi Chengzhang Ji
3      46575                    Kuaile Baobei: Cheng Chang Riji
4      46574                        Kuaile Baobei: Huanle Jiaqi
5      46573                     Kuaile Baobei: Duocai Shenghuo
6      19221  Ore no Nounai Sentakushi ga, Gakuen Love Comed...
7      46572                       Kuaile Baobei: Xingqu Mofang
8      46571                       Kuaile Baobei: Huol

In [15]:
### Source 
##Limit case : check that every anime in fav_anime_list has not "Source Unknown", if not delete this anime from the list to build fav_sources_prop

def recommendation_source_based(fav_anime_list, anime_list):
    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values

    if not fav_anime_list:
        return pd.DataFrame({'anime_id': anime_ids, 'similarity': 0})
    
    similarities = []

    fav_sources = anime_list.loc[anime_list['anime_id'].isin(fav_anime_list), anime_list.filter(regex='^Source').columns].sum()
    fav_sources_prop = fav_sources / fav_sources.sum()

    other_anime_sources = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), anime_list.filter(regex='^Source').columns]
    for _, row in other_anime_sources.iterrows():
        source_similarity = sum(row[source] * fav_sources_prop[source] for source in fav_sources_prop.index)
        similarities.append(source_similarity)
       
    return pd.DataFrame({'anime_id': anime_ids, 'similarity': similarities})

fav_anime_list = [anime_id for anime_id in fav_anime_list if anime_list.loc[anime_list['anime_id'] == anime_id, 'Source Unknown'].values[0] == 0]
print(fav_anime_list)

source_cosine_similarities_tab = recommendation_source_based(fav_anime_list, anime_list)
source_cosine_similarities_tab = adjust_dispersion(source_cosine_similarities_tab)

print(source_cosine_similarities_tab)

recommended_animes = recommend_anime(source_cosine_similarities_tab)
print(recommended_animes[['anime_id', 'Name']])

[38000]
       anime_id  similarity
0             1        0.25
1             5        0.25
2             6        0.75
3             7        0.25
4             8        0.75
...         ...         ...
24897     55731        0.25
24898     55732        0.25
24899     55733        0.25
24900     55734        0.25
24901     55735        0.25

[24902 rows x 2 columns]
    anime_id                                               Name
0      37447                                    Karakuri Circus
1      15109                               Cuticle Tantei Inaba
2      15117                Kami nomi zo Shiru Sekai: Tenri-hen
3      51693                Kaminaki Sekai no Kamisama Katsudou
4      40254                                         XL Joushi.
5      40251    Cyborg 009: The Cyborg Soldier - Kami no Ubugoe
6      51705                                   Otonari ni Ginga
7      40250    Cyborg 009: The Cyborg Soldier - Yomi no Gunzou
8      15125                                          

In [16]:
### Duration
## Limit case : check that every anime in fav_anime_list has not UNKNOWN for episodes or duration, if not delete this anime from the list to build avg_fav_duration

def recommendation_duration_based(fav_anime_list, anime_list):

    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values

    if not fav_anime_list:
        return pd.DataFrame({'anime_id': anime_ids, 'similarity': 0})
    
    
    similarities = []

    avg_fav_duration = anime_list.loc[anime_list['anime_id'].isin(fav_anime_list), 'Total_Duration'].mean()

    other_anime_durations = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'Total_Duration']

    for duration in other_anime_durations:
        if duration != 0:
            relative_difference = abs(duration - avg_fav_duration) / max(duration, avg_fav_duration)
            duration_similarity = 1 - relative_difference
        else:
            duration_similarity = 0.5 #similarity equals 0.5 if duration equals 0 (meaning UNKNOW number of episodes or UNKNOW duration)
        similarities.append(duration_similarity)

    return pd.DataFrame({'anime_id': anime_ids, 'similarity': similarities})

#Filtering anime with missing information
fav_anime_list = [anime_id for anime_id in fav_anime_list if anime_list.loc[anime_list['anime_id'] == anime_id, 'Episodes'].values[0] != 0]
fav_anime_list = [anime_id for anime_id in fav_anime_list if anime_list.loc[anime_list['anime_id'] == anime_id, 'Duration'].values[0] != 0]

duration_cosine_similarities_tab = recommendation_duration_based(fav_anime_list, anime_list)
duration_cosine_similarities_tab = adjust_dispersion(duration_cosine_similarities_tab)
print(duration_cosine_similarities_tab)

recommended_animes = recommend_anime(duration_cosine_similarities_tab)
print(recommended_animes[['anime_id', 'Name']])

       anime_id  similarity
0             1    0.614393
1             5    0.423205
2             6    0.614393
3             7    0.604825
4             8    0.500000
...         ...         ...
24897     55731    0.500000
24898     55732    0.500000
24899     55733    0.500000
24900     55734    0.376460
24901     55735    0.376460

[24902 rows x 2 columns]
    anime_id                                     Name
0      43888                               26 Ge Mimi
1       1466                         Hakugei Densetsu
2      17505                              Mushibugyou
3       7525                            Kick Off 2002
4       1546                                 Negima!?
5      44035                          Ling Su Zhengba
6      44021                 Dianji Xiaozi 3rd Season
7      44020                 Dianji Xiaozi 2nd Season
8      44019                            Dianji Xiaozi
9       1502         Mahou Shoujo Pretty Sammy (1996)
10       340   Mutsu Enmei Ryuu Gaiden: Shur

In [17]:
def preprocess_fav_anime_list(fav_anime_list, anime_list, feature):
    if feature == 'genre':
        filtered_fav_anime_list = [anime_id for anime_id in fav_anime_list if anime_list.loc[anime_list['anime_id'] == anime_id, 'Genre UNKNOWN'].values[0] == 0]
    elif feature == 'duration':
        filtered_fav_anime_list = [anime_id for anime_id in fav_anime_list if anime_list.loc[anime_list['anime_id'] == anime_id, 'Episodes'].values[0] != 0 and anime_list.loc[anime_list['anime_id'] == anime_id, 'Duration'].values[0] != 0]
    elif feature == 'type':
        filtered_fav_anime_list = [anime_id for anime_id in fav_anime_list if anime_list.loc[anime_list['anime_id'] == anime_id, 'Type UNKNOWN'].values[0] == 0]
    elif feature == 'source':
        filtered_fav_anime_list = [anime_id for anime_id in fav_anime_list if anime_list.loc[anime_list['anime_id'] == anime_id, 'Source Unknown'].values[0] == 0]
    elif feature == 'rating':
        filtered_fav_anime_list = [anime_id for anime_id in fav_anime_list if anime_list.loc[anime_list['anime_id'] == anime_id, 'Rating UNKNOWN'].values[0] == 0]
    elif feature == 'synopsis':
        filtered_fav_anime_list = [anime_id for anime_id in fav_anime_list if anime_list.loc[anime_list['anime_id'] == anime_id, 'Synopsis'].values[0] != '']
    else:
        filtered_fav_anime_list = fav_anime_list
    return filtered_fav_anime_list

In [18]:
## Final combination
fav_anime_list = [21, 38000, 16498]

genre_cosine_similarities_tab = recommendation_genre_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'genre'), anime_list)
genre_cosine_similarities_tab = adjust_dispersion(genre_cosine_similarities_tab)

duration_cosine_similarities_tab = recommendation_duration_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'duration'), anime_list)
duration_cosine_similarities_tab = adjust_dispersion(duration_cosine_similarities_tab)

synopsis_cosine_similarities_tab = recommendation_synopsis_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'synopsis'), anime_list)
synopsis_cosine_similarities_tab = adjust_dispersion(synopsis_cosine_similarities_tab)

rating_cosine_similarities_tab = recommendation_source_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'rating'), anime_list)
rating_cosine_similarities_tab = adjust_dispersion(rating_cosine_similarities_tab)

type_cosine_similarities_tab = recommendation_type_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'type'), anime_list)
type_cosine_similarities_tab = adjust_dispersion(type_cosine_similarities_tab)

source_cosine_similarities_tab = recommendation_source_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'source'), anime_list)
source_cosine_similarities_tab = adjust_dispersion(source_cosine_similarities_tab)

date_cosine_similarities_tab = recommendation_date_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'date'), anime_list)
date_cosine_similarities_tab = adjust_dispersion(date_cosine_similarities_tab)

combined_tab = pd.merge(genre_cosine_similarities_tab, duration_cosine_similarities_tab, on='anime_id', suffixes=('_genre', '_duration'))
combined_tab = pd.merge(combined_tab, synopsis_cosine_similarities_tab, on='anime_id', suffixes=('_', '_synopsis'))
combined_tab = pd.merge(combined_tab, type_cosine_similarities_tab, on='anime_id', suffixes=('', '_type'))
combined_tab = pd.merge(combined_tab, source_cosine_similarities_tab, on='anime_id', suffixes=('', '_source'))
combined_tab = pd.merge(combined_tab, rating_cosine_similarities_tab, on='anime_id', suffixes=('', '_rating'))
combined_tab = pd.merge(combined_tab, date_cosine_similarities_tab, on='anime_id', suffixes=('', '_date'))

#print(combined_tab)

# Calculate total similarity
combined_tab['total_similarity'] = (
    0.15 * combined_tab['similarity_genre'] +
    0.01 * combined_tab['similarity_duration'] +
    0.80 * combined_tab['similarity'] + #synopsis
    0.01 * combined_tab['similarity_type'] +
    0.01 * combined_tab['similarity_source'] + 
    0.01 * combined_tab['similarity_rating'] +
    0.01 * combined_tab['similarity_date']
)

recommended_animes = recommend_anime_global(combined_tab)
print(recommended_animes[['anime_id', 'Name']])
print(recommended_animes['anime_id'].tolist())


    anime_id                                               Name
0      25777                        Shingeki no Kyojin Season 2
1      50410                                One Piece Film: Red
2      51019           Kimetsu no Yaiba: Katanakaji no Sato-hen
3      36215  One Piece: Episode of East Blue - Luffy to 4-n...
4      12859                                  One Piece Film: Z
5      38234                       One Piece Movie 14: Stampede
6        464  One Piece Movie 06: Omatsuri Danshaku to Himit...
7        121                                Fullmetal Alchemist
8      35760                        Shingeki no Kyojin Season 3
9      38524                 Shingeki no Kyojin Season 3 Part 2
10     36702  Shingeki no Kyojin Season 2 Movie: Kakusei no ...
11      1237  One Piece: Oounabara ni Hirake! Dekkai Dekkai ...
12      5252                      One Piece: Romance Dawn Story
13      4155                       One Piece Film: Strong World
14     42625                        Heio

In [19]:
def final_recommandation(fav_anime_list):
    genre_cosine_similarities_tab = recommendation_genre_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'genre'), anime_list)
    genre_cosine_similarities_tab = adjust_dispersion(genre_cosine_similarities_tab)

    duration_cosine_similarities_tab = recommendation_duration_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'duration'), anime_list)
    duration_cosine_similarities_tab = adjust_dispersion(duration_cosine_similarities_tab)

    synopsis_cosine_similarities_tab = recommendation_synopsis_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'synopsis'), anime_list)
    synopsis_cosine_similarities_tab = adjust_dispersion(synopsis_cosine_similarities_tab)

    rating_cosine_similarities_tab = recommendation_source_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'rating'), anime_list)
    rating_cosine_similarities_tab = adjust_dispersion(rating_cosine_similarities_tab)

    type_cosine_similarities_tab = recommendation_type_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'type'), anime_list)
    type_cosine_similarities_tab = adjust_dispersion(type_cosine_similarities_tab)

    source_cosine_similarities_tab = recommendation_source_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'source'), anime_list)
    source_cosine_similarities_tab = adjust_dispersion(source_cosine_similarities_tab)

    date_cosine_similarities_tab = recommendation_date_based(preprocess_fav_anime_list(fav_anime_list, anime_list, 'date'), anime_list)
    date_cosine_similarities_tab = adjust_dispersion(date_cosine_similarities_tab)

    combined_tab = pd.merge(genre_cosine_similarities_tab, duration_cosine_similarities_tab, on='anime_id', suffixes=('_genre', '_duration'))
    combined_tab = pd.merge(combined_tab, synopsis_cosine_similarities_tab, on='anime_id', suffixes=('_', '_synopsis'))
    combined_tab = pd.merge(combined_tab, type_cosine_similarities_tab, on='anime_id', suffixes=('', '_type'))
    combined_tab = pd.merge(combined_tab, source_cosine_similarities_tab, on='anime_id', suffixes=('', '_source'))
    combined_tab = pd.merge(combined_tab, rating_cosine_similarities_tab, on='anime_id', suffixes=('', '_rating'))
    combined_tab = pd.merge(combined_tab, date_cosine_similarities_tab, on='anime_id', suffixes=('', '_date'))

    # Calculate total similarity
    combined_tab['total_similarity'] = (
        0.15 * combined_tab['similarity_genre'] +
        0.01 * combined_tab['similarity_duration'] +
        0.80 * combined_tab['similarity'] + #synopsis
        0.01 * combined_tab['similarity_type'] +
        0.01 * combined_tab['similarity_source'] + 
        0.01 * combined_tab['similarity_rating'] +
        0.01 * combined_tab['similarity_date']
    )

    print(combined_tab)

    combined_tab = combined_tab[['anime_id', 'total_similarity']]
    combined_tab.rename(columns={'total_similarity': 'similarity'}, inplace=True)

    print(combined_tab)

    recommended_animes = recommend_anime(combined_tab)
    return recommended_animes['anime_id'].tolist()
    

In [20]:
fav_anime_list = [21, 38000, 16498]
print(final_recommandation(fav_anime_list))


       anime_id  similarity_genre  similarity_duration  similarity  \
0             1          0.500000             0.614902    0.255973   
1             5          0.431928             0.423051    0.256803   
2             6          0.465964             0.614902    0.258477   
3             7          0.465964             0.605310    0.253245   
4             8          0.431928             0.500209    0.256221   
...         ...               ...                  ...         ...   
24895     55731          0.397891             0.500000    0.251874   
24896     55732          0.534036             0.500000    0.251874   
24897     55733          0.534036             0.500000    0.251874   
24898     55734          0.329819             0.376340    0.251874   
24899     55735          0.329819             0.376340    0.251874   

       similarity_type  similarity_source  similarity_rating  similarity_date  \
0                 0.75               0.25               0.25         0.641713 