In [1]:
import numpy as np
import pandas as pd
import math
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def preprocess(anime_list): #TODO : Gérer les cas limites 
    anime_list = anime_list.copy()

    ## Dropping columns
    columns_to_keep = ['anime_id', 'Name', 'Genres', 'Synopsis', 'Episodes', 'Aired', 'Studios', 'Duration', 'Rating']
    anime_list = anime_list[columns_to_keep]

    ## Dealing with Genres : use one-hot encoding
    all_genres = set()
    for genres in anime_list['Genres']:
        all_genres.update(genres.split(', '))
    for genre in all_genres:
        anime_list[genre] = anime_list['Genres'].apply(lambda x: 1 if genre in x.split(', ') else 0)
    anime_list.drop(columns=['Genres'], inplace=True)

    ## Dealing with Episodes and Duration : calculate total length
    anime_list['Episodes'] = pd.to_numeric(anime_list['Episodes'], errors='coerce').fillna(0) #case to deal with
    hours = anime_list['Duration'].str.extract(r'(\d+) hr', expand=False).astype(float)
    minutes = anime_list['Duration'].str.extract(r'(\d+) min', expand=False).astype(float)
    hours.fillna(0, inplace=True)
    minutes.fillna(0, inplace=True)
    anime_list['Duration'] = hours * 60 + minutes
    anime_list['Total_Duration'] = anime_list['Duration'] * anime_list['Episodes']
    anime_list.drop(columns=['Episodes'], inplace=True)
    anime_list.drop(columns=['Duration'], inplace=True)

    ## Dealing with Aired => get starting date
    anime_list['Start_Date'] = pd.to_datetime(anime_list['Aired'].str.split(' to ').str[0], errors='coerce')
    anime_list.drop(columns=['Aired'], inplace=True)

    ## Dealing with Studios => keep the first (for now)
    anime_list['Studios'] = anime_list['Studios'].str.split(',', expand=True)[0]
    anime_list['Studios'] = anime_list['Studios'].str.strip()

    ## Dealing with Rating => simplify
    anime_list.loc[anime_list['Rating'].str.contains('G'), 'Rating'] = 'All Ages'
    anime_list.loc[anime_list['Rating'].str.contains('PG-13'), 'Rating'] = 'Teen'
    anime_list.loc[anime_list['Rating'].str.contains('PG'), 'Rating'] = 'Children'
    anime_list.loc[anime_list['Rating'].str.contains('Rx'), 'Rating'] = 'Hentai'
    anime_list.loc[anime_list['Rating'].str.contains(r'R\+'), 'Rating'] = 'Adult'
    anime_list.loc[anime_list['Rating'].str.contains('R'), 'Rating'] = 'Young Adult'
    anime_list.loc[~anime_list['Rating'].isin(['All Ages', 'Children', 'Teen', 'Young Adult', 'Hentai', 'Adult']), 'Rating'] = 'Other'

    ## Dealing with synopsis
    anime_list['Synopsis'] = anime_list['Synopsis'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)).lower())
    stop_words = set(stopwords.words('english'))
    anime_list['Synopsis'] = anime_list['Synopsis'].apply(lambda x  : ' '.join([word for word in x.split() if word not in stop_words]))
    lemmatizer = WordNetLemmatizer()
    anime_list['Synopsis'] = anime_list['Synopsis'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
    #anime_list['Synopsis'] = anime_list['Synopsis'].apply(lambda x: ' '.join([word for word, pos in pos_tag(word_tokenize(x)) if pos.startswith(('JJ', 'NN', 'VB', 'RB'))]))

    return anime_list

In [3]:
def show(anime_list):
    nbl, nbc = anime_list.shape
    print("\nNombre de lignes :", nbl)
    print("\nNombre de colonnes :", nbc)
    print("\nInfos\n")
    print(anime_list.info())
    print("\nDescribe\n")
    print(anime_list.describe())
    print("\nHead\n")
    print(anime_list.head(40))

In [4]:
fav_anime_list = [21]
anime_list = pd.read_parquet('anime/anime.parquet')
anime_list = preprocess(anime_list)
show(anime_list)


Nombre de lignes : 24903

Nombre de colonnes : 29

Infos

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24903 entries, 0 to 24902
Data columns (total 29 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   anime_id        24903 non-null  int64         
 1   Name            24903 non-null  object        
 2   Synopsis        24903 non-null  object        
 3   Studios         24903 non-null  object        
 4   Rating          24903 non-null  object        
 5   Avant Garde     24903 non-null  int64         
 6   Boys Love       24903 non-null  int64         
 7   UNKNOWN         24903 non-null  int64         
 8   Drama           24903 non-null  int64         
 9   Erotica         24903 non-null  int64         
 10  Mystery         24903 non-null  int64         
 11  Sci-Fi          24903 non-null  int64         
 12  Supernatural    24903 non-null  int64         
 13  Ecchi           24903 non-null  int64         


In [9]:
def recommend_anime(similarities_tab):
    sorted_df = similarities_tab.sort_values(by='similarity', ascending=False)
    #sorted_df = similarities_tab.sort_values(by='total_similarity', ascending=False)
    top_anime_ids = sorted_df.head(30)['anime_id'].tolist()
    recommended_animes = []
    for anime_id in top_anime_ids:
        anime_name = anime_list.loc[anime_list['anime_id'] == anime_id, 'Name'].iloc[0]
        recommended_animes.append({'anime_id': anime_id, 'Name': anime_name})
    return pd.DataFrame(recommended_animes)

In [6]:
### Synopsis

def extract_keywords(anime_ids, anime_list):
    # récupérer tous les synopsis des animes favoris
    fav_anime_synopsis = anime_list.loc[anime_list['anime_id'].isin(anime_ids), 'Synopsis'].tolist()
    # concaténer l'ensemble de ces synopsis
    fav_anime_synopsis = ' '.join(fav_anime_synopsis)
    # récupérer les mots clés
    fav_anime_keywords = fav_anime_synopsis.split()
    fav_anime_keywords = [word.translate(str.maketrans('', '', string.punctuation)).lower() for word in fav_anime_keywords]
    stop_words = set(stopwords.words('english'))
    fav_anime_keywords = [word for word in fav_anime_keywords if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    fav_anime_keywords = [lemmatizer.lemmatize(word) for word in fav_anime_keywords]
    ## add here more advanced analysis ?
    return ' '.join(fav_anime_keywords)


def recommendation_synopsis_based(fav_anime_list, anime_list):
    # Extraction de mots-clés des synopsis des animes favoris
    fav_anime_keywords = extract_keywords(fav_anime_list, anime_list)

    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values

    # Calcul de la similarité cosinus entre les mots-clés générés des animes favoris et les synopsis de tous les autres animes
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix_other_anime = tfidf_vectorizer.fit_transform(anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'Synopsis'])
    tfidf_matrix_fav_anime = tfidf_vectorizer.transform([fav_anime_keywords]) 
    cosine_similarities = cosine_similarity(tfidf_matrix_other_anime, tfidf_matrix_fav_anime)

    return pd.DataFrame({'anime_id': anime_ids, 'similarity': cosine_similarities.flatten()})


synopsis_cosine_similarities_tab = recommendation_synopsis_based(fav_anime_list, anime_list)
recommended_animes = recommend_anime(synopsis_cosine_similarities_tab)
print(recommended_animes[['anime_id', 'Name']])

    anime_id                                               Name
0      12859                                  One Piece Film: Z
1      36215  One Piece: Episode of East Blue - Luffy to 4-n...
2      38234                       One Piece Movie 14: Stampede
3       5252                      One Piece: Romance Dawn Story
4       8740             One Piece Film: Strong World Episode 0
5       1237  One Piece: Oounabara ni Hirake! Dekkai Dekkai ...
6      19505                                       Kaizoku Ouji
7       1638                                Peter Pan no Bouken
8       4155                       One Piece Film: Strong World
9        464  One Piece Movie 06: Omatsuri Danshaku to Himit...
10     50385                           One Piece Characters Log
11       459                                 One Piece Movie 01
12     50410                                One Piece Film: Red
13      1238              One Piece: Mamore! Saigo no Dai Butai
14     14817              Mouretsu Pirat

In [7]:
### Genre

def recommendation_genre_based(fav_anime_list, anime_list):
    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values
    similarities = []

    fav_genres = anime_list.loc[anime_list['anime_id'].isin(fav_anime_list), ~anime_list.columns.isin(['anime_id', 'Name', 'Total_Duration', 'Start_Date', 'Studios', 'Rating', 'Synopsis'])].sum()
    fav_genres_prop = fav_genres / fav_genres.sum()

    other_anime_genres = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), ~anime_list.columns.isin(['Name', 'Total_Duration', 'Start_Date', 'Studios', 'Rating', 'Synopsis'])]
    for _, row in other_anime_genres.drop(columns=['anime_id']).iterrows():
        genre_similarity = sum(row[genre] * fav_genres_prop[genre] for genre in fav_genres_prop.index)
        similarities.append(genre_similarity)
       
    return pd.DataFrame({'anime_id': anime_ids, 'similarity': similarities})

genre_cosine_similarities_tab = recommendation_genre_based(fav_anime_list, anime_list)
recommended_animes = recommend_anime(genre_cosine_similarities_tab)
print(recommended_animes[['anime_id', 'Name']])

    anime_id                                               Name
0      27825                         Long Zhi Gu: Poxiao Qibing
1      38198          Nanatsu no Taizai: Eiyuu-tachi wa Hashagu
2      31821                    Arslan Senki (TV): Fuujin Ranbu
3      19951          Hunter x Hunter Movie 2: The Last Mission
4      51162  One Piece: Otoshidama Special - Tokubetsu Hou ...
5      40734                             Yao Shen Ji 4th Season
6      52368                                      AOTU Shijie 4
7       6633               Queen's Blade: Gyokuza wo Tsugu Mono
8      37254         Last Period: Owarinaki Rasen no Monogatari
9      43523                  Tsuki ga Michibiku Isekai Douchuu
10     37262                            Ta ga Tame no Alchemist
11     36345                                      AOTU Shijie 2
12      9521                                   Tie Shan Gongzhu
13      2249                                          Amon Saga
14     38297                            

In [8]:
### Duration

def recommendation_duration_based(fav_anime_list, anime_list):
    anime_ids = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'anime_id'].values
    similarities = []

    avg_fav_duration = anime_list.loc[anime_list['anime_id'].isin(fav_anime_list), 'Total_Duration'].mean()
    print(avg_fav_duration)

    other_anime_durations = anime_list.loc[~anime_list['anime_id'].isin(fav_anime_list), 'Total_Duration']

    for duration in other_anime_durations:
        relative_difference = abs(duration - avg_fav_duration) / max(duration, avg_fav_duration)
        duration_similarity = 1 - relative_difference
        similarities.append(duration_similarity)

    return pd.DataFrame({'anime_id': anime_ids, 'similarity': similarities})


duration_cosine_similarities_tab = recommendation_duration_based(fav_anime_list, anime_list)
recommended_animes = recommend_anime(duration_cosine_similarities_tab)
print(recommended_animes[['anime_id', 'Name']])

0.0
    anime_id                                               Name
0          1                                       Cowboy Bebop
1      40665   Shoujo☆Kageki Revue Starlight: Rondo Rondo Rondo
2      40661                 Jashin-chan Dropkick': Chitose-hen
3      40660                                              Start
4      40659  Senki Zesshou Symphogear XV: Senki Zesshou Shi...
5      40658                                          ** Kouhai
6      40656        Girls & Panzer: Saishuushou Part 2 Specials
7      40655                                    Melon no Kirime
8      40651                          Kandagawa Jet Girls Recap
9      40646                             Yes ka No ka Hanbun ka
10     40645                                     Kaigo to Ikiru
11     40644                              Miki no Mikoto (2020)
12     40642                                    Yakusoku (2011)
13     40641                             Little Wonders: Sneeze
14     40639           Tonari no Ie 

  relative_difference = abs(duration - avg_fav_duration) / max(duration, avg_fav_duration)


In [10]:
## Combination

print(genre_cosine_similarities_tab)
print(duration_cosine_similarities_tab)
print(synopsis_cosine_similarities_tab)

combined_tab = pd.merge(genre_cosine_similarities_tab, duration_cosine_similarities_tab, on='anime_id', suffixes=('_genre', '_duration'))
combined_tab = pd.merge(combined_tab, synopsis_cosine_similarities_tab, on='anime_id', suffixes=('', '_synopsis'))
combined_tab['total_similarity'] = 0.2*combined_tab['similarity_genre'] + 0.2*combined_tab['similarity_duration'] + 0.6*combined_tab['similarity']

print(combined_tab)

recommended_animes = recommend_anime(combined_tab)
print(recommended_animes[['anime_id', 'Name']])

       anime_id  similarity
0             1    0.333333
1             5    0.333333
2             6    0.666667
3             7    0.333333
4             8    0.666667
...         ...         ...
24897     55731    0.333333
24898     55732    1.000000
24899     55733    1.000000
24900     55734    0.000000
24901     55735    0.000000

[24902 rows x 2 columns]
       anime_id  similarity
0             1         0.0
1             5         0.0
2             6         0.0
3             7         0.0
4             8         0.0
...         ...         ...
24897     55731         NaN
24898     55732         NaN
24899     55733         NaN
24900     55734         0.0
24901     55735         0.0

[24902 rows x 2 columns]
       anime_id  similarity
0             1    0.013469
1             5    0.026405
2             6    0.027187
3             7    0.005047
4             8    0.031917
...         ...         ...
24897     55731    0.000000
24898     55732    0.000000
24899     55733    0.000