In [26]:
import numpy as np
import pandas as pd 
import ast
from matplotlib import pyplot as plt
import seaborn as sns
import torch

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to /Users/jerometam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jerometam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jerometam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df_anime = pd.read_csv('/Users/jerometam/Desktop/DATABASE/my_anime)/animes.csv')
df_profile = pd.read_csv('/Users/jerometam/Desktop/DATABASE/my_anime)/profiles.csv')
df_reviews = pd.read_csv('/Users/jerometam/Desktop/DATABASE/my_anime)/reviews.csv')

In [3]:
#df_anime.head(1)
#df_profile.head(1)
#ddf_reviews.head(1)

# Data Cleaning

Drop useless columns and useless row duplicates

In [4]:
df_anime = df_anime.drop(columns= ['img_url','link'])
df_profile = df_profile.drop(columns='link')
df_reviews = df_reviews.drop(columns='link')

df_anime = df_anime.drop_duplicates(subset='title', keep='first')
df_anime = df_anime.drop_duplicates(subset='synopsis', keep='first')
df_reviews  = df_reviews.drop_duplicates(subset=['profile','anime_uid'], keep='first')
df_profile = df_profile.drop_duplicates(subset='profile', keep='first')

Check null/nan data 

In [5]:

print(f'Anime data : \n {df_anime.isnull().sum()} \n\n Reviews data : \n{df_reviews.isnull().sum()}\n\n  Profile data : \n{df_profile.isnull().sum()} ')


Anime data : 
 uid              0
title            0
synopsis         1
genre            0
aired            0
episodes       385
members          0
popularity       0
ranked        1441
score          256
dtype: int64 

 Reviews data : 
uid          0
profile      0
anime_uid    0
text         0
score        0
scores       0
dtype: int64

  Profile data : 
profile                0
gender             17007
birthday           21037
favorites_anime        0
dtype: int64 


In [6]:
#Check nan data 
print(f'Anime data : \n {df_anime.isna().sum()} \n\n Reviews data : \n{df_reviews.isna().sum()}\n\n  Profile data : \n{df_profile.isna().sum()} ')

Anime data : 
 uid              0
title            0
synopsis         1
genre            0
aired            0
episodes       385
members          0
popularity       0
ranked        1441
score          256
dtype: int64 

 Reviews data : 
uid          0
profile      0
anime_uid    0
text         0
score        0
scores       0
dtype: int64

  Profile data : 
profile                0
gender             17007
birthday           21037
favorites_anime        0
dtype: int64 


Get rid of nan data and only keep columns we want for our model

In [7]:
df_anime = df_anime.dropna(subset=['synopsis'])
df_anime = df_anime.drop(columns=['episodes','ranked'])
df_anime['score'] = df_anime['score'].fillna(0)
df_profile = df_profile.drop(columns=['gender','birthday'])

Merge anime name with reviews and anime name to profile

In [8]:
df_reviews = pd.merge(df_reviews, df_anime.rename(columns={'uid': "anime_uid"})[["anime_uid",'title']], on='anime_uid', how='left')
df_reviews = df_reviews.dropna(subset=['title'])

In [9]:

uid_to_name = pd.Series(df_anime[['uid','title']].title.values, index=df_anime[['uid','title']].uid).to_dict()
def replace_uids_with_names(uid_list):
    return [uid_to_name.get(int(uid), int(uid)) for uid in uid_list]


df_profile['favorites_anime_name'] = df_profile['favorites_anime'].apply(ast.literal_eval).apply(replace_uids_with_names)




In [10]:
#df_anime.head(2)
#df_reviews.head(2)
#df_profile.head(2)

In [11]:
print(f'Anime data : \n {df_anime.isna().sum()} \n\n Reviews data : \n{df_reviews.isna().sum()}\n\n  Profile data : \n{df_profile.isna().sum()} ')

Anime data : 
 uid           0
title         0
synopsis      0
genre         0
aired         0
members       0
popularity    0
score         0
dtype: int64 

 Reviews data : 
uid          0
profile      0
anime_uid    0
text         0
score        0
scores       0
title        0
dtype: int64

  Profile data : 
profile                 0
favorites_anime         0
favorites_anime_name    0
dtype: int64 


In [12]:
df_anime = df_anime.reset_index(drop=True)
df_reviews = df_reviews.reset_index(drop=True)
df_profile = df_profile.reset_index(drop=True)

In [13]:
print(f' Unique in anime : \n {df_anime.nunique()} \n\n Unique in reviews: \n {df_reviews.nunique()}\n\n Unique profile: \n {df_profile["profile"].nunique()}')
print(f'\n\nLen Anime Data {len(df_anime)}, Len Review Data {len(df_reviews)}, Len profile Data {len(df_profile)}')

 Unique in anime : 
 uid           15192
title         15192
synopsis      15192
genre          4769
aired         10625
members        8515
popularity    13507
score           574
dtype: int64 

 Unique in reviews: 
 uid          129970
profile       47754
anime_uid      7873
text         129895
score            12
scores        29745
title          7873
dtype: int64

 Unique profile: 
 47885


Len Anime Data 15192, Len Review Data 129970, Len profile Data 47885


# Model 

Section to prepare the model I will use 'genre', 'synopsis' to create recomendation model

In [14]:
tmp = df_anime[['title','synopsis','genre']]

In [15]:
def clean_genre_string(genre_str):
    cleaned_str = genre_str.strip("[]").replace("'", "")
    return ', '.join(cleaned_str.split(', '))

tmp['genre'] = tmp['genre'].apply(clean_genre_string)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['genre'] = tmp['genre'].apply(clean_genre_string)


In [16]:
tmp['synopsis+genre'] = tmp['synopsis'] + "" + tmp['genre']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['synopsis+genre'] = tmp['synopsis'] + "" + tmp['genre']


In [18]:
def preprocess_synopsis(synopsis):
    
    words = word_tokenize(synopsis.lower())
   
    words = [word for word in words if word.isalnum() and word not in stopwords.words('english')]
   
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [19]:

def get_tfidf_matrix(synopses):
    tfidf = TfidfVectorizer(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 3),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True
    )
    tfidf_matrix = tfidf.fit_transform(synopses)
    return tfidf_matrix, tfidf

In [20]:
tmp['processed_synopsis+genre'] = tmp['synopsis+genre'].apply(preprocess_synopsis)
tmp['processed_synopsis'] = tmp['synopsis'].apply(preprocess_synopsis)
tmp['processed_genre'] = tmp['genre'].apply(preprocess_synopsis)
tfidf_matrix_sg, vectorizer_sg = get_tfidf_matrix(tmp['synopsis+genre'])
tfidf_matrix_s, vectorizer_s = get_tfidf_matrix(tmp['synopsis'])
tfidf_matrix_g, vectorizer_g = get_tfidf_matrix(tmp['genre'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['processed_synopsis+genre'] = tmp['synopsis+genre'].apply(preprocess_synopsis)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['processed_synopsis'] = tmp['synopsis'].apply(preprocess_synopsis)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['processed_genre'] = tmp['genre'].apply(preproc

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(tfidf_matrix, vectorizer, new_synopsis):
    
    processed_synopsis = preprocess_synopsis(new_synopsis)
    
    synopsis_vector = vectorizer.transform([processed_synopsis])
    
    similarity_scores = cosine_similarity(synopsis_vector, tfidf_matrix)
    return similarity_scores

In [22]:
def recommend_movies(df,tfidf_matrix, vectorizer, new_synopsis, top_n=3):
    
    similarity_scores = calculate_similarity(tfidf_matrix, vectorizer, new_synopsis)
    
    top_indices = similarity_scores[0].argsort()[-top_n:][::-1]
    return df.iloc[top_indices][['title', 'synopsis']]

Testing the model with "Fullmetal Alchemist: Brotherhood"

In [23]:
new_synopsis = tmp['synopsis'][3]

recommended_movies = recommend_movies(tmp,tfidf_matrix_s, vectorizer_s, new_synopsis, top_n=10)
print("Recommended Movies based synopsis:")
recommended_movies

Recommended Movies based synopsis:


Unnamed: 0,title,synopsis
3,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth..."
498,Fullmetal Alchemist,"Edward Elric, a young, brilliant alchemist, ha..."
2903,Fullmetal Alchemist: The Sacred Star of Milos ...,To mark the July 2 opening of the Fullmetal Al...
13398,Fullmetal Alchemist: The Sacred Star of Milos,Chasing a runaway alchemist with strange power...
5894,Fullmetal Alchemist: The Conqueror of Shamballa,"In desperation, Edward Elric sacrificed his bo..."
339,Fullmetal Alchemist: Brotherhood Specials,Amazing secrets and startling facts are expose...
3538,Joker Game,"With World War II right around the corner, int..."
8066,Ulysses: Jehanne Darc to Renkin no Kishi,"The story is set in the 15th century, during t..."
4040,Sword Gai,A promotional video for the manga. \r\n \r\nTh...
12942,Dr. Stone: Stone Wars,Second season of Dr. Stone .


In [24]:
new_synopsis = tmp['synopsis'][3]

recommended_movies = recommend_movies(tmp,tfidf_matrix_g, vectorizer_g, new_synopsis, top_n=10)
print("Recommended Movies based on genre:")
recommended_movies

Recommended Movies based on genre:


Unnamed: 0,title,synopsis
7988,Sakana no Kuni,Short movie with Fish.
6714,Osaru no Sankichi: Boukuusen,Monkeys battle polar bears in air combat. Sho...
6409,Tonpei to Sarukichi,A lazy pig Tonpei steals food from the monkeys...
7627,Rekkoku Rikugun,During a night of drinking Maru-san (Mr. Circl...
6896,Osaru no Sankichi: Totsugeki-tai,Short movie from 1934 about a war between monk...
12800,Youjo Senki: Senkyou Houkoku,Recap of the first 6 episodes of Youjo Senki .
7017,Norakuro Shoui: Nichiyoubi no Kaijiken,Norakuro's adventures in the Fierce Dog Brigad...
3917,Busou Chuugakusei: Basket Army,The story is set in August of 2026 at the Fuji...
7590,Momotarou: Umi no Shinpei,"A monkey, a dog, a pheasant, and a bear travel..."
1319,Senjuushi: Kijuushi-tachi no Happy Birthday!,The story will be about the characters celebra...


In [25]:
new_synopsis = tmp['synopsis'][3]

recommended_movies = recommend_movies(tmp,tfidf_matrix_sg, vectorizer_sg, new_synopsis, top_n=10)
print("Recommended Movies based on synopsis+genre:")
recommended_movies

Recommended Movies based on synopsis+genre:


Unnamed: 0,title,synopsis
3,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth..."
498,Fullmetal Alchemist,"Edward Elric, a young, brilliant alchemist, ha..."
2903,Fullmetal Alchemist: The Sacred Star of Milos ...,To mark the July 2 opening of the Fullmetal Al...
13398,Fullmetal Alchemist: The Sacred Star of Milos,Chasing a runaway alchemist with strange power...
5894,Fullmetal Alchemist: The Conqueror of Shamballa,"In desperation, Edward Elric sacrificed his bo..."
339,Fullmetal Alchemist: Brotherhood Specials,Amazing secrets and startling facts are expose...
3538,Joker Game,"With World War II right around the corner, int..."
8066,Ulysses: Jehanne Darc to Renkin no Kishi,"The story is set in the 15th century, during t..."
4732,Code Geass: Boukoku no Akito 2 - Hikisakareshi...,"With her previous triumphs under her belt, Lei..."
4040,Sword Gai,A promotional video for the manga. \r\n \r\nTh...


# Conclusion

Using NLP to create a recommendation system appears to yield better results when incorporating both synopsis and genre, rather than relying solely on either genre or synopsis alone. However, the model could be enhanced further by incorporating user preferences, such as specifying whether they prefer movies or series.

For instance, in the current setup, there's a tendency for the system to prioritize related movies and OVAs (Original Video Animations) over other anime series, as seen in the example of Fullmetal Alchemist: Brotherhood. Implementing a feature that allows users to specify their preference for movies or series could significantly improve the relevance of recommendations.