In [1]:
import pandas as pd
import re
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)

In [2]:
df = pd.read_csv('anime_data.csv')

In [3]:
df.shape

(8807, 29)

In [4]:
df.columns

Index(['mal_id', 'url', 'trailer', 'title', 'title_english', 'type', 'source',
       'episodes', 'status', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'synopsis',
       'background', 'season', 'year', 'producers', 'licensors', 'studios',
       'genres', 'themes', 'demographics', 'image'],
      dtype='object')

In [5]:
df.isna().sum()

mal_id              0
url                 0
trailer          4916
title               0
title_english    3039
type                0
source              0
episodes           41
status              0
aired               0
duration            0
rating              0
score               0
scored_by           0
rank             1049
popularity          0
members             0
favorites           0
synopsis            1
background       7062
season           5080
year             5080
producers           0
licensors           0
studios             0
genres              0
themes              0
demographics        0
image               0
dtype: int64

In [6]:
df.duplicated().sum()

21

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
df.dropna(subset=['synopsis'],inplace=True)

In [9]:
df.shape

(8785, 29)

In [10]:
df = df.reset_index(drop=True)

In [11]:
data = df[['mal_id', 'url', 'trailer', 'title',
       'title_english', 'type', 'source',
       'episodes', 'status', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'synopsis',
       'background', 'season', 'year', 'producers', 'licensors',
       'studios', 'genres', 'themes', 'demographics',
       'image']] 

In [12]:
data = data[~data['studios'].apply(lambda x: x == [])]
data = data[~data['genres'].apply(lambda x: x == [])]

In [13]:
data.shape

(8785, 29)

SYNOPSIS DATA CLEANING

In [14]:
data[~((data.type=="TV")|(data.type=="Movie")|(data.type=="OVA")|(data.type=="OVA"))].sort_values(by = 'popularity')

Unnamed: 0,mal_id,url,trailer,title,title_english,type,source,episodes,status,aired,duration,rating,score,scored_by,rank,popularity,members,favorites,synopsis,background,season,year,producers,licensors,studios,genres,themes,demographics,image
6371,35120,https://myanimelist.net/anime/35120/Devilman__...,https://www.youtube.com/embed/ww06yGPM7Kc?enab...,Devilman: Crybaby,Devilman: Crybaby,ONA,Manga,10.0,Finished Airing,"Jan 5, 2018",25 min per ep,R+ - Mild Nudity,7.75,697123.0,1115.0,125,1149732,24154,Devils cannot take form without a living host....,Devilman: Crybaby adapts the entire original m...,,,"['Aniplex', 'Dynamic Planning', 'Netflix']",[],['Science SARU'],"['Action', 'Avant Garde', 'Horror', 'Supernatu...","['Gore', 'Mythology']",['Shounen'],https://cdn.myanimelist.net/images/anime/2/899...
7728,42310,https://myanimelist.net/anime/42310/Cyberpunk_...,https://www.youtube.com/embed/JtqIas3bYhg?enab...,Cyberpunk: Edgerunners,,ONA,Game,10.0,Finished Airing,"Sep 13, 2022",25 min per ep,R+ - Mild Nudity,8.60,507018.0,87.0,263,745211,24975,"Dreams are doomed to die in Night City, a futu...",Cyberpunk: Edgerunners is based on the Cyberpu...,,,['CD Projekt Red'],[],['Trigger'],"['Action', 'Sci-Fi']","['Gore', 'Organized Crime']",[],https://cdn.myanimelist.net/images/anime/1818/...
8397,51535,https://myanimelist.net/anime/51535/Shingeki_n...,https://www.youtube.com/embed/E7WytLM2KvY?enab...,Shingeki no Kyojin: The Final Season - Kankets...,Attack on Titan: Final Season - The Final Chap...,TV Special,Manga,2.0,Finished Airing,"Mar 4, 2023 to Nov 5, 2023",1 hr 12 min per ep,R - 17+ (violence & profanity),8.90,370952.0,22.0,327,642912,15295,In the wake of Eren Yeager's cataclysmic actio...,Shingeki no Kyojin: The Final Season - Kankets...,,,"['Production I.G', 'Dentsu', 'Mainichi Broadca...",[],['MAPPA'],"['Action', 'Drama', 'Suspense']","['Gore', 'Military', 'Survival']",['Shounen'],https://cdn.myanimelist.net/images/anime/1279/...
4415,15689,https://myanimelist.net/anime/15689/Nekomonoga...,https://www.youtube.com/embed/bHef90RByXI?enab...,Nekomonogatari: Kuro,Nekomonogatari Black,TV Special,Light novel,4.0,Finished Airing,"Dec 31, 2012",27 min per ep,R - 17+ (violence & profanity),7.92,338740.0,759.0,398,554179,1241,"After surviving a vampire attack, Koyomi Arara...",Nekomonogatari: Kuro adapts the sixth and fina...,,,"['Aniplex', 'Kodansha']",['Aniplex of America'],['Shaft'],"['Comedy', 'Romance', 'Supernatural', 'Ecchi']",[],[],https://cdn.myanimelist.net/images/anime/1170/...
8009,48661,https://myanimelist.net/anime/48661/JoJo_no_Ki...,https://www.youtube.com/embed/LdSVWTEibF0?enab...,JoJo no Kimyou na Bouken Part 6: Stone Ocean,JoJo's Bizarre Adventure: Stone Ocean,ONA,Manga,12.0,Finished Airing,"Dec 1, 2021",24 min per ep,R - 17+ (violence & profanity),8.13,280818.0,470.0,444,503242,6974,Conspiring forces frame Jolyne Kuujou for a re...,JoJo no Kimyou na Bouken Part 6: Stone Ocean a...,,,['Warner Bros. Japan'],['VIZ Media'],['David Production'],"['Action', 'Adventure', 'Supernatural']",[],['Shounen'],https://cdn.myanimelist.net/images/anime/1896/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3636,9568,https://myanimelist.net/anime/9568/Peace_Maker...,,Peace Maker Kurogane Special,Peace Maker Kurogane: Festival of Gion,Special,Manga,1.0,Finished Airing,"Dec 21, 2003",17 min,PG - Children,6.59,789.0,6700.0,9984,3015,4,Peace Maker Kurogane special.,,,,[],[],['Gonzo'],"['Action', 'Comedy', 'Drama']","['Historical', 'Samurai']",['Shounen'],https://cdn.myanimelist.net/images/anime/7/264...
8474,52420,https://myanimelist.net/anime/52420/Blue_Archi...,,Blue Archive: 1.5-shuunen Kinen Short Animation,Blue Archive: New Summer Animation PV,PV,Game,1.0,Finished Airing,"Jul 16, 2022",9 min,PG-13 - Teens 13 or older,6.92,1352.0,,9989,3011,7,This is a short animation commemorating the 1....,,,,[],[],['Yostar Pictures'],['Fantasy'],[],[],https://cdn.myanimelist.net/images/anime/1818/...
6896,37659,https://myanimelist.net/anime/37659/Taimanin_Y...,,Taimanin Yukikaze Special,,Special,Visual novel,1.0,Finished Airing,"Jan 29, 2016",5 min,Rx - Hentai,5.93,966.0,,9994,3009,11,Side story of Taimanin Yukikaze.,,,,['ZIZ Entertainment (ZIZ)'],[],['Magic Bus'],"['Action', 'Supernatural', 'Hentai']","['Martial Arts', 'Mythology']",[],https://cdn.myanimelist.net/images/anime/1159/...
2368,3248,https://myanimelist.net/anime/3248/Tenpou_Ibun...,,Tenpou Ibun: Ayakashi Ayashi - Ayashi Shinkyoku,Ghost Slayers Ayashi: Inferno,Special,Original,5.0,Finished Airing,"Aug 22, 2007 to Oct 24, 2007",24 min per ep,PG-13 - Teens 13 or older,6.77,1029.0,5610.0,9994,3005,1,Specials set six months after the end of Tenpo...,,,,[],[],['Bones'],"['Action', 'Drama', 'Supernatural']",['Historical'],[],https://cdn.myanimelist.net/images/anime/9/509...


In [15]:
import re

pattern = r"\[Written by MAL Rewrite\]|\(.*Source:.*\)" 

# Removing the pattern using regular expressions
data['synopsis'] = data['synopsis'].str.replace(pattern, '', regex=True).values



In [16]:
def remove_newline_numbers(text):
    text = text.replace('\n', '')
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower()

In [17]:
data['synopsis_cleaned'] = data.synopsis.apply(remove_newline_numbers)

In [18]:
import spacy
nlp = spacy.load('en_core_web_sm')
data['synopsis_cleaned'] = data['synopsis_cleaned'].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if not token.is_stop]))

In [19]:
rating_map = {
    "PG-13 - Teens 13 or older": "PG-13",
    "R - 17+ (violence & profanity)": "R17",
    "Rx - Hentai": "Rx",
    "R+ - Mild Nudity": "R+",
    "G - All Ages": "G",
    "PG - Children": "PG"
}

# Use the map to replace the values in the 'rating' column
data['rating'] = data['rating'].replace(rating_map)

In [20]:
data.shape

(8785, 30)

In [21]:
data['themes'] = data['themes'].apply(lambda x:"unknown_theme" if x == [] else x )
data['demographics'] = data['demographics'].apply(lambda x:"unknown_demographics" if x == [] else x )

In [22]:
data.year = data.aired.str.split(',').str[1].str[1:5]

In [23]:
def get_season(x):
    spring = ["Mar","Apr","May"]
    summer = ["Jun","Jul","Aug"]
    fall = ["Sep","Oct","Nov"]
    winter = ["Dec","Jan","Feb"]
    y = x[:3]
    if y in spring:
        return "spring"
    elif y in winter:
        return "winter"
    elif y in fall:
        return "fall"
    elif y in summer:
        return "summer"

    

In [24]:
data.season = data.aired.apply(get_season)

In [25]:
def fill_na(row):
    if pd.isna(row['year']):
        if len(row['aired']) == 4:
            return row['aired']
        elif len(row['aired']) == 12:
            return row['aired'][:4]
        else:
            return row['aired'][4:8]
    else:
        return row['year']

# Apply the function to each row of the DataFrame
data['year'] = data.apply(fill_na, axis=1)


In [26]:
data.season = data.season.fillna("unknownseason")

In [27]:
data.isna().sum()

mal_id                 0
url                    0
trailer             4907
title                  0
title_english       3028
type                   0
source                 0
episodes              40
status                 0
aired                  0
duration               0
rating                 0
score                  0
scored_by              0
rank                1046
popularity             0
members                0
favorites              0
synopsis               0
background          7041
season                 0
year                   0
producers              0
licensors              0
studios                0
genres                 0
themes                 0
demographics           0
image                  0
synopsis_cleaned       0
dtype: int64

In [28]:
import ast

def one_hot_encode(df, column):
    
    # Convert string representation of lists into actual lists
    df[column] = df[column].apply(ast.literal_eval)

    # Convert list of genres to string
    df[column] = df[column].apply(lambda x: ','.join(x))

    # Apply one-hot encoding and return the result
    return df[column].str.get_dummies(sep=',')



In [29]:
genres_df = one_hot_encode(data, 'genres')
producers_df = one_hot_encode(data, 'producers')
studios_df = one_hot_encode(data, 'studios')
themes_df = one_hot_encode(data, 'themes')
demographics_df = one_hot_encode(data, 'demographics')
licensors_df = one_hot_encode(data, 'licensors')

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
vectorizer = TfidfVectorizer(max_features=8000)  # Adjust max_features as needed
overview_matrix = vectorizer.fit_transform(data['synopsis_cleaned'])

In [32]:
overview_df = overview_matrix.toarray()
overview_df = pd.DataFrame(overview_df)

In [33]:
status_df = data.status.str.get_dummies()
season_df = data.season.str.get_dummies()
type_df = data.type.str.get_dummies()
source_df = data.source.str.get_dummies()
rating_df = data.rating.str.get_dummies()


In [82]:
combined_features = pd.concat([overview_df,type_df,source_df,genres_df,rating_df,themes_df,demographics_df],axis=1)

In [83]:
combined_features.shape

(8785, 8108)

In [84]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(combined_features)

In [85]:
def recommend(anime):
    index = data[(data['title'] == anime) | (data['title_english'] == anime)].index[0]
    distances = sorted(list(enumerate(similarity_matrix[index])),reverse=True,key= lambda x:x[1])
    for i in distances[1:10]:
        
        print(data.iloc[i[0]].title,"---",i[1])

In [87]:
recommend("Naruto")

Naruto: Shippuuden --- 0.9324073728341525
Boruto: Naruto Next Generations --- 0.9285702614908151
Nanatsu no Taizai: Imashime no Fukkatsu --- 0.8400943348225696
Dragon Quest: Dai no Daibouken (2020) --- 0.8372779450193456
Nanatsu no Taizai: Funnu no Shinpan --- 0.8326332341477566
Magi: Sinbad no Bouken (TV) --- 0.8306382271044592
Nanatsu no Taizai: Kamigami no Gekirin --- 0.8293357432895804
Magi: The Kingdom of Magic --- 0.8288897030707874
Nanatsu no Taizai: Mokushiroku no Yonkishi --- 0.8288316581832843


In [88]:
import pickle
pickle.dump(data,open('anime.pkl','wb'))
pickle.dump(similarity_matrix,open('similarity.pkl','wb'))