In [2]:
import pandas as pd
import re
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)

In [3]:
df = pd.read_csv('anime_data.csv')

In [4]:
df.shape

(8807, 29)

In [5]:
df.columns

Index(['mal_id', 'url', 'trailer', 'title', 'title_english', 'type', 'source',
       'episodes', 'status', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'synopsis',
       'background', 'season', 'year', 'producers', 'licensors', 'studios',
       'genres', 'themes', 'demographics', 'image'],
      dtype='object')

In [6]:
df.isna().sum()

mal_id              0
url                 0
trailer          4916
title               0
title_english    3039
type                0
source              0
episodes           41
status              0
aired               0
duration            0
rating              0
score               0
scored_by           0
rank             1049
popularity          0
members             0
favorites           0
synopsis            1
background       7062
season           5080
year             5080
producers           0
licensors           0
studios             0
genres              0
themes              0
demographics        0
image               0
dtype: int64

In [7]:
df.duplicated().sum()

21

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.dropna(subset=['synopsis'],inplace=True)

In [10]:
df.shape

(8785, 29)

In [11]:
df = df.reset_index(drop=True)

In [12]:
data = df[['mal_id', 'url', 'trailer', 'title',
       'title_english', 'type', 'source',
       'episodes', 'status', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'synopsis',
       'background', 'season', 'year', 'producers', 'licensors',
       'studios', 'genres', 'themes', 'demographics',
       'image']] 

In [13]:
data = data[~data['studios'].apply(lambda x: x == [])]
data = data[~data['genres'].apply(lambda x: x == [])]

In [14]:
data.shape

(8785, 29)

SYNOPSIS DATA CLEANING

In [15]:
import re

pattern = r"\[Written by MAL Rewrite\]|\(.*Source:.*\)" 

# Removing the pattern using regular expressions
data['synopsis'] = data['synopsis'].str.replace(pattern, '', regex=True).values



In [16]:
def remove_newline_numbers(text):
    text = text.replace('\n', '')
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower()

In [17]:
data['synopsis_cleaned'] = data.synopsis.apply(remove_newline_numbers)

In [18]:
import spacy
nlp = spacy.load('en_core_web_sm')
data['synopsis_cleaned'] = data['synopsis_cleaned'].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if not token.is_stop]))

In [19]:
rating_map = {
    "PG-13 - Teens 13 or older": "PG-13",
    "R - 17+ (violence & profanity)": "R17",
    "Rx - Hentai": "Rx",
    "R+ - Mild Nudity": "R+",
    "G - All Ages": "G",
    "PG - Children": "PG"
}

# Use the map to replace the values in the 'rating' column
data['rating'] = data['rating'].replace(rating_map)

In [20]:
data.shape

(8785, 30)

In [21]:
data['themes'] = data['themes'].apply(lambda x:"unknown_theme" if x == [] else x )
data['demographics'] = data['demographics'].apply(lambda x:"unknown_demographics" if x == [] else x )

In [22]:
data.year = data.aired.str.split(',').str[1].str[1:5]

In [23]:
def get_season(x):
    spring = ["Mar","Apr","May"]
    summer = ["Jun","Jul","Aug"]
    fall = ["Sep","Oct","Nov"]
    winter = ["Dec","Jan","Feb"]
    y = x[:3]
    if y in spring:
        return "spring"
    elif y in winter:
        return "winter"
    elif y in fall:
        return "fall"
    elif y in summer:
        return "summer"

    

In [24]:
data.season = data.aired.apply(get_season)

In [25]:
def fill_na(row):
    if pd.isna(row['year']):
        if len(row['aired']) == 4:
            return row['aired']
        elif len(row['aired']) == 12:
            return row['aired'][:4]
        else:
            return row['aired'][4:8]
    else:
        return row['year']

# Apply the function to each row of the DataFrame
data['year'] = data.apply(fill_na, axis=1)


In [26]:
data.season = data.season.fillna("unknownseason")

In [27]:
data.isna().sum()

mal_id                 0
url                    0
trailer             4907
title                  0
title_english       3028
type                   0
source                 0
episodes              40
status                 0
aired                  0
duration               0
rating                 0
score                  0
scored_by              0
rank                1046
popularity             0
members                0
favorites              0
synopsis               0
background          7041
season                 0
year                   0
producers              0
licensors              0
studios                0
genres                 0
themes                 0
demographics           0
image                  0
synopsis_cleaned       0
dtype: int64

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
vectorizer = TfidfVectorizer(max_features=8000)  # Adjust max_features as needed
overview_matrix = vectorizer.fit_transform(data['synopsis_cleaned'])

In [30]:
overview_matrix = overview_matrix.toarray()
overview_df = pd.DataFrame(overview_matrix)

In [31]:
status_df = data.status.str.get_dummies()
season_df = data.season.str.get_dummies()
type_df = data.type.str.get_dummies()
source_df = data.source.str.get_dummies()
rating_df = data.rating.str.get_dummies()
genres_df = data.genres.str.get_dummies(sep=',')
studios_df = data.studios.str.get_dummies(sep=',')
licensors_df = data.licensors.str.get_dummies(sep=',')
producers_df = data.producers.str.get_dummies(sep=',')
themes_df = data.themes.str.get_dummies(sep=',')
demographics_df = data.demographics.str.get_dummies(sep=',')

In [32]:
combined_features = pd.concat([genres_df,type_df,source_df,rating_df,studios_df,themes_df,demographics_df,overview_df],axis=1)

In [33]:
combined_features.shape

(8785, 9373)

In [34]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(combined_features)

In [35]:
def recommend(anime):
    index = data[(data['title'] == anime) | (data['title_english'] == anime)].index[0]
    distances = sorted(list(enumerate(similarity_matrix[index])),reverse=True,key= lambda x:x[1])
    for i in distances[1:20]:
        print(data.iloc[i[0]].title,"---",i[1])

In [36]:
recommend("Chainsaw Man")

Katsute Kami Datta Kemono-tachi e --- 0.7433626482924653
Jujutsu Kaisen --- 0.7402124595216275
Jujutsu Kaisen 2nd Season --- 0.7395378101646203
Peach Boy Riverside --- 0.7390245134137886
Jigokuraku --- 0.6763565187146278
Ragna Crimson --- 0.6709178443680544
Kimetsu no Yaiba: Katanakaji no Sato-hen --- 0.6693141111743878
Jujutsu Kaisen 0 Movie --- 0.669256434606162
Kimetsu no Yaiba: Mugen Ressha-hen --- 0.6679876807539057
Kimetsu no Yaiba: Yuukaku-hen --- 0.6679215999711178
Jujutsu Kaisen 2nd Season Recaps --- 0.6666666666666665
Kekkai Sensen --- 0.6365685809432907
Kimetsu no Yaiba --- 0.6364184784607624
Undead Unluck --- 0.6359236082038375
Bleach: Sennen Kessen-hen --- 0.6356326813341365
Bleach: Sennen Kessen-hen - Ketsubetsu-tan --- 0.6350733226490073
Kekkai Sensen & Beyond --- 0.6347372661266141
Akame ga Kill! --- 0.6343973260472728
Claymore --- 0.6335122121649444


In [39]:
import pickle
pickle.dump(data,open('anime.pkl','wb'))
pickle.dump(similarity_matrix,open('similarity.pkl','wb'))