In [1]:
import pandas as pd 
import numpy as np

In [18]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)

**Reading the data**

In [3]:
df = pd.read_csv('data/jikan_final.csv')

In [4]:
df.shape

(26720, 36)

In [5]:
df.nunique()

mal_id             26564
url                26564
images             26364
trailer             4920
approved               1
titles             26564
title              26563
title_english      10998
title_japanese     25462
title_synonyms     12463
type                   9
source                17
episodes             250
status                 3
airing                 2
aired              16116
duration             333
rating                 6
score                559
scored_by           8712
rank               16055
popularity         20364
members            11508
favorites           1901
synopsis           21510
background          2556
season                 4
year                  65
broadcast            623
producers           4701
licensors            265
studios             1681
genres               962
explicit_genres        1
themes               948
demographics           8
dtype: int64

In [6]:
user = pd.read_csv("data/user.csv")

In [7]:
user.nunique()

User ID        35141
Username       35140
Anime ID       14037
Anime Title    14067
Score             10
dtype: int64

In [8]:
user.shape

(3752106, 5)

In [9]:
df.drop_duplicates(inplace=True)

In [10]:
user.drop_duplicates(inplace=True)

In [11]:
counts1 = user['Anime ID'].value_counts()
filtered_user = user[user["Anime ID"].isin(counts1[counts1>=5].index)]

In [12]:
filtered_user.shape

(3743711, 5)

In [13]:
filtered_user.nunique()

User ID        35139
Username       35138
Anime ID       10347
Anime Title    10378
Score             10
dtype: int64

**Filtering out Anime**

In [14]:
not_yet_aired = df[df.status == "Not yet aired"]

In [None]:
df = df[df.favorites != 0]


In [15]:
df1 = df[df['mal_id'].isin(filtered_user['Anime ID'])]

In [16]:
df1.shape

(10347, 36)

In [17]:
df1.isna().sum()

mal_id                0
url                   0
images                0
trailer               0
approved              0
titles                0
title                 0
title_english      3830
title_japanese       12
title_synonyms        0
type                  0
source                0
episodes             22
status                0
airing                0
aired                 0
duration              0
rating               10
score                 3
scored_by             3
rank               1426
popularity            0
members               0
favorites             0
synopsis             72
background         8527
season             6310
year               6310
broadcast             0
producers             0
licensors             0
studios               0
genres                0
explicit_genres       0
themes                0
demographics          0
dtype: int64

**Dropping Unwanted Columns**

In [19]:
df1.dropna(subset=['synopsis','rating'],inplace=True)

**Extracting only relevant information**

In [20]:
import ast 
df1.producers = df1.producers.apply(ast.literal_eval)
df1.images = df1.images.apply(ast.literal_eval)
df1.trailer = df1.trailer.apply(ast.literal_eval)
df1.titles = df1.titles.apply(ast.literal_eval)
df1.aired = df1.aired.apply(ast.literal_eval)
df1.broadcast = df1.broadcast.apply(ast.literal_eval)
df1.licensors = df1.licensors.apply(ast.literal_eval)
df1.studios = df1.studios.apply(ast.literal_eval)
df1.genres = df1.genres.apply(ast.literal_eval)
df1.themes = df1.themes.apply(ast.literal_eval)
df1.demographics = df1.demographics.apply(ast.literal_eval)


In [21]:
def extract_info(row):
    # Extract producer names
    producer_names = [producer['name'] for producer in row['producers']]
    licensors_names = [licensor['name'] for licensor in row['licensors']]
    studios_names = [studio['name'] for studio in row['studios']]
    genres = [genre['name'] for genre in row['genres']]
    themes = [theme['name'] for theme in row['themes']]
    demographics = [dg['name'] for dg in row['demographics']]
    
    # Extract embed_url from trailer
    embed_url = row['trailer']['embed_url'] if row['trailer'] else None
    aired = row['aired']['string'] if row['aired'] else None
    # Extract large_image_url from images
    large_image_url = row['images']['jpg']['large_image_url'] if row['images'] else None
    
    return pd.Series([producer_names, licensors_names,studios_names,genres,themes,demographics,embed_url,aired, large_image_url])

# Apply the function to each row of the DataFrame
df1[['producers','licensors','studios','genres','themes','demographics','trailer','aired','image']] = df1.apply(extract_info, axis=1)


In [23]:

df1 = df1[~df1['genres'].apply(lambda x: x == [])]

In [25]:
df1 = df1.reset_index(drop=True)

**DATA CLEANING**

In [26]:
import re

pattern = r"\[Written by MAL Rewrite\]|\(.*Source:.*\)" 

# Removing the pattern using regular expressions
df1['synopsis'] = df1['synopsis'].str.replace(pattern, '', regex=True).values


In [27]:
def remove_newline_numbers(text):
    text = text.replace('\n', ' ')
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    return text.lower()

In [28]:
df1['synopsis_cleaned'] = df1.synopsis.apply(remove_newline_numbers)

In [29]:
import spacy
nlp = spacy.load('en_core_web_sm')
df1['synopsis_cleaned'] = df1['synopsis_cleaned'].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if not token.is_stop]))

In [30]:
rating_map = {
    "PG-13 - Teens 13 or older": "PG-13",
    "R - 17+ (violence & profanity)": "R17",
    "Rx - Hentai": "Rx",
    "R+ - Mild Nudity": "R+",
    "G - All Ages": "G",
    "PG - Children": "PG"
}

# Use the map to replace the values in the 'rating' column
df1['rating'] = df1['rating'].replace(rating_map)

In [31]:
df1['themes'] = df1['themes'].apply(lambda x:["unknown_theme"] if x == [] else x )
df1['demographics'] = df1['demographics'].apply(lambda x:["unknown_demographics"] if x == [] else x )



In [32]:
def get_season(x):
    spring = ["Mar","Apr","May"]
    summer = ["Jun","Jul","Aug"]
    fall = ["Sep","Oct","Nov"]
    winter = ["Dec","Jan","Feb"]
    y = x[:3]
    if y in spring:
        return "spring"
    elif y in winter:
        return "winter"
    elif y in fall:
        return "fall"
    elif y in summer:
        return "summer"

In [33]:
df1.season = df1.aired.apply(get_season)

In [34]:
df1.year = df1.aired.str.split(',').str[1].str[1:5]

In [35]:
def fill_na(row):
    if pd.isna(row['year']):
        if len(row['aired']) == 4:
            return row['aired']
        elif len(row['aired']) == 12:
            return row['aired'][:4]
        else:
            return row['aired'][4:8]
    else:
        return row['year']

# Apply the function to each row of the DataFrame
df1['year'] = df1.apply(fill_na, axis=1)

In [36]:
df1.season = df1.season.fillna("unknownseason")

**Selecting only relevant columns**

In [37]:
data = df1[['mal_id', 'url', 'trailer', 'title',
       'title_english', 'type', 'source',
       'episodes', 'status', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'synopsis','synopsis_cleaned',
       'background', 'season', 'year', 'producers', 'licensors',
       'studios', 'genres', 'themes', 'demographics',
       'image']] 

In [38]:
data['producers'] = data['producers'].apply(lambda x: ','.join(x))
data['licensors'] = data['licensors'].apply(lambda x: ','.join(x))
data['genres'] = data['genres'].apply(lambda x: ','.join(x))
data['studios'] = data['studios'].apply(lambda x: ','.join(x))
data['themes'] = data['themes'].apply(lambda x: ','.join(x))
data['demographics'] = data['demographics'].apply(lambda x: ','.join(x))

In [39]:
data = data[~(data.genres.str.contains("Hentai")|data.genres.str.contains("Erotica")|data.genres.str.contains("Boys Love")|data.genres.str.contains("Girls Love"))]

In [40]:
# Assuming df is your DataFrame and 'genres' is your column with the genres
genre_counts = {}
for row in data['genres']:
    for genre in row.split(','):
        if genre in genre_counts:
            genre_counts[genre] += 1
        else:
            genre_counts[genre] = 1

print(genre_counts)

{'Action': 3239, 'Award Winning': 192, 'Sci-Fi': 2025, 'Adventure': 2006, 'Drama': 1702, 'Mystery': 668, 'Supernatural': 899, 'Fantasy': 2435, 'Sports': 386, 'Comedy': 3501, 'Romance': 1490, 'Slice of Life': 655, 'Suspense': 276, 'Ecchi': 699, 'Gourmet': 82, 'Avant Garde': 134, 'Horror': 318}


In [41]:
data = data[data.favorites != 0]


In [42]:
data = data.reset_index(drop=True)

**FEATURE ENGINEERING**

In [43]:
genres_df = data.genres.str.get_dummies(sep=',')
studios_df = data.studios.str.get_dummies(sep=',')
themes_df = data.themes.str.get_dummies(sep=',')
demographics_df = data.demographics.str.get_dummies(sep=',')


In [44]:
status_df = data.status.str.get_dummies()
season_df = data.season.str.get_dummies()
type_df = data.type.str.get_dummies()
source_df = data.source.str.get_dummies()
rating_df = data.rating.str.get_dummies()

In [45]:
data.year = data.year.astype('int')

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
vectorizer = TfidfVectorizer()  # Adjust max_features as needed
overview_matrix = vectorizer.fit_transform(data['synopsis_cleaned'])

In [48]:
overview_matrix.shape

(8282, 30104)

In [49]:
overview_matrix = overview_matrix.toarray()
overview_df = pd.DataFrame(overview_matrix)

In [50]:
data.shape

(8282, 30)

In [51]:
from sklearn.decomposition import PCA
num_components = 1000

# Applying PCA for dimensionality reduction
pca = PCA(n_components=num_components)
pca_data = pca.fit_transform(overview_df)

In [52]:
pca_data = pd.DataFrame(pca_data)

In [53]:
pca_data.shape

(8282, 1000)

In [54]:
combined_features = pd.concat([pca_data,source_df,type_df,genres_df,demographics_df,themes_df],axis=1)

In [55]:
combined_features.shape

(8282, 1100)

In [56]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(combined_features)

In [57]:
def recommend(anime):
    index = data[(data['title'] == anime) | (data['title_english'] == anime)].index[0]
    distances = sorted(list(enumerate(similarity_matrix[index])),reverse=True,key= lambda x:x[1])
    for i in distances[1:10]:
        
        print(data.iloc[i[0]].title,"---",i[1])

In [58]:
similarity_matrix.shape

(8282, 8282)

In [59]:
recommend("Kimetsu no Yaiba")

Kimetsu no Yaiba: Katanakaji no Sato-hen --- 0.903423512700235
Kimetsu no Yaiba: Yuukaku-hen --- 0.899187587133926
Kimetsu no Yaiba: Mugen Ressha-hen --- 0.8924210926554421
Nokemono-tachi no Yoru --- 0.814109839130209
Senkaiden Houshin Engi --- 0.8068999284936963
Jujutsu Kaisen --- 0.8049670809151329
Kuroshitsuji II --- 0.7674770044027059
Vanitas no Karte Part 2 --- 0.7652391631556102
Orient: Awajishima Gekitou-hen --- 0.7636686633800215
Orient --- 0.7604917951896554
Kuroshitsuji: Book of Circus --- 0.7598097775516753
Vanitas no Karte --- 0.7595739087958543
Sengoku Youko: Yonaoshi Kyoudai-hen --- 0.7555765453045107
Kimetsu no Yaiba Movie: Mugen Ressha-hen --- 0.7523948189420188
Kuroshitsuji --- 0.7517918804587607
Ragna Crimson --- 0.7369640768207604
Chainsaw Man --- 0.7340667811317794
Yu☆Gi☆Oh! Zexal Second --- 0.7325426954989877
Jigokuraku --- 0.7277655574252128


In [251]:
recommend("Kuroko no Basket 2nd Season")

Kuroko no Basket --- 0.9691568972602083
Kuroko no Basket 3rd Season --- 0.9687037260848341
Slam Dunk --- 0.9439673446049535
Diamond no Ace --- 0.93550232388149
Ahiru no Sora --- 0.934448680491423
Diamond no Ace: Second Season --- 0.9340136132602974
Haikyuu!! To the Top Part 2 --- 0.9302775634989314
Diamond no Ace: Act II --- 0.9265694734604045
Haikyuu!! Second Season --- 0.9256945364715556
Haikyuu!! --- 0.9216046686427005
Haikyuu!! Karasuno Koukou vs. Shiratorizawa Gakuen Koukou --- 0.9214556282627179
Whistle! --- 0.9211156944114792
Days (TV) --- 0.9187966364459417
Haikyuu!! To the Top --- 0.9161523672294373
Boukyaku Battery (TV) --- 0.9146680564328475
Eyeshield 21 --- 0.855457549995078
Major 2nd 2nd Season --- 0.8516103184401271
Area no Kishi --- 0.8505417088329088
Blue Lock --- 0.8493604356880786


In [252]:
recommend("One Piece")

Magi: Sinbad no Bouken (TV) --- 0.9452598135230995
Nanatsu no Taizai: Mokushiroku no Yonkishi --- 0.9443174234699682
Magi: The Labyrinth of Magic --- 0.9436854209752565
Dragon Quest: Dai no Daibouken (TV) --- 0.9430641750624058
Magi: The Kingdom of Magic --- 0.9374976055778587
Fairy Tail (2014) --- 0.9346276844349698
Bleach: Sennen Kessen-hen --- 0.933754737332413
Bleach: Sennen Kessen-hen - Ketsubetsu-tan --- 0.9325351125259214
Nanatsu no Taizai: Seisen no Shirushi --- 0.9319725272022331
Nanatsu no Taizai: Imashime no Fukkatsu --- 0.9303485466587573
Fairy Tail: Final Series --- 0.9302970043233261
Hunter x Hunter --- 0.9298131615300326
Hunter x Hunter (2011) --- 0.928512205875408
Fairy Tail --- 0.9280766630761886
Nanatsu no Taizai: Kamigami no Gekirin --- 0.9280511388452618
Nanatsu no Taizai: Funnu no Shinpan --- 0.9244205861938114
Dragon Quest: Dai no Daibouken (2020) --- 0.9243487350851174
Bleach --- 0.9235907784165057
Nanatsu no Taizai --- 0.9220673434788279


**Saving the model and anime_data**

In [253]:
import pickle
pickle.dump(data,open('anime.pkl','wb'))
pickle.dump(similarity_matrix,open('similarity.pkl','wb'))