In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load dataset

In [6]:
# Read the anime dataset from the project-relative data folder and preview it.
dataset_path = 'data/anime_dataset.csv'
df = pd.read_csv(dataset_path)
df.head(5)

Unnamed: 0,title,score,genres,episodes,synopsis,popularity,members,studios,year
0,Attack on Titan,8.57,"['Action', 'Award Winning', 'Drama', 'Suspense...",25.0,"Centuries ago, mankind was slaughtered to near...",1,4245518,['Wit Studio'],2013.0
1,Death Note,8.62,"['Supernatural', 'Suspense', 'Psychological', ...",37.0,"Brutal murders, petty thefts, and senseless vi...",2,4186098,['Madhouse'],2006.0
2,Fullmetal Alchemist: Brotherhood,9.1,"['Action', 'Adventure', 'Drama', 'Fantasy', 'M...",64.0,After a horrific alchemy experiment goes wrong...,3,3588803,['Bones'],2009.0
3,One-Punch Man,8.48,"['Action', 'Comedy', 'Adult Cast', 'Parody', '...",12.0,The seemingly unimpressive Saitama has a rathe...,4,3443899,['Madhouse'],2015.0
4,Demon Slayer: Kimetsu no Yaiba,8.42,"['Action', 'Award Winning', 'Supernatural', 'H...",26.0,"Ever since the death of his father, the burden...",5,3340293,['ufotable'],2019.0


In [7]:
# Missing values per column (isna is the same check as isnull).
df.isna().sum()

title           0
score           0
genres          0
episodes        3
synopsis        0
popularity      0
members         0
studios         0
year          168
dtype: int64

In [10]:
# Count rows that exactly repeat an earlier row (all columns are compared).
df.duplicated().sum()

np.int64(0)

## Data Cleaning

In [16]:
# Drop the few rows without an episode count, then label missing years as 'Unknown'.
# Note: dropna keeps the original index labels, so the index is no longer contiguous
# after this step, and filling with a string turns `year` into an object column.
df = (
    df
    .dropna(subset=['episodes'])
    .assign(year=lambda frame: frame['year'].fillna('Unknown'))
)

In [17]:
# Confirm the row count, dtypes and non-null counts after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1047 entries, 0 to 1049
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       1047 non-null   object 
 1   score       1047 non-null   float64
 2   genres      1047 non-null   object 
 3   episodes    1047 non-null   float64
 4   synopsis    1047 non-null   object 
 5   popularity  1047 non-null   int64  
 6   members     1047 non-null   int64  
 7   studios     1047 non-null   object 
 8   year        1047 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 81.8+ KB


## Modelling

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [38]:
# Replace missing text with empty strings so the concatenation below never yields NaN.
df['genres'] = df['genres'].fillna('')
df['studios'] = df['studios'].fillna('')
df['synopsis'] = df['synopsis'].fillna('')

# Combine text features for TF-IDF
df['combined_features'] = (
    df['genres'] + " " + df['studios'] + " " + df['synopsis']
)

# TF-IDF vectorization
# stop_words='english' removes common words like "the", "and", "is".
# max_features keeps only the 5000 terms with the highest frequency across the corpus.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Transform the combined text into a TF-IDF matrix (one row per anime, in df row order)
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Compute similarity between all anime.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is the cosine similarity.
# Row/column i of cosine_sim corresponds to the i-th ROW POSITION of df.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Build an index for quick lookup by title
# Create a lowercase version for case-insensitive matching
df['title_lower'] = df['title'].str.lower()

# Map each title to its row POSITION, not its index label: rows were dropped during
# cleaning, so df.index has gaps and its labels no longer line up with cosine_sim
# rows or with df.iloc.
indices = pd.Series(range(len(df)), index=df['title_lower'])

# Keep the first occurrence of a repeated title so a lookup always returns a scalar.
# (Series.drop_duplicates would de-duplicate the positions, which are already unique.)
indices = indices[~indices.index.duplicated(keep='first')]

TF-IDF matrix shape: (1047, 5000)


## Recommendation function

In [39]:
def recommend(title, top_n=5):
    """Return the anime most similar to ``title`` by TF-IDF cosine similarity.

    Relies on the notebook-level objects ``df``, ``indices`` and ``cosine_sim``.
    ``indices`` is used to look up the row of ``cosine_sim`` for the title, and
    the ranked column positions are used with ``df.iloc``.

    Parameters
    ----------
    title : str
        Anime title; matching is case-insensitive.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The ``title``, ``score``, ``genres`` and ``year`` columns of the most
        similar anime, best match first, or an error message string when the
        title is not present in ``indices``.
    """
    title_lower = title.lower()
    if title_lower not in indices:
        return "Anime not found, check spelling."

    idx = indices[title_lower]
    # A title that appears more than once makes the lookup return a Series;
    # fall back to its first entry so idx is always a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every anime by similarity to the query (descending). sorted() is
    # stable, so ties keep their original row order.
    ranked = sorted(
        enumerate(cosine_sim[idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Exclude the query by position rather than assuming it is ranked first:
    # another anime with an equal score can sort ahead of it.
    anime_indices = [pos for pos, _ in ranked if pos != idx][:max(top_n, 0)]

    # Return relevant columns for display
    return df[['title', 'score', 'genres', 'year']].iloc[anime_indices]
    

In [42]:
# The lookup is case-insensitive: recommend() lower-cases the title before matching.
Example_title = "death note"
recommend(Example_title)

Unnamed: 0,title,score,genres,year
63,Soul Eater,7.85,"['Action', 'Comedy', 'Fantasy', 'School', 'Sho...",2008.0
50,Death Parade,8.13,"['Drama', 'Fantasy', 'Suspense', 'Adult Cast',...",2015.0
725,B: The Beginning,7.14,"['Action', 'Mystery', 'Suspense', 'Detective',...",Unknown
152,K-On!,7.86,"['Comedy', 'CGDCT', 'Music', 'School']",2009.0
333,Bungo Stray Dogs 3,8.2,"['Action', 'Mystery', 'Adult Cast', 'Detective...",2019.0
