# Loading Data & EDA

In [1]:
import pandas as pd

# Read the anime catalogue from a CSV file located in the working directory.
anime = pd.read_csv(filepath_or_buffer="anime.csv")

# Preview the first five rows to confirm the load and inspect the columns.
anime.head(5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [2]:
# Count the missing entries in every column (isna is the pandas alias of isnull).
anime.isna().sum()


anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [3]:
# Replace missing genre/type entries with empty strings so the string
# concatenation below never produces NaN.
anime[['genre', 'type']] = anime[['genre', 'type']].fillna("")

# Build one text field per title: the genre list, a space, then the media type.
anime['text_content'] = anime['genre'].str.cat(anime['type'], sep=" ")

anime[['name', 'text_content']].head()

Unnamed: 0,name,text_content
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural Movie"
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili..."
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S..."
3,Steins;Gate,"Sci-Fi, Thriller TV"
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S..."


# Vectorization using TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer configuration: drop English stop words and cap the vocabulary
# at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")

# Learn the vocabulary from the combined text column and encode every row
# in a single pass.
text_corpus = anime['text_content']
tfidf_matrix = tfidf.fit_transform(text_corpus)

tfidf_matrix.shape

(12294, 51)

# Scaling numeric features

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns that get appended to the text features.
numeric_columns = ['rating', 'members']

# Missing numeric values are replaced with 0 before scaling.
anime[numeric_columns] = anime[numeric_columns].fillna(0)

# Min-max scale each column independently.
scaler = MinMaxScaler()
numeric_features = scaler.fit_transform(anime[numeric_columns])

numeric_features[:5]

array([[0.937     , 0.1978722 ],
       [0.926     , 0.7827701 ],
       [0.925     , 0.11268927],
       [0.917     , 0.66432491],
       [0.916     , 0.14918553]])

# Combining text and numeric features

In [8]:
from scipy.sparse import hstack

# Place the scaled numeric columns to the right of the sparse TF-IDF columns,
# giving a single feature row per title.
feature_blocks = (tfidf_matrix, numeric_features)
content_matrix = hstack(feature_blocks)

content_matrix.shape

(12294, 53)

# Compute cosine similarity

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument, cosine_similarity compares every row of the matrix
# against every row of that same matrix, yielding a square
# title-by-title similarity matrix.
cosine_sim = cosine_similarity(content_matrix)

cosine_sim.shape


(12294, 12294)

# Recommendation System

In [20]:
def recommend_anime(title, cosine_sim=cosine_sim, top_n=10):
    """Return the names of the anime most similar to ``title``.

    Parameters
    ----------
    title : str
        Name of the anime to look up. Matching is case-insensitive.
    cosine_sim : array-like, optional
        Square pairwise similarity matrix whose row/column order matches the
        row order of the ``anime`` DataFrame. Defaults to the matrix that
        exists when this function is defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series or str
        Names of the ``top_n`` most similar titles, ordered from most to
        least similar, with a fresh 0..n-1 index. If the title is not
        present, the string ``"Anime not found in dataset!"`` is returned.
    """
    not_found = "Anime not found in dataset!"

    # Anything that is not a string cannot match a name.
    if not isinstance(title, str):
        return not_found

    # Use one case-insensitive comparison for both the existence check and
    # the lookup, so "steins;gate" resolves the same way as "Steins;Gate".
    # Positions (not index labels) are collected because cosine_sim and
    # .iloc are both addressed positionally.
    matches = (anime['name'].str.lower() == title.lower()).to_numpy().nonzero()[0]
    if len(matches) == 0:
        return not_found
    idx = matches[0]

    # Drop the queried title explicitly instead of assuming it sorts first:
    # when another row ties with it (identical features), a positional
    # "skip the first entry" would discard the wrong row and return the
    # queried title as its own recommendation.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep dataset order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    anime_indices = [position for position, _ in sim_scores[:top_n]]

    return anime.iloc[anime_indices]["name"].reset_index(drop=True)




# Test

In [21]:
# Smoke test: request recommendations for a title known to be in the dataset.
recommend_anime(title="Steins;Gate")

0           Steins;Gate Movie: Fuka Ryouiki no Déjà vu
1                Steins;Gate: Oukoubakko no Poriomania
2    Steins;Gate: Kyoukaimenjou no Missing Link - D...
3                                      Higashi no Eden
4                                        Under the Dog
5                                 Fate/Zero 2nd Season
6                                    Zankyou no Terror
7                                           Gankutsuou
8                           Mahou Shoujo Madoka★Magica
9                Re:Zero kara Hajimeru Isekai Seikatsu
Name: name, dtype: object