                                                RECOMMENDATIONS SYSTEMS

Load the Dataset :

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
anime=pd.read_csv("anime.csv")
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


Basic Data Exploration : 

In [3]:
anime.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [4]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
anime.shape

(12294, 7)

In [6]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


Handling Missing Values :

In [9]:
anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [23]:
anime['genre'] = anime['genre'].fillna('Unknown').astype(str)
anime['type'] = anime['type'].fillna('Unknown').astype(str)


In [25]:
anime['rating'] = pd.to_numeric(anime['rating'], errors='coerce')


In [26]:
anime['rating'] = anime['rating'].fillna(anime['rating'].mean())

In [27]:
anime.dtypes
anime.isnull().sum()


anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [28]:
# Missing values are handled

FEATURE EXTRACTION :

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
genre_features = tfidf.fit_transform(anime['genre'])


Combine Text + Numerical Features :

In [46]:
anime['combined_text'] = anime['genre'] + ' ' + anime['type']


Normalize Numerical Features :

In [37]:
anime['episodes'] = anime['episodes'].replace('Unknown', np.nan)
anime['episodes'] = pd.to_numeric(anime['episodes'], errors='coerce')
anime['episodes'] = anime['episodes'].fillna(0)


In [38]:
anime['episodes'].isnull().sum()
anime['episodes'].dtype


dtype('float64')

MinMax Scaling for Numerical Features :

In [39]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numerical_features = scaler.fit_transform(
    anime[['rating', 'episodes', 'members']]
)


Combine All Features:

In [47]:
from scipy.sparse import hstack

final_features = hstack([text_features, numerical_features])


Cosine Similarity :

In [44]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(final_features)


Recommendation Function:

In [48]:
def recommend_anime(anime_name, df, similarity_matrix, top_n=5, threshold=0.3):
    if anime_name not in df['name'].values:
        return "Anime not found in dataset"

    idx = df[df['name'] == anime_name].index[0]

    similarity_scores = list(enumerate(similarity_matrix[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    filtered_scores = [
        score for score in similarity_scores[1:]
        if score[1] >= threshold
    ][:top_n]

    recommendations = df.iloc[[i[0] for i in filtered_scores]][
        ['name', 'genre', 'rating', 'type']
    ]

    return recommendations


In [49]:
recommend_anime("Naruto", anime, cosine_sim, top_n=5)


Unnamed: 0,name,genre,rating,type
615,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",7.94,TV
206,Dragon Ball Z,"Action, Adventure, Comedy, Fantasy, Martial Ar...",8.32,TV
346,Dragon Ball,"Adventure, Comedy, Fantasy, Martial Arts, Shou...",8.16,TV
588,Dragon Ball Kai,"Action, Adventure, Comedy, Fantasy, Martial Ar...",7.95,TV
1930,Dragon Ball Super,"Action, Adventure, Comedy, Fantasy, Martial Ar...",7.4,TV


Threshold Experimentation :

In [50]:
recommend_anime("Naruto", anime, cosine_sim, threshold=0.2)
recommend_anime("Naruto", anime, cosine_sim, threshold=0.4)
recommend_anime("Naruto", anime, cosine_sim, threshold=0.6)


Unnamed: 0,name,genre,rating,type
615,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",7.94,TV
206,Dragon Ball Z,"Action, Adventure, Comedy, Fantasy, Martial Ar...",8.32,TV
346,Dragon Ball,"Adventure, Comedy, Fantasy, Martial Arts, Shou...",8.16,TV
588,Dragon Ball Kai,"Action, Adventure, Comedy, Fantasy, Martial Ar...",7.95,TV
1930,Dragon Ball Super,"Action, Adventure, Comedy, Fantasy, Martial Ar...",7.4,TV


In [51]:
#Higher threshold → better quality but fewer results
#Lower threshold → more results but less precise

Performance Analysis & Improvement Areas :

In [None]:


#Strengths

#Simple and fast
#No user interaction data required
#Effective for content similarity
#Easy to interpret

#Limitations

#No personalization
#Cold-start problem
#Depends heavily on feature quality
#Cannot learn evolving user preferences

Areas of Improvement :

In [52]:
#Hybrid Recommendation System

#Combine content-based + collaborative filtering
#Better Text Representation
#Use Word2Vec, FastText, or BERT embeddings

#Feature Weighting

#Give higher weight to genre than rating
#User Feedback Integration
#Incorporate user ratings and watch history

#Dimensionality Reduction

#Apply PCA to reduce sparsity and noise

Interview questions :

In [54]:
#  Type	                                                         How it works	
#User-Based  :	Finds users who are similar to the target user based on their past ratings/behaviors, then recommends items liked by similar users.
                #User-based is more dynamic but slower on large datasets

#Item-Based	:   Finds items similar to the ones the target user liked, based on the ratings of all users, then recommends similar items.
                #Item-based is more scalable and stable over time

Collaborative filtering :

In [55]:
#Collaborative filtering is a recommender system technique that suggests items to a user based on the preferences or behaviors of many users.
#It does not require content features, only the user-item interaction matrix.

#How it is Working:

#Build a user-item matrix (rows = users, columns = items, values = ratings or interactions).

#Measure similarity:
#Between users → user-based
#Between items → item-based

#Common metrics: Cosine similarity, Pearson correlation
#Predict ratings or preferences for unknown items using weighted averages of similar users/items.
#Recommend top-N items with the highest predicted rating.
