## Anime Recommendation System Assignment


In [115]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
# Load the anime catalogue; the path is relative to the notebook's working directory.
df = pd.read_csv("anime.csv")

In [116]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [117]:
# Column dtypes and non-null counts; note that `episodes` is parsed as object, not numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [118]:
# Summary statistics for the numeric columns (anime_id, rating, members).
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [119]:
# Count missing values per column before any imputation.
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [120]:
# The dataset has the columns anime_id, name, genre, type, episodes, rating, and members.
# Some values are missing: rating (230 rows), genre (62 rows), and type (25 rows).

## Handle Missing Values

In [121]:
# Impute missing values so that later feature construction never sees NaN.
# Categorical text columns get an explicit 'Unknown' label.
for column in ('genre', 'type'):
    df[column] = df[column].fillna('Unknown')
# Numeric columns get their median, which is robust to the heavy right skew of `members`.
# (`members` currently has no missing values; it is filled defensively.)
for column in ('rating', 'members'):
    df[column] = df[column].fillna(df[column].median())

In [122]:
# Confirm that no missing values remain after imputation.
df.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [123]:
# We fill missing values to avoid errors during vectorization or modeling:
# Replace missing genres and types with 'Unknown'.
# Replace missing ratings with the median rating to preserve central tendency without being skewed by outliers.

## Feature Extraction and Scaling

In [124]:
# Vectorize genre column
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
genre_matrix = vectorizer.fit_transform(df['genre'])



In [125]:
# One-hot (multi-label) encoding of genres: one 0/1 column per genre label.
genre_dummies = df['genre'].str.get_dummies(sep=", ")

# Rescale the numeric features to [0, 1] so they share a scale with the 0/1 genre flags.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df[['rating', 'members']])
# Reuse df's index: pd.concat(axis=1) aligns on index labels, so a default RangeIndex
# would silently introduce NaN rows if df were ever filtered or re-indexed.
scaled_df = pd.DataFrame(
    scaled_features,
    columns=['rating_scaled', 'members_scaled'],
    index=df.index,
)

# Final per-anime feature vectors: genre flags followed by the two scaled numeric columns.
feature_matrix = pd.concat([genre_dummies, scaled_df], axis=1)

In [126]:
# Genres are one-hot encoded and combined with the min-max-scaled rating and member count.

# Cosine similarity over these feature vectors identifies anime with similar genres, ratings, and popularity.

## Compute Cosine Similarity

In [127]:
# Pairwise cosine similarity between all rows of the feature matrix.
# The result is a dense n x n array, so memory grows quadratically with the number of rows.
cosine_sim = cosine_similarity(feature_matrix)

# Title -> row-position lookup. Positions (not index labels) are stored because
# `cosine_sim` rows and the later `.iloc` lookups are positional.
anime_indices = pd.Series(np.arange(len(df)), index=df['name'])
# De-duplicate on the *title* (the Series index), keeping the first occurrence.
# `Series.drop_duplicates()` would compare the values instead, which are all unique,
# so repeated titles would survive and a lookup would return a Series, not a scalar.
anime_indices = anime_indices[~anime_indices.index.duplicated(keep='first')]

In [128]:
# Inspect the similarity matrix (NumPy abbreviates the display of large arrays).
cosine_sim

array([[1.        , 0.31070403, 0.13939258, ..., 0.15027155, 0.15431875,
        0.17306034],
       [0.31070403, 1.        , 0.35855939, ..., 0.11281034, 0.11583922,
        0.12989711],
       [0.13939258, 0.35855939, 1.        , ..., 0.11686118, 0.12000991,
        0.1345863 ],
       ...,
       [0.15027155, 0.11281034, 0.11686118, ..., 1.        , 0.99994581,
        0.99824985],
       [0.15431875, 0.11583922, 0.12000991, ..., 0.99994581, 1.        ,
        0.99881138],
       [0.17306034, 0.12989711, 0.1345863 , ..., 0.99824985, 0.99881138,
        1.        ]])

## Recommendation Function

In [129]:
def recommend_anime(anime_name, top_n=10):
    """Return the ``top_n`` titles most similar to ``anime_name``.

    Similarity scores are read from the precomputed ``cosine_sim`` matrix.
    The query title itself is never part of the result, even when another
    title ties with it on similarity.

    Parameters
    ----------
    anime_name : str
        Title to look up in ``anime_indices``.
    top_n : int, default 10
        Maximum number of recommendations; values <= 0 give an empty result.

    Returns
    -------
    pandas.DataFrame
        The ``name``, ``genre``, ``rating`` and ``members`` columns of the
        recommended rows, most similar first.

    Raises
    ------
    KeyError
        If ``anime_name`` is not present in ``anime_indices``.
    """
    try:
        idx = anime_indices[anime_name]
    except KeyError:
        raise KeyError(f"Anime {anime_name!r} not found in the dataset") from None
    # A title that appears more than once makes the lookup return a Series;
    # fall back to its first occurrence.
    if hasattr(idx, 'iloc'):
        idx = idx.iloc[0]
    idx = int(idx)

    # Drop the query row explicitly instead of assuming it sorts first: with tied
    # scores the stable sort may place another row ahead of it.
    candidates = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_positions = [i for i, _ in candidates[:max(top_n, 0)]]
    return df[['name', 'genre', 'rating', 'members']].iloc[top_positions]


In [130]:
# The function returns the top_n anime most similar to the given title, based on genre, rating, and member count.

## Evaluation Function

In [131]:
def evaluate_precision_recall(anime_name, k=10):
    """Score the top-``k`` recommendations for ``anime_name`` by genre overlap.

    A title counts as *relevant* when it shares at least one genre label with
    the query title. The query title itself is excluded both from the
    recommendations and from the pool of relevant titles, since it can never
    be recommended.

    Parameters
    ----------
    anime_name : str
        Title to look up in ``anime_indices``.
    k : int, default 10
        Number of top-ranked recommendations to evaluate.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)`` where precision is relevant hits divided
        by ``k``, recall is relevant hits divided by the number of relevant
        titles in the dataset, and f1 is their harmonic mean. All three are
        0.0 when ``k <= 0``.

    Raises
    ------
    KeyError
        If ``anime_name`` is not present in ``anime_indices``.
    """
    try:
        idx = anime_indices[anime_name]
    except KeyError:
        raise KeyError(f"Anime {anime_name!r} not found in the dataset") from None
    # A title that appears more than once makes the lookup return a Series;
    # fall back to its first occurrence.
    if hasattr(idx, 'iloc'):
        idx = idx.iloc[0]
    idx = int(idx)

    # Nothing to evaluate; also avoids dividing by zero below.
    if k <= 0:
        return 0.0, 0.0, 0.0

    # Parse every genre string once and flag the rows that overlap with the query.
    genre_sets = [set(genres.split(', ')) for genres in df['genre']]
    target_genres = genre_sets[idx]
    is_relevant = [
        position != idx and bool(target_genres & genres)
        for position, genres in enumerate(genre_sets)
    ]

    # Drop the query row explicitly instead of assuming it sorts first: with tied
    # scores the stable sort may place another row ahead of it.
    ranked = sorted(
        (pair for pair in enumerate(cosine_sim[idx]) if pair[0] != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    recommended_indices = [position for position, _ in ranked[:k]]

    relevant_count = sum(1 for position in recommended_indices if is_relevant[position])
    total_relevant_items = sum(is_relevant)

    precision = relevant_count / k
    recall = relevant_count / total_relevant_items if total_relevant_items > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

    return precision, recall, f1


In [132]:
# Precision: Fraction of recommended anime that are genre-relevant.

# Recall: Fraction of relevant anime that were recommended.

# F1-score: Harmonic mean of precision and recall.

## Test Evaluation for Different k Values

In [133]:
# Sweep the cutoff k from 5 to 30 in steps of 5 for one query title and report each metric.
target = 'Naruto'
for k in range(5, 35, 5):
    precision, recall, f1 = evaluate_precision_recall(target, k=k)
    print(f"k={k} => Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")


k=5 => Precision: 1.0000, Recall: 0.0007, F1: 0.0014
k=10 => Precision: 1.0000, Recall: 0.0014, F1: 0.0028
k=15 => Precision: 1.0000, Recall: 0.0021, F1: 0.0043
k=20 => Precision: 1.0000, Recall: 0.0029, F1: 0.0057
k=25 => Precision: 1.0000, Recall: 0.0036, F1: 0.0071
k=30 => Precision: 1.0000, Recall: 0.0043, F1: 0.0085


In [134]:
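# As k increases, recall increases, but precision may decrease. In this run precision stays at 1.0 because any title sharing at least one genre with the query counts as relevant.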

In [135]:
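# We developed a content-based anime recommendation system using cosine similarity over genre, rating, and member-count features.
# Evaluation was done using precision, recall, and F1-score.
# Precision was 1.0 at every tested k, but recall stayed below 0.5% because roughly 7,000 titles share at least one genre with the query, so a list of at most 30 recommendations can cover only a small fraction of them.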

## INTERVIEW QUESTIONS 

## 1. Can you explain the difference between user-based and item-based collaborative filtering?

In [136]:
# User-based: looks for similar users to make recommendations.
# Item-based: looks for similar items based on co-occurrence.
# User-based collaborative filtering recommends items liked by users similar to you.
# Item-based collaborative filtering recommends items similar to what you liked.

## 2. What is collaborative filtering, and how does it work?


In [137]:
# Collaborative filtering is a popular recommendation technique that uses the behavior of users (like ratings, clicks, purchases) to make predictions about what a user will like.

# If User A and User B both liked Naruto, and User A also liked One Piece, we might recommend One Piece to User B — even if B never heard of it. That's collaborative filtering.

# Create a user-item interaction matrix (e.g., ratings or likes).

# Compute similarities between users or items (using cosine similarity, Pearson, etc.).

# Predict unknown ratings or preferences using these similarities.

# Recommend items with the highest predicted score.