In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [2]:
anime = pd.read_csv("/Users/utkarshhajare/DS material/Assignements/Recommendation System/anime.csv")

# Data Preprocessing

In [3]:
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [4]:
anime.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

### Handle missing values

In [5]:
anime['genre'] = anime['genre'].fillna('')
anime['rating'] = anime['rating'].fillna(anime['rating'].mean())

# Feature Extraction

### Create binary matrix for genres

In [6]:
genres = anime['genre'].str.get_dummies(', ')
ratings = anime['rating'].values.reshape(-1, 1)

### Normalize ratings

In [7]:
scaler = MinMaxScaler()
normalized_ratings = scaler.fit_transform(ratings)

### Combine genres and ratings

In [8]:
features = np.hstack([genres.values, normalized_ratings])

# Recommendation System

### Compute cosine similarity matrix

In [9]:
cosine_sim = cosine_similarity(features)

### Recommend for 'Kimi no Na wa.

In [10]:
anime_title = 'Kimi no Na wa.'
idx = anime.index[anime['name'] == anime_title].tolist()[0]

### Get similarity scores, sort, and filter

In [11]:
sim_scores = list(enumerate(cosine_sim[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores_05 = sim_scores[:5]  # Top 5 for threshold ~0.5 (simplified)
sim_scores_07 = [score for score in sim_scores if score[1] > 0.7][:5]  # Top 5 for threshold 0.7
anime_indices_05 = [i[0] for i in sim_scores_05]
anime_indices_07 = [i[0] for i in sim_scores_07]
recommendations_05 = anime['name'].iloc[anime_indices_05].tolist()
recommendations_07 = anime['name'].iloc[anime_indices_07].tolist()

In [12]:
print("\nRecommendations for 'Kimi no Na wa.' (threshold ~0.5):", recommendations_05)
print("Recommendations for 'Kimi no Na wa.' (threshold 0.7):", recommendations_07)


Recommendations for 'Kimi no Na wa.' (threshold ~0.5): ['Kimi no Na wa.', 'Wind: A Breath of Heart OVA', 'Wind: A Breath of Heart (TV)', 'Aura: Maryuuin Kouga Saigo no Tatakai', 'Kokoro ga Sakebitagatterunda.']
Recommendations for 'Kimi no Na wa.' (threshold 0.7): ['Kimi no Na wa.', 'Wind: A Breath of Heart OVA', 'Wind: A Breath of Heart (TV)', 'Aura: Maryuuin Kouga Saigo no Tatakai', 'Kokoro ga Sakebitagatterunda.']


# Evaluation

### Split data into train and test

In [13]:
train_data, test_data = train_test_split(anime, test_size=0.2, random_state=42)

### Evaluate for first test anime

In [14]:
test_idx = test_data.index[0]
test_anime_genres = set(anime.iloc[test_idx]['genre'].split(', '))
relevant_anime = anime.index[anime['genre'].apply(lambda x: bool(set(x.split(', ')) & test_anime_genres))].tolist()

### Get recommendations for test anime

In [16]:
test_anime_title = anime.iloc[test_idx]['name']
test_idx_all = anime.index[anime['name'] == test_anime_title].tolist()[0]  # Assume anime exists
sim_scores_test = list(enumerate(cosine_sim[test_idx_all]))
sim_scores_test = sorted(sim_scores_test, key=lambda x: x[1], reverse=True)
sim_scores_test = sim_scores_test[:5]  # Top 5 for evaluation
recommended_indices = [i[0] for i in sim_scores_test]

### Calculate metrics using set operations

In [17]:
true_positives = len(set(recommended_indices).intersection(set(relevant_anime)))
precision = true_positives / len(recommended_indices)
recall = true_positives / len(relevant_anime)
f1 = 2 * (precision * recall) / (precision + recall + 1e-10) # Avoid division by zero

In [18]:
print("\nEvaluation Metrics for", test_anime_title, ":")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")


Evaluation Metrics for Suzy&#039;s Zoo: Daisuki! Witzy - Happy Birthday :
Precision: 1.00
Recall: 0.00
F1-Score: 0.01


# Interview Questions

### 1. Can you explain the difference between user-based and item-based collaborative filtering?
* **User-based:** Recommends items to a user by finding other users with similar tastes. For example, if you and your friend like the same anime, the system suggests anime your friend watched but you haven’t
* **Item-based:** Recommends items similar to ones you already liked. For example, if you liked “Naruto,” it suggests anime like “Bleach” because they have similar genres or ratings.

### 2. What is collaborative filtering, and how does it work?
* **What it is:** Collaborative filtering is a technique used by recommendation systems to predict what a user will like based on the preferences or tastes of other users (or the similarity between items). It "filters" information by collaborating among different users' data.
* **How it works:** It works on the idea that if two people agree on the ratings of an item, they are likely to agree on the ratings of other items as well. Or, if two items are often liked by the same people, they are probably similar. It builds a model from a user's past behavior (like items they bought or rated) and similar decisions made by other users. It doesn't need to know anything about the items themselves (like their genres or actors) unless it's a "hybrid" system. It focuses on the patterns of interaction between users and items.