In [1]:
# importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Data Preprocessing:

##### Loading the dataset

In [2]:
data = pd.read_csv('anime.csv')
data

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [3]:
data.shape

(12294, 7)

In [4]:
data.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [6]:
# Handling thr missing values
data.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [7]:
data['genre'] = data['genre'].fillna('unknown')
data['type'] = data['type'].fillna(0)
data['rating'] = data['rating'].fillna(data['rating'].mean())

In [8]:
data.isna().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [9]:
data.duplicated().sum()

np.int64(0)

##### Explore the dataset

In [10]:
data.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12294.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.017096,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.9,225.0
50%,10260.5,6.55,1550.0
75%,24794.5,7.17,9437.0
max,34527.0,10.0,1013917.0


In [11]:
data.drop('anime_id',inplace=True,axis=1)

In [12]:
data.nunique()

name        12292
genre        3265
type            7
episodes      187
rating        599
members      6706
dtype: int64

In [13]:
data['type'].value_counts()

type
TV         3787
OVA        3311
Movie      2348
Special    1676
ONA         659
Music       488
0            25
Name: count, dtype: int64

#### Feature Extraction:

##### Decide on the features

###### we are using genre and converting to binary and using rating and members
###### these three features are using for computing

##### Convert categorical features into numerical

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer

In [15]:
#split genre strings into lists

data['genre'] = data['genre'].fillna('').apply(lambda x:x.split(','))

In [16]:
#convert genre into binary matrix

mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(data['genre'])

In [18]:
#create dataframe of genre features

genre_df = pd.DataFrame(genre_features,columns=mlb.classes_)

In [19]:
#select numeric features

num_features = data[['rating','members']]

##### Normalize numerical features

In [20]:
from sklearn.preprocessing import MinMaxScaler

In [21]:
scaler = MinMaxScaler()
numeric_scaled = scaler.fit_transform(num_features)

In [22]:
#convert back to dataframe

numeric_df = pd.DataFrame(numeric_scaled,columns=num_features.columns)

In [23]:
#combine genre and numeric features

features = np.concatenate([genre_df.values,numeric_df.values],axis=1)
features.shape

(12294, 85)

#### Recommendation System:

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
cosine_sim = cosine_similarity(features)
cosine_sim.shape

(12294, 12294)

In [27]:
def recommend_anime(anime_name,similarity_matrix,data,thershold=0.5,top_n=10):
    #check if anime exists
    if anime_name not in data['name'].values:
        return 'anime not found'

    #get index of the anime
    idx = data[data['name'] == anime_name].index[0]

    #get similarity score for this anime
    sim_scores = list(enumerate(similarity_matrix[idx]))

    #filter by thershold and sort by score
    sim_scores = [(i,score) for i,score in sim_scores if score >= thershold and i != idx]
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse = True)

    #get top N similar anime
    top_indices = [i for i, _ in sim_scores[:top_n]]

    #return their names and scores
    results = data.iloc[top_indices][['name','rating','members']]
    results['similarity_score'] = [score for _,score in sim_scores[:top_n]]

    return results

In [28]:
recommendations = recommend_anime('Kimi no Na wa.',cosine_sim,data,thershold=0.4,top_n=5)
print(recommendations)

                                                   name  rating  members  \
5805                        Wind: A Breath of Heart OVA    6.35     2043   
6394                       Wind: A Breath of Heart (TV)    6.14     7778   
504   Clannad: After Story - Mou Hitotsu no Sekai, K...    8.02   138364   
208                       Kokoro ga Sakebitagatterunda.    8.32    59652   
1201                     Angel Beats!: Another Epilogue    7.63   134180   

      similarity_score  
5805          0.983501  
6394          0.981822  
504           0.889102  
208           0.888324  
1201          0.887303  


In [29]:
# higher thershold = fewer but more similar anime
recommendations_strict = recommend_anime('Kimi no Na wa.',cosine_sim,data,thershold=0.7)
print(recommendations_strict)

# lower thershold = larger more diverse list
recommendations_loose = recommend_anime('Kimi no Na wa.',cosine_sim,data,thershold=0.3)
print(recommendations_loose)

                                                   name  rating  members  \
5805                        Wind: A Breath of Heart OVA    6.35     2043   
6394                       Wind: A Breath of Heart (TV)    6.14     7778   
504   Clannad: After Story - Mou Hitotsu no Sekai, K...    8.02   138364   
208                       Kokoro ga Sakebitagatterunda.    8.32    59652   
1201                     Angel Beats!: Another Epilogue    7.63   134180   
1435                                         True Tears    7.55   118644   
1907                                   Myself; Yourself    7.41   115075   
1631                                Kimikiss Pure Rouge    7.48    58211   
2300                         Koi to Senkyo to Chocolate    7.30    91552   
1494                                           Harmonie    7.52    29029   

      similarity_score  
5805          0.983501  
6394          0.981822  
504           0.889102  
208           0.888324  
1201          0.887303  
1435         

#### Evaluation:

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score,recall_score,f1_score

In [31]:
data['liked'] = (data['rating'] >= 8).astype(int)

In [32]:
train_data,test_data,train_idx,test_idx=train_test_split(data,data.index,test_size=0.2,random_state=42)

In [33]:
train_features = features[train_idx]
test_features = features[test_idx]

In [34]:
cosine_sim_train = cosine_similarity(train_features)

In [37]:
def evaluate_recommender(train_data,test_data,cosine_sim,top_n=10):
    y_true = []
    y_pred = []

    for i in range(len(test_data)):
        #true label
        y_true.append(test_data.iloc[i]['liked'])

        #get top N similar anime from training set
        sim_scores = list(enumerate(cosine_sim[i % len(train_data)]))
        sim_scores = sorted(sim_scores,key=lambda x:x[1],reverse=True)
        top_indices = [idx for idx, _ in sim_scores[:top_n]]

        #predicted liked if any of top N similar anime are liked
        pred = 1 if any (train_data.iloc[top_indices]['liked']==1) else 0
        y_pred.append(pred)

    #calculated metrics
    precision = precision_score(y_true,y_pred)
    recall =recall_score(y_true,y_pred)
    f1=f1_score(y_true,y_pred)

    print(f'precision:{precision}')
    print(f'recall:{recall}')
    print(f'f1-score:{f1}')

In [36]:
evaluate_recommender(train_data,test_data,cosine_sim_train,top_n=10)

precision:0.06133828996282528
recall:0.2426470588235294
f1-score:0.09792284866468842


## Interview Questions:

### 1. Can you explain the difference between user-based and item-based collaborative filtering?
     User-based collaborative filtering recommends items by finding users with similar tastes and suggesting what they liked
     Item-based collaborative filtering recommends items similar to those the user has already liked

### 2.What is collaborative filtering, and how does it work?
     Collaborative filtering is a recommendation technique that predicts a user's interests by analyzing preferences of similar users or items.
     It works by using patterns in user-item interactions (like ratings or views) to suggest items that people with similar behavior liked.