## Data Preprocessing

In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv('anime.csv')

In [4]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [142]:
df.size

86058

In [144]:
df.shape

(12294, 7)

In [6]:
df['genre'].unique()

array(['Drama, Romance, School, Supernatural',
       'Action, Adventure, Drama, Fantasy, Magic, Military, Shounen',
       'Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen',
       ..., 'Hentai, Sports', 'Drama, Romance, School, Yuri',
       'Hentai, Slice of Life'], dtype=object)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [8]:
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [9]:
df.shape

(12294, 7)

In [10]:
# Handle missing values
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [11]:
rating_mean=df['rating'].mean()


In [12]:
# Replace missing ratings with the column mean computed in the previous cell.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df['rating'] selection: the in-place form is
# chained assignment, which pandas 2.x warns about and which does not update
# df once Copy-on-Write is enabled.
df['rating']=df['rating'].fillna(rating_mean)

In [13]:
# Forward-fill missing 'type' values from the preceding row.
# Series.ffill() is the direct replacement for fillna(method='pad'), whose
# `method` argument is deprecated in recent pandas releases.
df['type']=df['type'].ffill()


In [14]:
# Forward-fill missing 'genre' values from the preceding row.
# Series.ffill() is the direct replacement for fillna(method='pad'), whose
# `method` argument is deprecated in recent pandas releases.
df['genre']=df['genre'].ffill()


In [15]:
df.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [16]:
## Exploring the dataset
df.rating.value_counts()

rating
6.473902    230
6.000000    141
7.000000     99
6.500000     90
6.250000     84
           ... 
3.470000      1
3.710000      1
3.870000      1
3.910000      1
3.140000      1
Name: count, Length: 599, dtype: int64

In [17]:
df.type.value_counts()

type
TV         3801
OVA        3315
Movie      2353
Special    1677
ONA         660
Music       488
Name: count, dtype: int64

In [18]:
df.genre.value_counts()

genre
Hentai                                                  823
Comedy                                                  525
Music                                                   304
Kids                                                    204
Comedy, Slice of Life                                   183
                                                       ... 
Adventure, Drama, Fantasy, Game, Sci-Fi                   1
Adventure, Demons, Fantasy, Historical                    1
Action, Comedy, Drama, Mecha, Music, Sci-Fi, Shounen      1
Action, Comedy, Fantasy, Mecha, Sci-Fi, Shounen           1
Hentai, Slice of Life                                     1
Name: count, Length: 3264, dtype: int64

In [19]:
df.groupby('genre')['rating'].mean().sort_values(ascending=False)

genre
Action, Adventure, Drama, Fantasy, Magic, Military, Shounen    9.26
Drama, Fantasy, Romance, Slice of Life, Supernatural           9.06
Drama, School, Shounen                                         9.05
Adventure, Drama, Supernatural                                 8.93
Drama, Music, Romance, School, Shounen                         8.92
                                                               ... 
Action, Demons, Seinen, Super Power                            3.32
Kids, Mecha                                                    3.26
Action, Adventure, Sci-Fi, Space, Super Power                  3.25
Dementia, Kids                                                 3.21
Comedy, Dementia, Fantasy, Horror, Music, Parody               2.55
Name: rating, Length: 3264, dtype: float64

In [20]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [21]:
len(df.anime_id.unique())

12294

## Feature Extraction

In [23]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [24]:
df1=df[['name','genre','rating']]

In [25]:
df1.head()

Unnamed: 0,name,genre,rating
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.37
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",9.25
3,Steins;Gate,"Sci-Fi, Thriller",9.17
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",9.16


In [26]:
len(df1.name.unique())

12292

In [27]:
# Name x genre matrix of ratings. Every distinct value of the 'genre' column
# (the full comma-separated string) becomes its own column, and pivot_table's
# default mean aggregation collapses rows that share the same name and genre.
df2=(
    df1
    .pivot_table(index='name',columns='genre',values='rating')
    .fillna(0)  # (name, genre) combinations absent from the data become 0
)

In [28]:
df2.head()

genre,Action,"Action, Adventure","Action, Adventure, Cars, Comedy, Sci-Fi, Shounen","Action, Adventure, Cars, Mecha, Sci-Fi, Shounen, Sports","Action, Adventure, Cars, Sci-Fi","Action, Adventure, Comedy","Action, Adventure, Comedy, Demons, Drama, Ecchi, Horror, Mystery, Romance, Sci-Fi","Action, Adventure, Comedy, Demons, Fantasy, Magic","Action, Adventure, Comedy, Demons, Fantasy, Magic, Romance, Shounen, Supernatural","Action, Adventure, Comedy, Demons, Fantasy, Martial Arts, Shounen, Super Power",...,Slice of Life,"Slice of Life, Space","Slice of Life, Supernatural",Space,Sports,"Super Power, Supernatural, Vampire",Supernatural,Thriller,Vampire,Yaoi
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;0&quot;,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Memoire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Movie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
df2.shape

(12292, 3264)

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
similarities=cosine_similarity(df2)
similarities

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [62]:
import numpy as np

In [64]:
np.unique(similarities)

array([0.        , 0.67232047, 0.6862307 , 0.72738396, 0.74026022,
       1.        , 1.        , 1.        ])

In [66]:
len(np.unique(similarities))

8

In [84]:
np.where('Kimi no Na wa.'==df2.index)[0][0] ## positional row of this title in df2.index, i.e. its row in the similarities matrix built from df2

5412

In [92]:
# Rank every title by its similarity to row 5 of the matrix, highest first,
# then drop the first-ranked entry and keep the next five (position, score) pairs.
# NOTE(review): row 5 is chosen by position and is hard-coded; if the intent was
# the title looked up in the previous cell, use that position instead - confirm.
similar_movie=sorted(
    enumerate(similarities[5]),
    key=lambda pair:pair[1],
    reverse=True,
)[1:6]
similar_movie

[(6511, 1.0), (6512, 1.0), (6513, 1.0), (0, 0.0), (1, 0.0)]

In [94]:
df2.index

Index(['&quot;0&quot;',
       '&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu',
       '&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi',
       '&quot;Bungaku Shoujo&quot; Memoire',
       '&quot;Bungaku Shoujo&quot; Movie', '&quot;Eiji&quot;',
       '&quot;Eiyuu&quot; Kaitai', '.hack//G.U. Returner',
       '.hack//G.U. Trilogy', '.hack//G.U. Trilogy: Parody Mode',
       ...
       's.CRY.ed', 'vivi', 'xxxHOLiC', 'xxxHOLiC Kei',
       'xxxHOLiC Movie: Manatsu no Yoru no Yume', 'xxxHOLiC Rou',
       'xxxHOLiC Shunmuki', 'Üks Uks', 'ēlDLIVE', '◯'],
      dtype='object', name='name', length=12292)

## Recommendation System:

# Recommendation System:

Design a function to recommend anime based on cosine similarity.
Given a target anime, recommend a list of similar anime based on cosine similarity scores.
Experiment with different threshold values for similarity scores to adjust the recommendation list size.



In [126]:
def recommend(anime_name, top_n=6, threshold=None, *, names=None, scores=None):
    """Print the titles most similar to ``anime_name`` by cosine similarity.

    Parameters
    ----------
    anime_name : str
        Title to look up; it must match an entry of ``names`` exactly.
    top_n : int, default 6
        Maximum number of recommendations to print.
    threshold : float or None, default None
        When given, only titles whose similarity score is at least this value
        are printed, so the list may be shorter than ``top_n``. ``None``
        applies no score filter.
    names : sequence of str, optional
        Titles, in the same order as the rows of ``scores``. Defaults to the
        notebook-level ``df2.index``.
    scores : 2-D array-like, optional
        Square similarity matrix. Defaults to the notebook-level
        ``similarities``.

    Returns
    -------
    None
        The recommendations (or a "not in the list" message) are printed.
    """
    # Fall back to the notebook globals only when no explicit data is supplied.
    names=np.asarray(df2.index if names is None else names,dtype=object)
    scores=similarities if scores is None else scores

    matches=np.flatnonzero(names==anime_name)
    if matches.size==0:
        print('anime is not in the list')
        return
    index=matches[0]

    row=np.asarray(scores[index],dtype=float)
    # Stable descending order: tied scores keep their original row order.
    order=np.argsort(-row,kind='stable')
    # Remove the query title by position. Slicing off the first sorted entry
    # is wrong when other titles tie with it at 1.0: the query would then be
    # listed as its own recommendation and a real neighbour dropped.
    order=order[order!=index]
    if threshold is not None:
        order=order[row[order]>=threshold]
    order=order[:top_n]

    print(f'recommended anime for {anime_name}')
    print('-'*10)
    for position in order:
        print(names[position])
            
        

In [130]:
recommend('&quot;0&quot;')

recommended anime for&quot;0&quot;
----------
2010
3-D Tengoku
4-Day Weekend
7-kakan.
8-gatsu no Symphony: Shibuya 2002-2003
A Play


In [132]:
recommend('&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu')

recommended anime for&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu
----------
12-gatsu no Uta
Aesop&#039;s World
Agigongryong Doolie (1988)
Agigongryong Doolie (Movie)
Alice Tantei Kyoku
Ama Gli Animali


## Interview Questions:

## Interview Questions:
1. Can you explain the difference between user-based and item-based collaborative filtering?
2. What is collaborative filtering, and how does it work?

In [None]:
2. What is collaborative filtering, and how does it work?
ANS: Collaborative filtering is an information retrieval method used by recommender systems. It works as follows:

Grouping Users Based on Behavior:
Collaborative filtering groups users based on their preferences and behavior.
It assumes that if User A has similar tastes to User B on one issue, they are likely to have similar opinions on other issues as well.
For example, if User A and User B both like certain movies, collaborative filtering predicts that they will have similar preferences for other movies.
Recommendations Based on Similar Users:
The system uses information from many users to make personalized predictions for a target user.
Instead of giving a generic score for each item, collaborative filtering recommends items based on how similar users have interacted with those items.
For instance, if User A and User B share similar tastes and User B liked a specific movie, the system might recommend that movie to User A.
