In [135]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics.pairwise import cosine_similarity


In [136]:
df = pd.read_csv("anime.csv")
print(df.shape)
print(df.info())
df.head()

(12294, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


# 1. Data Processing

In [137]:
len(df["genre"].unique())

3265

In [138]:
print(df.isnull().sum())
print(df.duplicated().sum())

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64
0


In [139]:
# 1. Handling missing values

df.fillna({
    "genre":df.genre.mode()[0],
    "type":df.type.mode()[0],
    "rating":df.rating.mean(),
}, inplace = True)

df["episodes"].replace("Unknown", 0, inplace = True)
episodes = [int(val) for val in df["episodes"]]
df["episodes"] = episodes
df["episodes"].replace(0, np.mean(episodes), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["episodes"].replace("Unknown", 0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["episodes"].replace(0, np.mean(episodes), inplace = True)


In [140]:
print(df.isnull().sum())

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


In [141]:
# 2. Exploring the data
df.groupby("type")["rating"].agg(["mean","median","sum","min","max"])


Unnamed: 0_level_0,mean,median,sum,min,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Movie,6.321443,6.473902,14842.748986,1.92,10.0
Music,5.588996,5.625,2727.43,3.28,8.38
ONA,5.65212,5.77,3724.747312,2.58,8.26
OVA,6.375996,6.38,21110.921444,2.0,9.25
Special,6.523353,6.62,10933.139508,1.67,8.66
TV,6.886453,6.9,26251.160138,2.67,9.6


In [142]:
df.groupby("type")["members"].agg(["sum","min","max"])

Unnamed: 0_level_0,sum,min,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Movie,24326080,5,466254
Music,640178,24,71136
ONA,2711146,25,144898
OVA,19820109,17,305165
Special,12865078,24,160423
TV,161806449,11,1013917


In [143]:
df.groupby("type")["episodes"].agg(["sum","min","max"])

Unnamed: 0_level_0,sum,min,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Movie,2632.160403,1.0,100.0
Music,563.040101,1.0,24.0
ONA,4769.84464,1.0,84.0
OVA,8486.005043,1.0,110.0
Special,4340.200504,1.0,51.0
TV,131323.383602,2.0,1818.0


# 2. Feature Extraction

In [144]:
# 1. Encoding Categorical columns
ord_enc= OrdinalEncoder()

In [145]:
df['type'] = ord_enc.fit_transform(df[['type']])

In [146]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",0.0,1.0,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",5.0,64.0,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",5.0,51.0,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",5.0,24.0,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",5.0,51.0,9.16,151266


# 3. Recommendation System

In [147]:
df1 = df.pivot_table(index= 'name',columns='type',values='rating')
df1.fillna(0,inplace=True)
df1.head()

type,0.0,1.0,2.0,3.0,4.0,5.0
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
&quot;0&quot;,0.0,5.06,0.0,0.0,0.0,0.0
"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",5.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,0.0,0.0,0.0,7.06,0.0,0.0
&quot;Bungaku Shoujo&quot; Memoire,0.0,0.0,0.0,7.54,0.0,0.0
&quot;Bungaku Shoujo&quot; Movie,7.63,0.0,0.0,0.0,0.0,0.0


In [148]:
movies_cosine_sim = cosine_similarity(df1)
movies_cosine_sim.shape

(12292, 12292)

In [149]:
def recommend_anime_cosine_similarity(df, show_name, movies_cosine_sim, first = 10):
    if show_name in df.index:
        index= np.where(show_name==df.index)[0][0]
        similar = sorted(list(enumerate(movies_cosine_sim[index])),reverse=True,key=lambda x: x[1])[:first + 1]
        print('*'*100)
        print(f'Recommended show for [{show_name}] are;')
        print('*'*100)
        for i, show in enumerate(similar, start = 1):
            print(i, df.index[show[0]])
        print('*'*100)
    else:
        print(f'[{show_name}] show does not exist.')

In [150]:
df["name"].unique()

array(['Kimi no Na wa.', 'Fullmetal Alchemist: Brotherhood', 'Gintama°',
       ..., 'Violence Gekiga David no Hoshi',
       'Violence Gekiga Shin David no Hoshi: Inma Densetsu',
       'Yasuji no Pornorama: Yacchimae!!'], shape=(12292,), dtype=object)

In [151]:
recommend_anime_cosine_similarity(df1, "Kimi no Na wa.", movies_cosine_sim, 20)

****************************************************************************************************
Recommended show for [Kimi no Na wa.] are;
****************************************************************************************************
1 &quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu
2 &quot;Bungaku Shoujo&quot; Movie
3 .hack//G.U. Trilogy
4 .hack//The Movie: Sekai no Mukou ni
5 009 Re:Cyborg
6 00:08
7 1000-nen Joou: Queen Millennia
8 1001 Nights
9 11-nin Iru!
10 12-gatsu no Uta
11 15 Sonyeon Uju Pyoryugi
12 21 Emon Uchuu e Irasshai!
13 21 Emon Uchuu ike! Hadashi no Princess
14 3 Choume no Tama: Onegai! Momo-chan wo Sagashite!!
15 3-tsu no Kumo
16 4.Eyes
17 663114
18 77Danui Bimil
19 8-gatsu no Symphony: Shibuya 2002-2003
20 84 Taekwon V
21 A.LI.CE
****************************************************************************************************


In [None]:
'''
1) Difference between user-based and item-based collaborative filtering
- User Based Collaborative Filtering
With user based collaborative filtering, similar kind of users are grouped based on the interests of their items, usually interpreted by their reviews and comments on the items. With the reviews, users having similar tastes are grouped. The data is transformed in such a way where users are listed in the index column, and the items are usually in the form of multiple columns, and the corresponding cell for a user's item, is where the rating is put. This makes it easy to interpret the data for calculating the similarities with different criterions.


- Item Based Collaborative Filtering 
With Item based collaborative filtering, items having similar attributes are grouped, like reviews, price comments, etc., Here the data is transformed in such a way where items are listed in the index column, and the users are usually in the form of multiple columns, and the corresponding cell for a item and it's buyer, is where the rating/ price/ comment is found. 


2) COllaborative Filtering
Collaborative filtering is a recommendation technique that suggests items based on user behavior patterns, not item content.  
It works by finding similar users or similar items using past interactions like ratings, clicks, or purchases.  
If users with similar preferences liked an item, it is recommended to others in that group.  
There are two main types: user-based and item-based collaborative filtering.  
It improves automatically as more user interaction data becomes available.
'''