In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

#### The Kaggle dataset : https://www.kaggle.com/datasets/CooperUnion/anime-recommendations-database?resource=download

In [2]:
anime = pd.read_csv("anime.csv")
rating = pd.read_csv("rating.csv")

In [3]:
from ydata_profiling import ProfileReport
pdreport=anime.profile_report()
pdreport.to_file("Pandas_profiling_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
anime.shape

(12294, 7)

In [6]:
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [7]:
rating.shape

(7813737, 3)

### Data Cleaning

In [8]:
#-1 rating means, the user watched but did not leave a review
rating = rating[rating.rating != -1]
rating.head(10)

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
153,2,11771,10
156,3,20,8
157,3,154,6
158,3,170,9
159,3,199,10
160,3,225,9


In [9]:
rating.shape

(6337241, 3)

In [10]:
anime.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

In [11]:
#some episodes have the value "unknown", lets replace it with -1
anime["episodes"] = anime["episodes"].replace("Unknown","-1")

In [12]:
anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [13]:
anime["episodes"]= pd.to_numeric(anime["episodes"], errors='coerce')

In [14]:
anime.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes      int64
rating      float64
members       int64
dtype: object

In [15]:
anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [16]:
rating.isnull().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

 Replacing the missing rating value in anime data frame with the mean of the anime using rating dataframe

In [17]:
rating_count= rating.groupby('anime_id')['rating'].agg(['count','mean'])

In [18]:
anime.rating.fillna(rating_count.mean, inplace=True)

In [19]:
anime["rating"]=pd.to_numeric(anime["rating"], errors='coerce')

In [20]:
#dropping the rest of the null values
anime.dropna(inplace= True)

In [21]:
anime.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [22]:
anime.duplicated().sum()

0

In [23]:
rating.duplicated().sum()

1

In [24]:
#removing duplicate rows
rating.drop_duplicates(inplace=True)

In [25]:
rating.duplicated().sum()

0

In [26]:
anime.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes      int64
rating      float64
members       int64
dtype: object

In [27]:
anime["rating"]=pd.to_numeric(anime["rating"], errors='coerce')

In [28]:
anime.describe()

Unnamed: 0,anime_id,episodes,rating,members
count,12017.0,12017.0,12017.0,12017.0
mean,13638.001165,12.276858,6.478264,18348.88
std,11231.076675,46.759025,1.023857,55372.5
min,1.0,-1.0,1.67,12.0
25%,3391.0,1.0,5.89,225.0
50%,9959.0,2.0,6.57,1552.0
75%,23729.0,12.0,7.18,9588.0
max,34519.0,1818.0,10.0,1013917.0


In [29]:
anime.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

### Data Visualization

In [30]:
import matplotlib.pyplot as plt 

In [31]:
no_types = dict(anime["type"].value_counts())

In [32]:
no_types

{'TV': 3668,
 'OVA': 3284,
 'Movie': 2259,
 'Special': 1670,
 'ONA': 648,
 'Music': 488}

In [33]:
fig = plt.figure(figsize = (10, 5))
 
# creating the bar plot
plt.bar(no_types.keys(), no_types.values(), color ='maroon', 
        width = 0.4)

plt.xlabel("Different types of platform")
plt.ylabel("No. of Anime")
plt.title("Anime releases in different platform")
plt.show()

In [34]:
merged_df1 = pd.merge(anime,rating,on='anime_id')

In [35]:
merged_df1.head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating_x,members,user_id,rating_y
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,99,5
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,152,10
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,244,10
3,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,271,10
4,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,322,10
5,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,398,10
6,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,462,8
7,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,490,10
8,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,548,10
9,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,570,10


In [36]:
merged_df1 = merged_df1.rename(columns ={"rating_x":"anime_rating","rating_y":"user_rating"})

In [37]:
rating_count= rating.groupby('anime_id')['rating'].agg(['count','mean'])

In [38]:
merged_df = pd.merge(anime,rating_count,on='anime_id')

In [39]:
merged_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,count,mean
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,1961,9.426313
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,21494,9.322741
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,1188,9.449495
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,17151,9.261326
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,3115,9.272552
...,...,...,...,...,...,...,...,...,...
9887,5541,The Satisfaction,Hentai,OVA,1,4.37,166,2,1.000000
9888,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211,2,4.000000
9889,5543,Under World,Hentai,OVA,1,4.28,183,2,2.500000
9890,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219,1,6.000000


In [40]:
merged_df.rename(columns={"count":"voted_count","mean":"avg_rating"})

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,voted_count,avg_rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,1961,9.426313
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,21494,9.322741
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,1188,9.449495
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,17151,9.261326
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,3115,9.272552
...,...,...,...,...,...,...,...,...,...
9887,5541,The Satisfaction,Hentai,OVA,1,4.37,166,2,1.000000
9888,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211,2,4.000000
9889,5543,Under World,Hentai,OVA,1,4.28,183,2,2.500000
9890,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219,1,6.000000


In [41]:
anime_user_matrix = merged_df1.pivot_table(index='user_id',columns='name',values='user_rating')
anime_user_matrix.head(10)

name,&quot;0&quot;,"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,&quot;Eiji&quot;,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,...,lilac (bombs Jun Togawa),makemagic,s.CRY.ed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,2.0,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,


In [42]:
rating['num of ratings'] = pd.DataFrame(merged_df1.groupby('anime_id')['user_rating'].count())
rating.sort_values('num of ratings',ascending=False).head(10)

Unnamed: 0,user_id,anime_id,rating,num of ratings
1535,14,9724,8,34226.0
11757,150,9919,10,26310.0
16498,210,8532,10,25289.0
1575,14,16762,7,24126.0
6547,73,1535,8,23565.0
226,3,22199,8,23528.0
5114,52,20651,8,21494.0
2904,29,11209,3,21124.0
4224,43,30831,6,20837.0
8074,98,8876,9,19488.0


### using corelation

In [43]:
selected_ratings = anime_user_matrix['Gintama°']

In [44]:
selected_ratings

user_id
1       NaN
2       NaN
3       NaN
5       NaN
7       NaN
         ..
73512   NaN
73513   NaN
73514   NaN
73515   NaN
73516   NaN
Name: Gintama°, Length: 69600, dtype: float64

In [45]:
similar_to_anime = anime_user_matrix.corrwith(selected_ratings)

In [46]:
corr_selected = pd.DataFrame(similar_to_anime,columns=['Correlation'])
corr_selected.dropna(inplace=True)
corr_selected.head(10)

Unnamed: 0_level_0,Correlation
name,Unnamed: 1_level_1
&quot;0&quot;,0.628971
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,0.329194
&quot;Bungaku Shoujo&quot; Memoire,0.389977
&quot;Bungaku Shoujo&quot; Movie,0.254738
.hack//G.U. Returner,0.093956
.hack//G.U. Trilogy,0.290026
.hack//G.U. Trilogy: Parody Mode,0.27443
.hack//Gift,0.240938
.hack//Intermezzo,0.355211
.hack//Liminality,0.248222


In [47]:
unique_name = merged_df1["name"].unique()

In [48]:
len(unique_name)

9892

In [49]:
unique_name[:30]

array(['Kimi no Na wa.', 'Fullmetal Alchemist: Brotherhood', 'Gintama°',
       'Steins;Gate', 'Gintama&#039;',
       'Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou',
       'Hunter x Hunter (2011)', 'Ginga Eiyuu Densetsu',
       'Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare',
       'Gintama&#039;: Enchousen', 'Clannad: After Story',
       'Koe no Katachi', 'Gintama', 'Code Geass: Hangyaku no Lelouch R2',
       'Haikyuu!! Second Season', 'Sen to Chihiro no Kamikakushi',
       'Shigatsu wa Kimi no Uso', 'Mushishi Zoku Shou 2nd Season',
       'Ookami Kodomo no Ame to Yuki', 'Code Geass: Hangyaku no Lelouch',
       'Hajime no Ippo',
       'Rurouni Kenshin: Meiji Kenkaku Romantan - Tsuioku-hen',
       'Cowboy Bebop', 'One Punch Man', 'Mononoke Hime',
       'Suzumiya Haruhi no Shoushitsu',
       'Monogatari Series: Second Season', 'Mushishi Zoku Shou',
       'Mushishi', 'Tengen Toppa Gurren Lagann'], dtype=object)

In [59]:
def Recommendation_corr():
        anime_name = input("Enter an anime name from the List: ")
        selected_ratings = anime_user_matrix[anime_name]
        similar_to_anime = anime_user_matrix.corrwith(selected_ratings)
        corr_selected = pd.DataFrame(similar_to_anime,columns=['Correlation'])
        corr_selected.dropna(inplace=True)
        print(corr_selected.nlargest(10, ['Correlation']))

Recommendation_corr()

Enter an anime name from the List: Fullmetal Alchemist: Brotherhood
                              Correlation
name                                     
2010                                  1.0
Ai no Gakko Cuore Monogatari          1.0
Akaoni to Aooni no Tango              1.0
Andersen Monogatari (TV)              1.0
Aruite Mikka!                         1.0
Ashita Tenki ni Naare                 1.0
Awake                                 1.0
Backkom: Jung-gug Gijeogwi            1.0
Balloon                               1.0
Batsu &amp; Terry                     1.0


### Using KNN

In [51]:
#using knn
from scipy.sparse import csr_matrix
user_rating = merged_df1.drop_duplicates(['user_id', 'name'])
user_rating_pivot = user_rating.pivot(index = 'name', columns = 'user_id', values = 'user_rating').fillna(0)
user_rating_matrix = csr_matrix(user_rating_pivot.values)

In [52]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(user_rating_matrix)

In [53]:
query_index = np.random.choice(user_rating_pivot.shape[0])
print(query_index)
distances, indices = model_knn.kneighbors(user_rating_pivot.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)

5429


In [54]:
user_rating_pivot.index[query_index]

'Maro no Kanja wa Gatenkei'

In [55]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(user_rating_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Maro no Kanja wa Gatenkei:

1: Momoiro Milk, with distance of 0.37500307673373934:
2: Busou Shoujotai: Blade Briders The Animation, with distance of 0.3946279606876739:
3: Netoraserare, with distance of 0.4152916311328092:
4: Yakata: Kannou Kitan, with distance of 0.4204799521730609:
5: Aniki no Yome-san nara, Ore ni Hamerarete Hiihii Itteru Tokoro Da yo, with distance of 0.4298360021321186:


In [56]:
user_rating_pivot.index

Index(['&quot;0&quot;',
       '&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu',
       '&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi',
       '&quot;Bungaku Shoujo&quot; Memoire',
       '&quot;Bungaku Shoujo&quot; Movie', '&quot;Eiji&quot;',
       '.hack//G.U. Returner', '.hack//G.U. Trilogy',
       '.hack//G.U. Trilogy: Parody Mode', '.hack//Gift',
       ...
       'lilac (bombs Jun Togawa)', 'makemagic', 's.CRY.ed', 'vivi', 'xxxHOLiC',
       'xxxHOLiC Kei', 'xxxHOLiC Movie: Manatsu no Yoru no Yume',
       'xxxHOLiC Rou', 'xxxHOLiC Shunmuki', '◯'],
      dtype='object', name='name', length=9892)

In [57]:
def Recommendation_knn():
        anime_name = input("Enter an anime name from the List: ")
        query_index = user_rating_pivot.index == anime_name
        distances, indices = model_knn.kneighbors(user_rating_pivot.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)
        for i in range(0, len(distances.flatten())):
            if i == 0:
                print('Recommendations for {0}:\n'.format(user_rating_pivot.index[query_index]))
            else:
                print('{0}: {1}, with distance of {2}:'.format(i, user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))
                
Recommendation_knn()

Enter an anime name from the List: Fullmetal Alchemist: Brotherhood
Recommendations for Index(['Fullmetal Alchemist: Brotherhood'], dtype='object', name='name'):

1: Fullmetal Alchemist, with distance of 0.42110457922379607:
2: Death Note, with distance of 0.43496917577512295:
3: Shingeki no Kyojin, with distance of 0.4435747187042025:
4: Code Geass: Hangyaku no Lelouch, with distance of 0.4467318707575568:
5: Code Geass: Hangyaku no Lelouch R2, with distance of 0.4517881031668245:
