In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

import re 
#for renaming the titles

from sklearn.feature_extraction.text import TfidfVectorizer
#for converting text into token counts

from sklearn.metrics.pairwise import cosine_similarity
#for calculating the similarity score

In [2]:
anime_data = pd.read_csv('anime.csv')
anime_data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
anime_data.shape

(12294, 7)

In [4]:
anime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
anime_rating = pd.read_csv('rating.csv')
anime_rating.head(10)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
5,1,355,-1
6,1,356,-1
7,1,442,-1
8,1,487,-1
9,1,846,-1


In [6]:
anime_rating.shape

(7813737, 3)

In [7]:
anime_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


**modify**

In [8]:
anime_fulldata = pd.merge(anime_data,anime_rating,on = 'anime_id')
anime_fulldata = anime_fulldata.rename(columns={'name': 'title','rating_x':'avg_rating','rating_y':'user_rating'})
anime_fulldata.head()

Unnamed: 0,anime_id,title,genre,type,episodes,avg_rating,members,user_id,user_rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,99,5
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,152,10
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,244,10
3,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,271,10
4,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,278,-1


In [9]:
#renaming title
def text_cleaning(text):
    text = re.sub(r'&quot;', '', text)
    text = re.sub(r'.hack//', '', text)
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    text = re.sub(r'&amp;', 'and', text)
    text = re.sub(r'[;!!:-]','',text)
    text = re.sub(r'°','', text)
    
    return text

anime_fulldata['title'] = anime_fulldata['title'].apply(text_cleaning)

In [10]:
anime_fulldata['title'].unique()

array(['Kimi no Na wa.', 'Fullmetal Alchemist Brotherhood', 'Gintama',
       ..., 'Violence Gekiga David no Hoshi',
       'Violence Gekiga Shin David no Hoshi Inma Densetsu',
       'Yasuji no Pornorama Yacchimae'], dtype=object)

In [11]:
#dropping anime rating < 5
anime_fuldata = anime_fulldata.drop(anime_fulldata[(anime_fulldata['user_rating'] < 5)].index,inplace=True)

In [12]:
anime_fulldata.shape

(6151696, 9)

In [13]:
anime_fulldata['episodes'].unique()

array(['1', '64', '51', '24', '10', '148', '110', '13', '201', '25', '22',
       '75', '4', '26', '12', '27', '43', '74', '37', '2', '11', '99',
       '39', '101', '47', '50', '62', '33', '112', '23', '3', '94', '8',
       '14', '7', '40', '15', '203', '77', '291', '6', '102', '96', '38',
       '79', '175', '103', '70', '153', '45', '5', '21', '63', '52', '28',
       '145', '36', '69', '60', '178', '114', '35', '61', '34', '109',
       '20', '9', '49', '366', '97', '48', '78', '358', '155', '104',
       '113', '54', '167', '161', '42', '142', '31', '373', '220', '46',
       '195', '17', '1787', '73', '147', '127', '16', '19', '98', '150',
       '76', '53', '124', '29', '115', '224', '44', '58', '93', '154',
       '92', '67', '172', '86', '30', '276', '59', '72', '330', '41',
       '105', '128', '137', '56', '55', '65', '243', '193', '18', '191',
       '180', '91', '192', '66', '182', '32', '164', '100', '296', '694',
       '95', '68', '117', '151', '130', '87', '170', '119

In [14]:
anime_fulldata['episodes'].replace({'unknown':np.nan},inplace=True)

In [15]:
anime_fulldata.isnull().sum()

anime_id        0
title           0
genre          41
type            4
episodes        0
avg_rating      5
members         0
user_id         0
user_rating     0
dtype: int64

In [16]:
anime_fulldata.dropna(inplace=True)
anime_fulldata.isnull().sum()

anime_id       0
title          0
genre          0
type           0
episodes       0
avg_rating     0
members        0
user_id        0
user_rating    0
dtype: int64

In [17]:
duplicate_data = anime_fulldata.duplicated(subset=['anime_id','user_id']).sum()
print('There are {} duplicated rows in the data'.format(duplicate_data))

There are 7 duplicated rows in the data


In [22]:
anime_fulldata.drop_duplicates(subset=['anime_id','user_id'],inplace =True)


In [23]:
duplicate_data = anime_fulldata.duplicated(subset=['anime_id','user_id']).sum()
print('There are {} duplicated rows in the data'.format(duplicate_data))

There are 0 duplicated rows in the data


In [24]:
anime_fulldata.shape

(6151643, 9)

In [20]:
anime_fulldata.reset_index(drop = True,inplace=True)
anime_fulldata.tail()

Unnamed: 0,anime_id,title,genre,type,episodes,avg_rating,members,user_id,user_rating
6151638,10368,Teleclub no Himitsu,Hentai,OVA,2,4.67,148,65836,5
6151639,9352,Tenshi no Habataki Jun,Hentai,OVA,1,4.33,201,53698,6
6151640,9352,Tenshi no Habataki Jun,Hentai,OVA,1,4.33,201,60365,7
6151641,9316,Toushindai My Lover Minami tai MechaMinami,Hentai,OVA,1,4.15,211,20171,7
6151642,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219,49503,6


**statistics**

In [21]:
pd.options.display.float_format = '{:.2f}'.format #to print the number as a whole instead of having +/-(value) at the end
anime_fulldata.describe()

Unnamed: 0,anime_id,avg_rating,members,user_id,user_rating
count,6151643.0,6151643.0,6151643.0,6151643.0,6151643.0
mean,8911.27,7.7,186684.38,36766.9,7.95
std,8885.54,0.65,191628.9,21016.96,1.37
min,1.0,2.0,37.0,1.0,5.0
25%,1238.0,7.32,47882.0,19001.0,7.0
50%,6213.0,7.71,120351.0,36852.0,8.0
75%,14131.0,8.17,258914.0,54889.0,9.0
max,34475.0,9.37,1013917.0,73516.0,10.0


In [25]:
anime_fulldata['anime_id'] = anime_fulldata['anime_id'].astype(object)
anime_fulldata['user_id'] = anime_fulldata['user_id'].astype(object)
anime_fulldata.describe()

Unnamed: 0,avg_rating,members,user_rating
count,6151643.0,6151643.0,6151643.0
mean,7.7,186684.38,7.95
std,0.65,191628.9,1.37
min,2.0,37.0,5.0
25%,7.32,47882.0,7.0
50%,7.71,120351.0,8.0
75%,8.17,258914.0,9.0
max,9.37,1013917.0,10.0


In [None]:
#pd.plotting.scatter_matrix(anime_fulldata,figsize=(10,13));

**visualization**

**cosine similarity**

In [27]:
#define count vectorizer
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), 
                        min_df=0,stop_words='english')

#replace nan values with ""
anime_fulldata['genre'] = anime_fulldata['genre'].str.split(',').astype(str)

tfidf_matrix = tfidf.fit_transform(anime_fulldata['genre'])

tfidf_matrix.shape

KeyboardInterrupt: 

In [None]:
#computing cosine similarity matrix
cos_similar = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_similar.shape

In [None]:
#getting the indices of anime title
indices = pd.Series(anime_fulldata.index,index=anime_fulldata['title']).drop_duplicates()
indices[:10]

In [None]:
def anime_rec(title,cos_similar = cos_similar):
    #get the index of anime that matches the title
    anime_idx = indices[title]
    
    #put the pairwise similarity score in a list
    similar_score = list(enumerate(cos_similar[anime_idx]))
    
    #sort based on similar scores
    similar_score = sorted(similar_score, key=lambda x: x[1],reverse=True)
    
    #get the top 10 similar scores
    similar_score = similar_score[1:11]
    
    #get anime indices
    anime_indices = [i[0] for i in similar_score]
    
    #return top 10 similarity scores
    return pd.DataFrame({'Anime title': anime_fulldata['title'].iloc[anime_indices],
                                 'Rating': anime_fulldata['avg_rating'].iloc[anime_indices]})

#(anime_data['title'].iloc[anime_indices])

In [None]:
anime_rec('haikyuu')

In [None]:
anime_rec('gintama')

In [None]:
anime_rec('toradora')