## reference
https://www.kaggle.com/ajmichelutti/collaborative-filtering-on-anime-data

In [1]:
import pandas as pd 
import numpy as np 
import scipy as sp 
from sklearn.metrics.pairwise import cosine_similarity 
import operator

In [2]:
# Read the per-user ratings table and the anime catalogue from CSV.
rating = pd.read_csv('../machine_learning/data/rating.csv')
anime = pd.read_csv('../machine_learning/data/anime.csv')

In [3]:
# Ratings of -1 are converted to NaN (presumably the dataset's "not rated"
# marker -- confirm against the dataset description).
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the `rating['rating']` selection: with pandas
# Copy-on-Write an in-place call on that intermediate object never reaches
# `rating` itself, and pandas 2.x already warns about the chained form.
rating['rating'] = rating['rating'].replace({-1: np.nan})
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [4]:
# List the distinct values found in the catalogue's `type` column.
anime.loc[:, 'type'].unique()

array(['Movie', 'TV', 'OVA', 'Special', 'Music', 'ONA', nan], dtype=object)

In [5]:
# Keep only the catalogue rows whose type is exactly 'TV'.
is_tv = anime['type'].eq('TV')
anime_tv = anime.loc[is_tv]
anime_tv.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351


In [6]:
# Join every rating row with its catalogue entry (default inner join on
# anime_id, so ratings for titles missing from `anime_tv` are dropped).
# Both frames carry a `rating` column: the user's score gets the '_user'
# suffix and the catalogue's average keeps its name.  `suffixes` must be an
# ordered (left, right) tuple -- a set has no defined order, so the suffix
# could land on the wrong side, and pandas 2.x rejects a set outright.
merged = (
    rating
    .merge(anime_tv, on='anime_id', suffixes=('_user', ''))
    .rename(columns={'rating_user': 'user_rating'})
)
merged.head()

Unnamed: 0,user_id,anime_id,user_rating,name,genre,type,episodes,rating,members
0,1,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
1,3,20,8.0,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
2,5,20,6.0,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
3,6,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
4,10,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297


In [7]:
# Only the user, the title and the user's score are needed from here on.
merged = merged.loc[:, ['user_id', 'name', 'user_rating']]
# Restrict to user ids up to 10000 to keep the pivot table small.
within_user_cap = merged['user_id'].le(10000)
merged_sub = merged.loc[within_user_cap]
merged_sub.head()

Unnamed: 0,user_id,name,user_rating
0,1,Naruto,
1,3,Naruto,8.0
2,5,Naruto,6.0
3,6,Naruto,
4,10,Naruto,


In [8]:
# Build the user x title matrix: one row per user_id, one column per anime
# name, cells holding user_rating (pivot_table averages duplicates by default).
piv = merged_sub.pivot_table(
    values='user_rating',
    index=['user_id'],
    columns=['name'],
)
print(piv.shape)
piv.head()

(9387, 2708)


name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,2.0,
7,,,,,,,,,,,...,,,,,,,,,,


In [11]:
def _center_and_scale(user_row):
    """Mean-centre one user's ratings and divide by that user's rating range."""
    spread = np.max(user_row) - np.min(user_row)
    return (user_row - np.mean(user_row)) / spread


# Normalise every user's ratings (the rows of `piv`).  A user whose ratings
# all share one value has a zero range, so that row becomes NaN; those NaNs,
# like the unrated cells, are then replaced by 0.  Finally transpose so that
# titles are rows and users are columns.
piv_norm = piv.apply(_center_and_scale, axis=1).fillna(0).T
# Discard users whose column is zero everywhere.
has_signal = (piv_norm != 0).any(axis=0)
piv_norm = piv_norm.loc[:, has_signal]
piv_norm.head()

user_id,3,5,7,8,10,11,12,14,16,17,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
009-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.519231,0.0,0.0,0.0


In [9]:
# Most entries of the normalised matrix are 0, so store it as a CSR matrix
# (rows = titles, columns = users, as in `piv_norm`).
normalised_values = piv_norm.values
piv_sparse = sp.sparse.csr_matrix(normalised_values)

In [10]:
# Cosine similarity between the rows of the matrix passed in: the transposed
# matrix compares users, the matrix as built compares titles.
user_similarity = cosine_similarity(piv_sparse.T)
item_similarity = cosine_similarity(piv_sparse)

In [11]:
# Label the raw similarity arrays: titles are the rows of `piv_norm`, users
# are its columns.
anime_names = piv_norm.index
user_ids = piv_norm.columns
item_sim_df = pd.DataFrame(item_similarity, index=anime_names, columns=anime_names)
user_sim_df = pd.DataFrame(user_similarity, index=user_ids, columns=user_ids)

In [13]:
# Preview the first five rows of the title-to-title similarity table.
item_sim_df.head(5)

name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,1.0,0.483043,0.436756,0.035721,0.070056,0.122621,0.004887,0.0,0.05161,-0.025314,...,0.065054,0.0,-0.113007,-0.108576,0.0,0.0,0.065426,0.132925,-0.135748,-0.115742
.hack//Sign,0.483043,1.0,0.441474,0.035239,0.089239,0.093153,0.003677,0.0,0.059214,-0.023532,...,0.076632,0.0,-0.111885,-0.122537,0.011767,0.010797,0.048033,0.155897,-0.137902,-0.118404
.hack//Tasogare no Udewa Densetsu,0.436756,0.441474,1.0,0.064496,0.057389,0.092265,0.005965,0.0,0.043266,-0.021194,...,0.030686,0.0,-0.119785,-0.129481,0.0,0.0,0.070611,0.104973,-0.133673,-0.101508
009-1,0.035721,0.035239,0.064496,1.0,0.012034,0.061638,0.011157,0.0,0.002477,-0.023557,...,0.00897,0.0,-0.030307,-0.031157,0.0,0.0,0.013543,0.012651,-0.08285,-0.05898
07-Ghost,0.070056,0.089239,0.057389,0.012034,1.0,0.164189,0.009277,0.000548,0.044063,-0.025039,...,0.009376,0.0,-0.084299,-0.080929,-0.000472,-0.000355,0.021332,0.051022,-0.097623,-0.111422


In [12]:
def top_animes(anime_name, n=10, sim_df=None):
    """Print the titles most similar to ``anime_name``.

    Parameters
    ----------
    anime_name : str
        Title to look up; must be a column label of the similarity table.
    n : int, default 10
        Maximum number of similar titles to print.
    sim_df : pandas.DataFrame, optional
        Square title-by-title similarity table.  When omitted, the
        notebook-level ``item_sim_df`` is looked up at call time.

    Raises
    ------
    KeyError
        If ``anime_name`` is not a column of the similarity table.
    """
    if sim_df is None:
        sim_df = item_sim_df
    if anime_name not in sim_df.columns:
        raise KeyError('Unknown anime title: {!r}'.format(anime_name))

    # Remove the queried title by label rather than skipping the first sorted
    # row: if another title ties with it at the top similarity, positional
    # skipping may drop the wrong entry and list the query as its own match.
    scores = sim_df[anime_name].drop(labels=anime_name, errors='ignore')
    closest = scores.sort_values(ascending=False).head(n)

    print('Similar shows to {} include:\n'.format(anime_name))
    for count, item in enumerate(closest.index, start=1):
        print('No. {}: {}'.format(count, item))

## pickle

In [22]:
import pickle 

In [None]:
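# NOTE: pickle stores a function by its qualified name only, not its code, so
# loading this file requires `top_animes` (and the `item_sim_df` it reads) to
# be defined in the loading process.
# with open('./top_animes.pickle', 'wb') as f:
#     pickle.dump(top_animes, f)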

In [None]:
# with open('./item_sim_df.pickle', 'wb') as f:
#     pickle.dump(item_sim_df, f)

In [20]:
# One-column table of every title that survived into the similarity matrix.
title_index = piv_norm.index
anime_title_df = pd.DataFrame(data=title_index)
anime_title_df.head()

Unnamed: 0,name
0,.hack//Roots
1,.hack//Sign
2,.hack//Tasogare no Udewa Densetsu
3,009-1
4,07-Ghost


In [31]:
# Look up every known title containing the text 'Naruto'.
all_titles = anime_title_df['name']
all_titles[all_titles.str.contains('Naruto')].tolist()

['Naruto', 'Naruto SD: Rock Lee no Seishun Full-Power Ninden']

In [23]:
# with open('./anime_title_df.pickle', 'wb') as f:
#     pickle.dump(anime_title_df, f)