In [1]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sb

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
ratings = pd.read_csv("./archive/rating_complete.csv")
# ratings_reduced = ratings.sample(frac=0.01)
ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


In [3]:
anime_df = pd.read_csv("./archive/anime.csv")
anime_df["anime_id"] = anime_df["MAL_ID"]
anime_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,anime_id
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,1
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,5
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,6
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0,7
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0,8


In [4]:
df = pd.merge(ratings, anime_df[['anime_id','Name', 'Genres']], on="anime_id", how="inner")
# df = ratings_reduced.merge(anime_df[['Name', 'Genres']])
df

Unnamed: 0,user_id,anime_id,rating,Name,Genres
0,0,430,9,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
1,6,430,8,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
2,18,430,10,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
3,19,430,8,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
4,33,430,4,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
...,...,...,...,...,...
57633273,315549,38853,1,Ex-Arm,"Action, Sci-Fi, Ecchi, Seinen"
57633274,350024,38853,10,Ex-Arm,"Action, Sci-Fi, Ecchi, Seinen"
57633275,311855,39435,6,Oshiri Tantei Movie 1: Curry Naru Jiken,"Mystery, Comedy, Kids, Fantasy"
57633276,334473,35746,4,6 Lovers,"Comedy, Drama, Romance, School, Shounen Ai, Sl..."


In [5]:
# only users with > 1000 ratings
df_reduced = df.groupby("user_id").filter(lambda x: len(x) > 1000)
df_reduced
# df_reduced.sort_values(by="user_id")

Unnamed: 0,user_id,anime_id,rating,Name,Genres
42,326,430,9,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
61,446,430,7,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
121,853,430,8,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
124,890,430,6,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
133,943,430,7,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
...,...,...,...,...,...
57633267,283786,45448,6,Dou Hun Wei Zhi Xuan Yue Qiyuan IV,"Action, Fantasy"
57633268,283786,42465,6,Guan Hai Ce Zhi Tie Qi Xiong Guan,"Action, Military, Historical, Martial Arts, Fa..."
57633269,289506,41277,4,Rainbow Ruby,"Kids, Fantasy"
57633271,308285,40389,8,Uchuu Senkan Yamato 2205: Aratanaru Tabidachi,"Action, Military, Sci-Fi, Space, Drama"


In [6]:
print(df_reduced["user_id"].value_counts())
print(df_reduced["Name"].value_counts())

189037    15455
162615    14864
68042     13462
283786    12778
259790     9996
          ...  
306112     1001
106903     1001
157929     1001
247568     1001
198099     1001
Name: user_id, Length: 4447, dtype: int64
Angel Beats!                                               4078
Sword Art Online                                           4028
Shingeki no Kyojin                                         4011
No Game No Life                                            3937
Toradora!                                                  3921
                                                           ... 
Jian Wang 3: Xia Gan Yi Dan Shen Jianxin Zhi Chang Piao       1
Zetsumetsu Kigu-shun.                                         1
Kaiketsu Zorori no Kyoufu no Hanayome Sakusen                 1
Bite-Choicar                                                  1
Nan-chan no Koutsuu Anzen                                     1
Name: Name, Length: 16827, dtype: int64


In [7]:
# only animes with > 1000 ratings
df_reduced = df_reduced.groupby("Name").filter(lambda x: len(x) > 100)
df_reduced

Unnamed: 0,user_id,anime_id,rating,Name,Genres
42,326,430,9,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
61,446,430,7,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
121,853,430,8,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
124,890,430,6,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
133,943,430,7,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
...,...,...,...,...,...
57589782,351361,3175,5,Kaitei Choutokkyuu: Marine Express,"Action, Adventure, Comedy, Sci-Fi, Drama"
57589783,351801,3175,6,Kaitei Choutokkyuu: Marine Express,"Action, Adventure, Comedy, Sci-Fi, Drama"
57589784,352832,3175,10,Kaitei Choutokkyuu: Marine Express,"Action, Adventure, Comedy, Sci-Fi, Drama"
57589785,352922,3175,2,Kaitei Choutokkyuu: Marine Express,"Action, Adventure, Comedy, Sci-Fi, Drama"


In [8]:
print(df_reduced["user_id"].value_counts())
print(df_reduced["Name"].value_counts())

189037    8767
162615    8400
68042     8301
291207    7774
283786    7616
          ... 
63105      809
289648     809
83662      806
66724      803
345858     559
Name: user_id, Length: 4447, dtype: int64
Angel Beats!                                                        4078
Sword Art Online                                                    4028
Shingeki no Kyojin                                                  4011
No Game No Life                                                     3937
Toradora!                                                           3921
                                                                    ... 
Hakubutsushi                                                         101
Sayokyoku                                                            101
Flutter of Birds II: Tenshi-tachi no Tsubasa                         101
Kujira (1952)                                                        101
Tottoko Hamtarou Movie 2: Ham-Ham Hamuuja! Maboroshi no Princes

In [9]:
df_reduced = df_reduced.sample(frac=0.1)
df_reduced

Unnamed: 0,user_id,anime_id,rating,Name,Genres
42298664,24795,9063,5,Toaru Kagaku no Railgun: Entenka no Satsuei Mo...,"Comedy, Ecchi"
33216260,330552,24833,10,Ansatsu Kyoushitsu,"Action, Comedy, School, Shounen"
34306085,148335,12679,10,Joshiraku,"Slice of Life, Comedy, Shounen"
46723917,189037,10810,7,Pucca 2,"Action, Comedy, Parody, Romance"
1438952,78558,40221,6,Kami no Tou,"Action, Adventure, Mystery, Drama, Fantasy"
...,...,...,...,...,...
38633146,150528,53,5,Ai Yori Aoshi,"Harem, Slice of Life, Comedy, Drama, Romance"
54512715,53730,41207,7,"Tenioha! 2: Limit Over - Mada Mada Ippai, Ecch...",Hentai
11522721,168120,10611,3,R-15,"Comedy, Ecchi, Harem, Romance, School"
45479348,308346,8740,7,One Piece Film: Strong World Episode 0,"Action, Adventure, Comedy, Fantasy, Shounen, S..."


In [10]:
# matrix = df_reduced.pivot_table(index="Name", columns="user_id", values='rating')
# matrix.head()

In [11]:
# print(matrix[353385].unique())
# # print(matrix[353387].unique())
# print(matrix[353398].unique())
# print(matrix[11].unique())

In [12]:
# Normalize matrix
# matrix_norm = matrix.subtract(matrix.mean(axis=1), axis='rows')
# matrix_norm.head()

In [13]:
# Pearson correlation
# user_sim = matrix_norm.T.corr()
# user_sim

In [14]:
# # Compute the user-item matrix
# user_item_matrix = df.pivot(index='user_id', columns='Name', values='rating')

# # Compute the Pearson correlation between users
# user_item_matrix = user_item_matrix.subtract(user_item_matrix.mean(axis=1), axis='rows')
# print(user_item_matrix)

In [15]:
user_item_matrix = df_reduced.pivot(index='user_id', columns='Name', values='rating')

# Compute the Pearson correlation between users
user_item_matrix = user_item_matrix.subtract(user_item_matrix.mean(axis=1), axis='rows')

user_corr = user_item_matrix.T.corr()

In [85]:
user_corr

user_id,326,446,478,781,853,890,912,943,985,1177,...,352660,352669,352761,352832,352835,352922,352924,352930,353304,353325
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
326,1.000000,0.174078,0.000000,-0.192961,0.419314,-0.210732,0.235702,-0.520416,0.324967,0.592014,...,0.740959,0.659912,-4.532467e-17,4.504401e-17,0.944911,-1.0,-0.674200,0.540062,-0.866025,-0.058926
446,0.174078,1.000000,0.221163,,,0.596559,,,-0.522233,1.000000,...,0.774597,0.476731,4.166667e-01,,0.405999,,0.646997,0.810093,0.688247,
478,0.000000,0.221163,1.000000,0.789076,-0.965909,-0.191462,-1.000000,0.556030,-1.000000,0.539951,...,,-0.200446,-7.071068e-01,-6.808829e-01,0.666667,1.0,-0.054554,0.285056,0.676716,0.523723
781,-0.192961,,0.789076,1.000000,0.132967,-0.600397,0.339121,-0.719569,0.575224,0.746091,...,0.975980,,4.622502e-02,-1.456929e-01,0.812158,-1.0,,0.000000,0.821995,-0.359211
853,0.419314,,-0.965909,0.132967,1.000000,0.362738,,,-0.055863,-0.345547,...,,-0.041703,9.597599e-01,7.205767e-01,-0.103807,,-0.332650,0.324937,,-1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352922,-1.000000,,1.000000,-1.000000,,0.717532,,,,,...,,0.831522,9.971765e-01,,,1.0,0.500000,,-0.500000,
352924,-0.674200,0.646997,-0.054554,,-0.332650,0.710431,-0.082199,,0.777778,0.555556,...,,0.211289,9.538210e-01,9.819805e-01,-0.648886,0.5,1.000000,0.174078,0.468521,
352930,0.540062,0.810093,0.285056,0.000000,0.324937,-1.000000,,,,0.389249,...,0.755929,0.801784,7.307981e-01,5.833333e-01,1.000000,,0.174078,1.000000,-0.707107,0.316228
353304,-0.866025,0.688247,0.676716,0.821995,,0.728705,,,-0.547723,0.944911,...,0.804030,,3.144855e-01,,0.143088,-0.5,0.468521,-0.707107,1.000000,


In [20]:
# Sort the correlations in descending order and select the top n similar users
# similar_users = user_corr[24795].sort_values(ascending=False)[1:10+1].index.tolist()

In [68]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import operator

def recommend_animes(df, user_id, n_similar_users, n_animes_to_recommend):
    
    
    # Sort the correlations in descending order and select the top n similar users
    similar_users = user_corr[user_id].sort_values(ascending=False)[1:n_similar_users+1].index.tolist()
    # print("SIMILAR  - \n", similar_users, "\n-------------------\n")
    # Compute the weighted average of the ratings for each anime using the similar users' ratings
    anime_ratings = {}
    for anime_id in user_item_matrix.columns:
        # print("ANIME - " , anime_id, "\n")
        total = 0
        count = 0

        # print(user_item_matrix.loc[user_id, anime_id])
        if user_item_matrix.loc[user_id, anime_id] == np.nan:
            anime_ratings[anime_id] = np.nan
        else:
            
            for user in similar_users:
                if( not pd.isna(user_item_matrix[anime_id][user])):
                    total += user_item_matrix[anime_id][user] * user_corr[user][user_id]
                    count +=1
            if(count != 0):
                anime_ratings[anime_id] = total/count
                          
            # print(user_item_matrix.loc[similar_users, anime_id].dropna())
            # print("----------------------------------\n")
            # print(user_corr[similar_users])
            # anime_ratings[anime_id] = np.average(user_item_matrix.loc[similar_users, anime_id].dropna(), 
                                            #  weights=user_corr[similar_users])
            # anime_ratings.append(np.average(user_item_matrix.loc[similar_users, anime_id].dropna(), 
            #                                  weights=user_corr[similar_users]))
    
    # Convert the anime_ratings list into a Pandas Series
    # anime_ratings = pd.Series(anime_ratings, index=user_item_matrix.columns)
    
    # Sort the anime ratings in descending order and select the top n animes to recommend
    top_animes = sorted(anime_ratings.items(), key=operator.itemgetter(1), reverse=True)[:n_animes_to_recommend]
    # anime_ratings.sort_values(ascending=False)[:n_animes_to_recommend]
    
    # Return the top animes with their predicted ratings
    return top_animes


In [81]:
# Load the data into a Pandas DataFrame
df = df_reduced

# Call the recommend_animes function to get the top recommended animes for a given user
user_id = 283786 # 24795
n_similar_users = 1
n_animes_to_recommend = 20
top_animes = recommend_animes(df, user_id, n_similar_users, n_animes_to_recommend)

# Print the top recommended animes with their predicted ratings
top_animes


[('Amagami SS', 3.0884955752212386),
 ('Ane Yome Quartet', 3.0884955752212386),
 ('Baku Ane 2: Otouto Ippai Shibocchau zo! The Animation', 3.0884955752212386),
 ('Kami nomi zo Shiru Sekai II', 3.0884955752212386),
 ('Mahouka Koukou no Rettousei', 3.0884955752212386),
 ('Mashiro-iro Symphony: The Color of Lovers', 3.0884955752212386),
 ('Muttsuri Dosukebe Tsuyu Gibo Shimai no Honshitsu Minuite Sex Sanmai',
  3.0884955752212386),
 ('Sweet Home: H na Oneesan wa Suki Desu ka?', 3.0884955752212386),
 ('Chichiiro Toiki', 2.0884955752212386),
 ('Chrome Shelled Regios', 2.0884955752212386),
 ('Cool Devices', 2.0884955752212386),
 ('Inma Youjo', 2.0884955752212386),
 ('Jewelry The Animation', 2.0884955752212386),
 ('Koiito Kinenbi The Animation', 2.0884955752212386),
 ('Kyonyuu Princess Saimin', 2.0884955752212386),
 ('Oni Chichi: Re-birth', 2.0884955752212386),
 ('Residence', 2.0884955752212386),
 ('Serial Experiments Lain', 2.0884955752212386),
 ('Ane Koi: Suki Kirai Daisuki.', 1.088495575221

In [84]:
user_item_matrix["Amagami SS"].value_counts()

 1.119658    2
 0.192771    1
 2.980769    1
 0.481481    1
-0.527132    1
            ..
 1.317518    1
 0.367470    1
-0.897436    1
-1.321101    1
 0.823009    1
Name: Amagami SS, Length: 251, dtype: int64