# Collaborative Filtering

In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
import re
import time
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

start_time = time.time()

#### Read the anime dataset and rating dataset

In [2]:
# Load the anime catalogue and the per-user ratings; both CSV files are
# expected in the notebook's working directory.
anime_df = pd.read_csv("anime.csv")
rating_df = pd.read_csv("rating.csv")

# Print (rows, columns) of each frame as a quick sanity check on the load.
print("Full anime dataset shape is ", anime_df.shape)
print("Full rating dataset shape is ", rating_df.shape)

Full anime dataset shape is  (12294, 7)
Full rating dataset shape is  (7813737, 3)


#### Function to clean the text by removing specific patterns

In [3]:
def text_cleaning(text):
    """
    Strip HTML-entity residue and the ".hack//" prefix from an anime title.

    Rules are applied in order, and the order matters: the specific
    apostrophe rules ("A&#039;s", "I&#039;") must run before the generic
    "&#039;" removal, otherwise they can never match.

    Parameters:
    text (str): Raw title. Non-string values (e.g. NaN) are returned unchanged.

    Returns:
    str: The cleaned title.
    """
    # Missing values in a pandas column arrive as float NaN; pass them through
    # instead of raising a TypeError.
    if not isinstance(text, str):
        return text

    # Every pattern is a literal, so plain str.replace is used. As a regex,
    # '.hack//' would also swallow any single character preceding "hack//".
    replacements = (
        ('&quot;', ''),
        ('.hack//', ''),
        ('A&#039;s', ''),      # specific apostrophe rules first ...
        ('I&#039;', 'I\''),
        ('&#039;', ''),        # ... then drop any remaining encoded apostrophes
        ('&amp;', 'and'),
    )
    for pattern, substitute in replacements:
        text = text.replace(pattern, substitute)

    return text

In [4]:
anime_df.head(3)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262


In [5]:
rating_df.head(3)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1


In [6]:
anime_df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [7]:
rating_df.isnull().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [8]:
len(rating_df['user_id'].unique()) 

73515

#### Drop unnecessary columns from the anime dataset

In [9]:
# Keep only the columns used later in the notebook. errors='ignore' makes this
# cell safe to re-run: once the columns are gone, a second execution is a
# no-op instead of raising KeyError.
anime_df = anime_df.drop(columns=['members', 'type', 'episodes'], errors='ignore')
anime_df.head()

Unnamed: 0,anime_id,name,genre,rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.37
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",9.25
3,9253,Steins;Gate,"Sci-Fi, Thriller",9.17
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",9.16


#### Apply text cleaning to the 'name' column in the anime dataset

In [10]:
# Run every title through text_cleaning, element by element.
anime_df['name'] = anime_df['name'].map(text_cleaning)

In [11]:
# Join the ratings onto the anime catalogue by anime_id. Both frames have a
# 'rating' column: the catalogue's keeps its name, the per-user one is
# suffixed to 'rating_user'.
merged_data = anime_df.merge(rating_df, on='anime_id', suffixes=('', '_user'))

# Count the missing values per column after the join.
merged_data.isna().sum()

anime_id         0
name             0
genre          110
rating           6
user_id          0
rating_user      0
dtype: int64

In [12]:
merged_data.head(3)

Unnamed: 0,anime_id,name,genre,rating,user_id,rating_user
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.37,99,5
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.37,152,10
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.37,244,10


In [13]:
# Turn the -1 placeholder into NaN so it is treated as a missing rating rather
# than a real score. The result is assigned back to the column: calling
# replace(..., inplace=True) on a selected column mutates an intermediate
# object and is not guaranteed to update merged_data (it is a no-op under
# pandas copy-on-write).
merged_data["rating_user"] = merged_data["rating_user"].replace({-1: np.nan})
merged_data.head(3)

Unnamed: 0,anime_id,name,genre,rating,user_id,rating_user
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.37,99,5.0
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.37,152,10.0
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.37,244,10.0


#### Build the user × anime rating matrix (note: no filter on users with fewer than 10 ratings is applied in the cells below)

In [14]:
len(merged_data.index)

7813727

In [15]:
# One row per user, one column per title, cells holding the mean of that
# user's ratings for the title. User/title pairs with no rating become 0.
anime_pivot = (
    merged_data
    .pivot_table(index='user_id', columns='name', values='rating_user', aggfunc='mean')
    .fillna(0)
)
anime_pivot.head()

name,0,001,009 Re:Cyborg,009-1,009-1: RandB,00:08,07-Ghost,1+2=Paradise,100%,100-man-nen Chikyuu no Tabi: Bander Book,...,lilac (bombs Jun Togawa),makemagic,s.CRY.ed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
anime_pivot.shape

(69600, 9921)

#### Create a sparse matrix representation of the anime pivot table

In [17]:
# Compressed-sparse-row copy of the pivot table; row and column positions
# line up with anime_pivot.index and anime_pivot.columns.
user_matrix = csr_matrix(anime_pivot.to_numpy())

In [18]:
user_matrix.shape

(69600, 9921)

#### Function to find animes based on the user similarity

In [19]:
def suggest_new_anime(user_id, threshold=0.60, top_n=5):
    """
    Suggests anime a user has not rated, based on what similar users rated.

    Users whose rating vector has a cosine similarity above `threshold` with
    the target user are treated as neighbours. Every title the target user
    has no rating for (0 in `anime_pivot`) and at least one neighbour rated
    is a candidate. Candidates are returned in `anime_df` row order; they are
    NOT ranked by similarity or by the neighbours' ratings.

    Relies on the notebook-level objects `anime_pivot`, `user_matrix`
    (whose rows/columns must line up with `anime_pivot`) and `anime_df`.

    Parameters:
    user_id (int): The ID of the user for whom recommendations are generated.
    threshold (float): Minimum (exclusive) cosine similarity for another user
        to count as a neighbour. Defaults to 0.60.
    top_n (int): Maximum number of recommendations to return. Defaults to 5.

    Returns:
    pandas.DataFrame or None: Up to `top_n` rows with the columns
    'Anime Title', 'Genre' and 'Rating' (possibly empty). If `user_id` is not
    in `anime_pivot`, a message is printed and None is returned.
    """
    try:
        user_index = anime_pivot.index.get_loc(user_id)
    except KeyError:
        print("User not found. Please try again.")
        return

    user_row = user_matrix[user_index]

    # Similarity of the target user to every user, as a flat array indexed by
    # row position in user_matrix.
    similarities = cosine_similarity(user_row, user_matrix)[0]

    # Neighbours = everyone above the threshold except the user themselves.
    # Only membership is needed below, so the scores are not sorted.
    neighbour_mask = similarities > threshold
    neighbour_mask[user_index] = False
    neighbour_positions = np.flatnonzero(neighbour_mask)

    # Work on the sparse matrix so no dense neighbour rows are materialised.
    user_ratings = user_row.toarray().ravel()
    if neighbour_positions.size:
        neighbour_totals = np.asarray(
            user_matrix[neighbour_positions].sum(axis=0)
        ).ravel()
    else:
        neighbour_totals = np.zeros(user_matrix.shape[1])

    # A title qualifies when the user has no rating for it and at least one
    # neighbour does.
    candidate_mask = (user_ratings == 0) & (neighbour_totals > 0)
    unseen_anime_names = anime_pivot.columns[candidate_mask]

    recommendations = anime_df.loc[
        anime_df['name'].isin(unseen_anime_names), ['name', 'genre', 'rating']
    ].rename(columns={'name': 'Anime Title', 'genre': 'Genre', 'rating': 'Rating'})

    return recommendations.head(top_n)

In [20]:
cos_df=suggest_new_anime(1)
cos_df

Unnamed: 0,Anime Title,Genre,Rating
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26
3,Steins;Gate,"Sci-Fi, Thriller",9.17
30,Great Teacher Onizuka,"Comedy, Drama, School, Shounen, Slice of Life",8.77
141,Nanatsu no Taizai,"Action, Adventure, Ecchi, Fantasy, Shounen, Su...",8.42
159,Angel Beats!,"Action, Comedy, Drama, School, Supernatural",8.39


In [21]:
end_time = time.time()

In [22]:
total = end_time - start_time
print("Total execution time is {:.2f} seconds".format(total))

Total execution time is 65.37 seconds
