# Content-Based Filtering (Genre Similarity)

In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import numpy as np
import re
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import sigmoid_kernel

# Wall-clock timestamp taken right after the imports; the last cells subtract it
# from end_time to report the notebook's total execution time.
start_time = time.time()

#### Read the anime dataset and rating dataset

In [2]:
# Load the anime metadata and the user ratings from CSV files in the working directory.
anime_df = pd.read_csv("anime.csv")
rating_df = pd.read_csv("rating.csv")

# Report (rows, columns) for each frame as a quick sanity check on the load.
print("Full anime dataset shape is  {}".format(anime_df.shape))
print("Full rating dataset shape is  {}".format(rating_df.shape))

Full anime dataset shape is  (12294, 7)
Full rating dataset shape is  (7813737, 3)


#### Function to clean the text by removing specific patterns

In [3]:
def text_cleaning(text):
    """
    Removes HTML-entity residue and the '.hack//' prefix from an anime title.

    The substitutions are applied in order. The context-specific apostrophe
    rules ('A&#039;s' and 'I&#039;') run before the generic '&#039;' removal,
    because once the generic rule has stripped every '&#039;' they can no
    longer match.

    Parameters:
    text (str): The raw title to clean.

    Returns:
    str: The title with the patterns below removed or replaced.
    """
    text = re.sub(r'&quot;', '', text)
    # The dot is escaped so that only a literal ".hack//" is removed; an
    # unescaped '.' would also delete any other character followed by "hack//".
    text = re.sub(r'\.hack//', '', text)
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    # Generic fallback: drop every remaining encoded apostrophe.
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'&amp;', 'and', text)

    return text

In [4]:
# Preview the first three rows of the anime metadata to see the available columns.
anime_df.head(3)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262


In [5]:
# Preview the first three rows of the ratings table (user_id, anime_id, rating).
rating_df.head(3)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1


In [6]:
# Count missing values per column of the anime metadata.
anime_df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [7]:
# Count missing values per column of the ratings table.
rating_df.isnull().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [8]:
# Number of distinct users in the ratings table (NaN, if present, counts as a value).
rating_df['user_id'].nunique(dropna=False)

73515

#### Drop unnecessary columns from the anime dataset

In [9]:
# Drop the columns that the recommender below never reads.
# errors='ignore' keeps the cell idempotent: re-running it after the columns are
# already gone would otherwise raise a KeyError.
anime_df = anime_df.drop(columns=['members', 'type', 'episodes'], errors='ignore')
anime_df.head()

Unnamed: 0,anime_id,name,genre,rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.37
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",9.25
3,9253,Steins;Gate,"Sci-Fi, Thriller",9.17
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",9.16


#### Apply text cleaning to the 'name' column in the anime dataset

In [10]:
# Clean every title element-wise; the cleaned names are what title lookups must use.
anime_df['name'] = anime_df['name'].map(text_cleaning)

#### Create a TF-IDF vectorizer

In [11]:
# Word-level TF-IDF over 1- to 3-grams; a term must appear in at least 3 rows to be kept.
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    stop_words='english',
)

#### Fill missing genre values with empty string and convert them to string type

In [12]:
# Replace missing genres with '' so every row is a string before vectorizing.
anime_df['genre'] = anime_df['genre'].fillna('')
# Splitting on ',' and casting back to str leaves each row as the text of a Python
# list (e.g. "['Drama', ' Romance']"); the vectorizer's \w token pattern skips the
# brackets, quotes and commas, so only the genre words become tokens.
genres_str = anime_df['genre'].str.split(',').astype(str)
# Learn the vocabulary and build the TF-IDF matrix, one row per anime.
tfv_matrix = tfv.fit_transform(genres_str)

#### Compute cosine similarity between TF-IDF vectors

In [13]:
# Pairwise cosine similarity of every anime against every other anime
# (with no second argument the matrix is compared against itself).
cosine_sim = cosine_similarity(tfv_matrix)

#### Build a lookup from anime name to its row position

In [14]:
# Lookup table: anime name -> row position in anime_df.
# Series.drop_duplicates() compares the values (the row positions, which are always
# unique), so it never removes repeated names. De-duplicate on the index instead,
# keeping the first occurrence, so indices[title] always yields a single position.
indices = pd.Series(anime_df.index, index=anime_df['name'])
indices = indices[~indices.index.duplicated(keep='first')]

#### Function to find similar animes based on the input anime name

In [15]:
def find_similar_animes(title, cosine_sim=cosine_sim, top_n=5):
    """
    Finds the animes most similar to a given title based on cosine similarity scores.

    Parameters:
    title (str): The title of the anime for which similar animes are to be found. It must match a name in anime_df exactly.
    cosine_sim (numpy.ndarray, optional): The cosine similarity matrix, indexed by row position in anime_df. Defaults to cosine_sim.
    top_n (int, optional): Maximum number of similar animes to return. Defaults to 5.

    Returns:
    pandas.DataFrame: A DataFrame with up to top_n rows and the columns 'Anime name', 'Genre' and 'Rating', ordered from most to least similar. If the title is not found, the string "Given anime title not found in the dataset." is returned instead.
    """

    if title not in indices:
        return "Given anime title not found in the dataset."

    idx = indices[title]
    # When several rows share the same name the lookup returns a Series;
    # fall back to the first matching row instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending sort: rows with equal scores keep their dataset order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the queried row explicitly. Skipping "the first result" is not enough,
    # because another anime with an identical genre list can tie with it and sort first.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    similar = anime_df.iloc[ranked]
    result_df = pd.DataFrame({
        'Anime name': similar['name'].values,
        'Genre': similar['genre'].values,
        'Rating': similar['rating'].values,
    })

    return result_df

In [16]:
# Example query: the five animes whose genre profile is closest to 'Dragon Ball Z'.
find_similar_animes('Dragon Ball Z')

Unnamed: 0,Anime name,Genre,Rating
0,Dragon Ball Kai (2014),"Action, Adventure, Comedy, Fantasy, Martial Ar...",8.01
1,Dragon Ball Kai,"Action, Adventure, Comedy, Fantasy, Martial Ar...",7.95
2,Dragon Ball Z Movie 15: Fukkatsu no F,"Action, Adventure, Comedy, Fantasy, Martial Ar...",7.55
3,Dragon Ball Super,"Action, Adventure, Comedy, Fantasy, Martial Ar...",7.4
4,Dragon Ball Z: Summer Vacation Special,"Action, Adventure, Comedy, Fantasy, Martial Ar...",7.05


In [17]:
# Wall-clock timestamp at the end of the run; compared with start_time in the next cell.
end_time = time.time()

In [18]:
# Elapsed wall-clock seconds between the first and the last cell.
total = end_time - start_time
print(f"Total execution time is {total:.2f} seconds")

Total execution time is 3.16 seconds
