#Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
anime = pd.read_csv('anime.csv')
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [4]:
anime.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,230
members,0


In [5]:
anime.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [6]:
anime.fillna(anime.mean(numeric_only=True).round(1), inplace=True)

In [7]:
anime.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,0
members,0


In [8]:
anime.drop_duplicates(inplace=True)

In [9]:
anime.fillna('',inplace=True)

In [10]:
anime.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,0
type,0
episodes,0
rating,0
members,0


In [11]:
anime.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12294.0,12294.0
mean,14058.221653,6.47439,18071.34
std,11455.294701,1.017102,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.9,225.0
50%,10260.5,6.55,1550.0
75%,24794.5,7.17,9437.0
max,34527.0,10.0,1013917.0


In [13]:
from sklearn.preprocessing import MultiLabelBinarizer,StandardScaler

In [14]:
mlb = MultiLabelBinarizer()
scaler = StandardScaler()

In [15]:
anime['genre_encoded'] = mlb.fit_transform(anime['genre'].str.split(',')).tolist()
anime['rating_encoded'] = scaler.fit_transform(anime['rating'].values.reshape(-1, 1))

In [16]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,genre_encoded,rating_encoded
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.847038
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,"[0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...",2.738883
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",2.729051
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.650393
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",2.64056


#Recommendation System

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
# Function to recommend similar anime
def recommend_anime(target_anime,df,threshold=0.8):
    target_index = df[df['name'] == target_anime].index[0]
    similarity_scores = cosine_similarity(df['genre_encoded'].tolist())
    target_similarity = similarity_scores[target_index]
    recommended_indices = np.argsort(target_similarity)[::-1][1:]
    recommended_anime = []
    for idx in recommended_indices:
        if target_similarity[idx] >= threshold:
            recommended_anime.append(df.iloc[idx]['name'])
    return recommended_anime

In [20]:
print(recommend_anime('Naruto',anime))

['Naruto', 'Boruto: Naruto the Movie', 'Naruto: Shippuuden', 'Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi', 'Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!!', 'Naruto Shippuuden: Sunny Side Battle', 'Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono', 'Naruto: Shippuuden Movie 4 - The Lost Tower', 'Katekyo Hitman Reborn!', 'Kyutai Panic Adventure!', 'Battle Spirits: Ryuuko no Ken', 'Dragon Ball Super', 'Dragon Ball Z Movie 15: Fukkatsu no F', 'Medaka Box', 'Dragon Ball GT: Goku Gaiden! Yuuki no Akashi wa Suushinchuu', 'Dragon Ball Kai', 'Dragon Ball Z Movie 11: Super Senshi Gekiha!! Katsu no wa Ore da', 'Dragon Ball Kai (2014)', 'Medaka Box Abnormal', 'Dragon Ball Z: Summer Vacation Special', 'Dragon Ball Z', 'Dragon Ball Z: Atsumare! Gokuu World', 'Tenjou Tenge']


In [21]:
def evaluate_recommendations(true_anime_list,recommended_list):
    true_pos = len(set(true_anime_list) & (set(recommended_list)))
    precision = true_pos / len(recommended_list)
    recall = true_pos / len(true_anime_list)
    f1 = 2 * (precision * recall) / (precision + recall)
    return precision, recall, f1

In [26]:
true_anime = ['Naruto']
recommended_anime = recommend_anime('Naruto',anime)

In [27]:
precision, recall, f1 = evaluate_recommendations(true_anime, recommended_anime)
print(f'Precision: {precision}, Recall: {recall}, F1-score: {f1}')

Precision: 0.043478260869565216, Recall: 1.0, F1-score: 0.08333333333333333


# Collaborative Filtering
    - Collaborative filtering is a method of making automatic predictions about a user's interests by collecting preferences from many users. The underlying principle is that if two users have a history of agreeing on certain items, they are likely to agree on other items as well. It can be broadly classified into two types: user-based and item-based collaborative filtering.
    
 - Collaborative filtering works by analyzing patterns in user-item interactions.
 - Collaborative filtering is widely used in recommendation systems like Netflix, Amazon, and Spotify, where it helps predict what users will like based on their past actions or the actions of others.

# User-Based Collaborative Filtering
    - User-Based collaborative filtering recommends items to a user by finding similar users who have similar preferences.

How it works:
 - It calculates the similarity between users based on their ratings or interactions with items. Common methods include cosine similarity or Pearson correlation.
 - Once similar users sre identified, the system recommends items that those similar users liked but that the target users has not yet interacted with.

Pros:
  - Can provide personalized recommendations by comapring users with similar taskes.
  - Works well whrn users have rated many items, allowing the system to make meaningful comaprisions between users.

Cons:
  - Finding similar users can be computationally expensive, especially in large datasets.
  - When a new user joins, the system has no prior data about them, so it struggles to find similar users.

# Item-Based Collaborative Filtering
    -  Item-based collaborative filtering recommends items based on the similarity between items rather than users.

How it Works:
- It calculates the similarity between items based on how users have rated them. For instance, if two items are rated similarly by many users, they are considered similar.
- The system recommends items that are similar to those that the user has already liked or rated highly.

Pros:
- Item-based filtering can be faster and easier to scale because the number of items is usually smaller than the number of users, and item similarities are more stable over time.
- Even if a user is new and hasn't rated many items, as long as they have rated or interacted with a few, the system can still recommend items based on those interactions.

Cons:
- If items are sparsely rated (i.e., not many users have rated them), it may be hard to find reliable similarity scores between items.