# ***16.RECOMMENDATION SYSTEM***

In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


In [4]:
anim=pd.read_csv("anime.csv")

#### Data Preprocessing

In [5]:
anim.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [6]:
anim.shape

(12294, 7)

In [7]:
anim.dtypes

Unnamed: 0,0
anime_id,int64
name,object
genre,object
type,object
episodes,object
rating,float64
members,int64


In [8]:
len(anim.anime_id.unique())

12294

In [9]:
len(anim.name.unique())

12292

In [10]:
len(anim.rating.unique())

599

In [11]:
len(anim.genre.unique())

3265

In [12]:
len(anim.type.unique())

7

In [13]:
anim.isna().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,230
members,0


In [14]:
anim.duplicated().sum()

np.int64(0)

In [15]:
#treating outliers in rating by mean
rating_me=anim.rating.mean()
anim['rating']=anim['rating'].fillna(rating_me)

In [16]:
#dropping null values
anim.dropna(inplace=True)

In [17]:
anim.isna().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,0
type,0
episodes,0
rating,0
members,0


In [18]:
anim.shape

(12210, 7)

#### Feature Extraction

In [19]:
anim.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [20]:
#we will do categorical encoding for genre and type
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
anim['genre']=le.fit_transform(anim['genre'])
anim['type']=le.fit_transform(anim['type'])

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
# Convert the 'genre' column to string type before applying .str accessor
anim['genre'] = anim['genre'].astype(str).str.replace(', ', ' ')
genre_matrix = tfidf_vectorizer.fit_transform(anim['genre'])

In [22]:
 #Normalize 'rating' and 'members' numerical features
# Scale numerical features to a 0-1 range to give them a comparable weight to genre features.
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

anim[['rating', 'members']] = scaler.fit_transform(
    anim[['rating', 'members']])


#### Recommendation System

1.Item Based

In [23]:
#cosine similarity
df1=anim.pivot_table(index='name',columns='genre',values='rating')
df1.fillna(0,axis=1,inplace=True)
df1

genre,0,1,10,100,1000,1001,1002,1003,1004,1005,...,990,991,992,993,994,995,996,997,998,999
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;0&quot;,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Memoire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Movie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC Rou,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xxxHOLiC Shunmuki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Üks Uks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ēlDLIVE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
cos=cosine_similarity(df1)
cos.shape

(12208, 12208)

In [25]:
def recommended_anime(similar_anime):
  if similar_anime in  df1.index:
    index=np.where(similar_anime==df1.index)[0][0]
    similar=sorted(list(enumerate(cos[index])),reverse=True,key=lambda x:x[1])
    print('Recommended anime of',similar_anime)
    print('*'*30)
    for i in similar:
      if i[1]>0:
        print(df1.index[i[0]])
  else:

    print('It has no similarity with other anime')
#here we develop recommendation system based on genre and anime


In [26]:
recommended_anime('Naruto')
# here it gives other anime movie similar to genre of naruto anime


Recommended anime of Naruto
******************************
Boruto: Naruto the Movie
Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi
Naruto
Naruto Shippuuden: Sunny Side Battle
Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!!
Naruto x UT
Naruto: Shippuuden
Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono
Naruto: Shippuuden Movie 4 - The Lost Tower


In [27]:
#cosine similarity
df2=anim.pivot_table(index='genre',columns='type',values='rating')
df2.fillna(0,axis=1,inplace=True)
df2

type,0,1,2,3,4,5
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.446512,0.0,0.533290,0.516425,0.510604,0.532293
1,0.464286,0.0,0.403361,0.494998,0.489196,0.682113
10,0.000000,0.0,0.000000,0.000000,0.000000,0.660264
100,0.000000,0.0,0.000000,0.000000,0.000000,0.607443
1000,0.599373,0.0,0.000000,0.569628,0.000000,0.697479
...,...,...,...,...,...,...
995,0.708283,0.0,0.000000,0.721489,0.000000,0.753902
996,0.000000,0.0,0.000000,0.000000,0.000000,0.576230
997,0.000000,0.0,0.000000,0.432173,0.000000,0.000000
998,0.631453,0.0,0.000000,0.000000,0.000000,0.000000


In [28]:
cos1=cosine_similarity(df2)
cos1.shape

(3260, 3260)

In [29]:
def recommended_genre(similar_genre):
  if similar_genre in  df2.index:
    index=np.where(similar_genre==df2.index)[0][0]
    similar1=sorted(list(enumerate(cos1[index])),reverse=True,key=lambda x:x[1])
    print('Recommended anime genre of',similar_genre)
    print('*'*30)
    # Limit to top 5 recommendations
    for i in similar1[1:6]: # Start from 1 to exclude the anime itself
      if i[1]>0:
        print(df2.index[i[0]])
  else:
    print('It has no similarity with other genre')

In [30]:
recommended_genre('Thriller')
#here we get other genre based on Type of broadcast similar to Thriller

It has no similarity with other genre


In [31]:
#cosine similarity
df3=anim.pivot_table(index='name',columns='type',values='rating')
df3.fillna(0,axis=1,inplace=True)
df3

type,0,1,2,3,4,5
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
&quot;0&quot;,0.000000,0.406963,0.000000,0.000000,0.0,0.000000
"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",0.399760,0.000000,0.000000,0.000000,0.0,0.000000
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,0.000000,0.000000,0.000000,0.647059,0.0,0.000000
&quot;Bungaku Shoujo&quot; Memoire,0.000000,0.000000,0.000000,0.704682,0.0,0.000000
&quot;Bungaku Shoujo&quot; Movie,0.715486,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...
xxxHOLiC Rou,0.000000,0.000000,0.000000,0.798319,0.0,0.000000
xxxHOLiC Shunmuki,0.000000,0.000000,0.000000,0.780312,0.0,0.000000
Üks Uks,0.540216,0.000000,0.000000,0.000000,0.0,0.000000
ēlDLIVE,0.000000,0.000000,0.000000,0.000000,0.0,0.576699


In [32]:
cos2=cosine_similarity(df3)
cos2.shape

(12208, 12208)

In [33]:
def recommended_anime1(similar_type):
  if similar_type in  df3.index:
    index=np.where(similar_type==df3.index)[0][0]
    similar2=sorted(list(enumerate(cos2[index])),reverse=True,key=lambda x:x[1])
    print('Recommended anime of',similar_type)
    print('*'*30)
    # Limit to top 10 recommendations
    for i in similar2[1:11]: # Start from 1 to exclude the anime itself
      if i[1]>0:
        print(df3.index[i[0]])
  else:
    print('It has no similarity with other anime')

In [34]:
#here we want to get similar anime which is broadcasted on similar type based on rating
recommended_anime1('Naruto')

Recommended anime of Naruto
******************************
.hack//Sign
.hack//Tasogare no Udewa Densetsu
0-sen Hayato
009-1
07-Ghost
11eyes
12-sai.: Chicchana Mune no Tokimeki
12-sai.: Chicchana Mune no Tokimeki 2nd Season
2020 Nyeon Ujuui Wonder Kiddy
21 Emon


Sparsre Matrix
Sparse matrices are matrices that contain a large number of zero values. They are commonly encountered in machine learning, especially when dealing with text data (like with TF-IDF) or recommender systems. Handling sparse matrices efficiently is crucial for memory and computational performance.

In [37]:
# Check if the genre_matrix generated by TfidfVectorizer is sparse
print(f"Type of genre_matrix: {type(genre_matrix)}")
print(f"Shape of genre_matrix: {genre_matrix.shape}")
print(f"Number of non-zero elements in genre_matrix: {genre_matrix.nnz}")

# Calculating the sparsity ratio
total_elements = genre_matrix.shape[0] * genre_matrix.shape[1]
sparsity_ratio = (1 - (genre_matrix.nnz / total_elements)) * 100
print(f"Sparsity ratio of genre_matrix: {sparsity_ratio:.2f}%")



Type of genre_matrix: <class 'scipy.sparse._csr.csr_matrix'>
Shape of genre_matrix: (12210, 3250)
Number of non-zero elements in genre_matrix: 12124
Sparsity ratio of genre_matrix: 99.97%


In [38]:
from sklearn.decomposition import TruncatedSVD

# Initialize TruncatedSVD with a chosen number of components
# The number of components should typically be less than the number of features (genres in this case)
# or the number of samples (anime).
n_components = 100
svd_model = TruncatedSVD(n_components=n_components, random_state=42)

# Fit SVD to the genre_matrix and transform it
genre_svd = svd_model.fit_transform(genre_matrix)

print(f"Original genre_matrix shape: {genre_matrix.shape}")
print(f"Reduced genre_svd shape: {genre_svd.shape}")

#(how much variance is explained by the selected components)
print(f"Explained variance ratio (sum): {svd_model.explained_variance_ratio_.sum():.2f}")

# We can now use 'genre_svd' for further analysis or similarity calculations
# For example, calculating cosine similarity on the SVD-reduced matrix:
from sklearn.metrics.pairwise import cosine_similarity
svd_cosine_sim = cosine_similarity(genre_svd)
print(f"Shape of SVD-reduced cosine similarity matrix: {svd_cosine_sim.shape}")

Original genre_matrix shape: (12210, 3250)
Reduced genre_svd shape: (12210, 100)
Explained variance ratio (sum): 0.45
Shape of SVD-reduced cosine similarity matrix: (12210, 12210)


In [39]:
anime_id_df = anim.pivot(index='anime_id', columns='name', values='rating')


In [40]:
#Impute those NaNs with 0 values
anime_id_df.fillna(0, inplace=True)

In [41]:
#Calculating Cosine Similarity between Users
from sklearn.metrics import pairwise_distances
#from scipy.spatial.distance import cosine, correlation

In [42]:
anim_id_sim = 1-pairwise_distances( anime_id_df.values,metric='cosine')

In [44]:
#Store the results in a dataframe
anim_sim_df = pd.DataFrame(anim_id_sim)
anim_sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12200,12201,12202,12203,12204,12205,12206,12207,12208,12209
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [45]:
#Set the index and column names to anime ids
anim_sim_df.index = anim.anime_id.unique()
anim_sim_df.columns = anim.anime_id.unique()

In [46]:
anim_sim_df.iloc[0:10, 0:10]

Unnamed: 0,32281,5114,28977,9253,9969,32935,11061,820,15335,15417
32281,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5114,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28977,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9253,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9969,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
32935,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
11061,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [47]:
#Most anim id
anim_sim_df.idxmax(axis=1).head(20)

Unnamed: 0,0
32281,32281
5114,5114
28977,28977
9253,9253
9969,9969
32935,32935
11061,11061
820,820
15335,15335
15417,15417


In [51]:
def recommend_movies(anime_id, anim_sim_df, anim, num_recommendations=5):
    # Ensure the anime_id exists in the similarity matrix index
    if anime_id not in anim_sim_df.index:
        print(f"Anime ID {anime_id} not found in the similarity matrix.")
        return []

    # Get the similarity scores for the given anime_id
    # Sort them in descending order and get the indices (anime_ids)
    # Exclude the anime itself (similarity score of 1.0 with itself)
    similar_animes_scores = anim_sim_df.loc[anime_id].sort_values(ascending=False)

    # Get the top N similar anime IDs, excluding the input anime_id itself
    # We take num_recommendations + 1 to account for the anime itself, then drop it
    top_similar_anime_ids = similar_animes_scores.index[1:num_recommendations+1]

    # Retrieve the names of these similar anime from the original 'anim' DataFrame
    recommended_anime_names = anim[anim['anime_id'].isin(top_similar_anime_ids)]['name'].tolist()

    return recommended_anime_names

In [52]:
user_id_to_recommend = int(input("enter anime id")) # Changed prompt to 'anime id'
recommendations = recommend_movies(user_id_to_recommend, anim_sim_df, anim, 4)
print(f"Recommended anime for anime ID {user_id_to_recommend}: {recommendations}")

enter anime id199
Recommended anime for anime ID 199: ['Apache Yakyuugun', 'Apo Apo World: Giant Baba 90-bun 1-hon Shoubu', 'Aqua Kids', 'Araiguma Rascal Specials']


INTERVIEW QUESTIONS:

1.
ANSWER:-
User-Based Collaborative Filtering (UBCF)

Core Concept: Recommends items by finding users with similar tastes.

How it works: The system identifies "neighbors" (users who have rated items similarly to you). If a neighbor liked a movie you haven't seen, the system recommends it to you.

Strengths: Often provides more diverse and serendipitous recommendations, as it can introduce users to entirely new categories based on what their "tastes-mates" enjoy.

Weaknesses: Harder to scale. As the number of users grows (e.g., millions of users on a platform), calculating real-time similarity between everyone becomes computationally expensive. User preferences also change more frequently than item characteristics, requiring constant recalculation.

Item-Based Collaborative Filtering (IBCF)

Core Concept: Recommends items similar to those you have already interacted
with.

How it works: The system identifies relationships between items based on how users collectively rate them. If 80% of people who bought a laptop also bought a mouse, the system deems these items "similar" and recommends the mouse to new laptop buyers.

Strengths: Highly scalable and stable. Because the number of items in a catalog usually grows slower than the number of users, similarity matrices can be precomputed offline. Item-item relationships are also more permanent; a classic book's "similarity" to other books rarely shifts over time.

Weaknesses: Can lead to "narrow" recommendations where users only see things very similar to what they have already bought, potentially limiting discovery.


2.
ANSWER:-
Collaborative filtering remains the primary method for making personalized recommendations by analyzing collective user behaviors rather than item descriptions . It operates on the principle that if two users agreed in the past, they will likely share similar tastes in the future.

User-based filtering recommends items by finding "taste-mates"—users with similar rating patterns—and suggesting what they liked. Item-based filtering instead identifies items that are frequently liked together by the same people . While user-based systems offer more variety, they struggle to scale as user populations grow. Item-based systems are more stable and efficient for large platforms because item relationships change less often than human preferences.

Modern systems often use matrix factorization to uncover hidden patterns in sparse data to predict what a user might enjoy. However, these methods face "cold start" challenges when new users or items have no historical data to analyze.