In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

# 1.Data Preprocessing:


In [2]:
df = pd.read_csv('anime.csv')
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [3]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [5]:
# Handle Missing Values
df['type'] = df['type'].fillna('type')
df.dropna(subset=['anime_id', 'type','rating',], inplace=True) 
df['type'] 

0        Movie
1           TV
2           TV
3           TV
4           TV
         ...  
12289      OVA
12290      OVA
12291      OVA
12292      OVA
12293    Movie
Name: type, Length: 12064, dtype: object

In [6]:
df['genre'] = df['genre'].fillna('unkown')
df['genre']

0                     Drama, Romance, School, Supernatural
1        Action, Adventure, Drama, Fantasy, Magic, Mili...
2        Action, Comedy, Historical, Parody, Samurai, S...
3                                         Sci-Fi, Thriller
4        Action, Comedy, Historical, Parody, Samurai, S...
                               ...                        
12289                                               Hentai
12290                                               Hentai
12291                                               Hentai
12292                                               Hentai
12293                                               Hentai
Name: genre, Length: 12064, dtype: object

In [7]:
# Summary statistics and info
df.describe()

Unnamed: 0,anime_id,rating,members
count,12064.0,12064.0,12064.0
mean,13704.476044,6.473902,18279.52
std,11260.369521,1.026746,55275.78
min,1.0,1.67,12.0
25%,3409.25,5.88,221.0
50%,10004.0,6.57,1539.0
75%,23863.5,7.18,9485.5
max,34519.0,10.0,1013917.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12064 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12064 non-null  int64  
 1   name      12064 non-null  object 
 2   genre     12064 non-null  object 
 3   type      12064 non-null  object 
 4   episodes  12064 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12064 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 754.0+ KB


In [9]:
# Preprocess Genres
df['genre'] = df['genre'].str.split(', ')
df['genre']

0                   [Drama, Romance, School, Supernatural]
1        [Action, Adventure, Drama, Fantasy, Magic, Mil...
2        [Action, Comedy, Historical, Parody, Samurai, ...
3                                       [Sci-Fi, Thriller]
4        [Action, Comedy, Historical, Parody, Samurai, ...
                               ...                        
12289                                             [Hentai]
12290                                             [Hentai]
12291                                             [Hentai]
12292                                             [Hentai]
12293                                             [Hentai]
Name: genre, Length: 12064, dtype: object

In [10]:
mlb = MultiLabelBinarizer()
mlb

In [11]:
genres_encoded = mlb.fit_transform(df['genre'])
genres_encoded

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
# One-Hot Encode Broadcast Type
broadcast_encoded = pd.get_dummies(df['anime_id'], prefix='broadcast')
broadcast_encoded

Unnamed: 0,broadcast_1,broadcast_5,broadcast_6,broadcast_7,broadcast_8,broadcast_15,broadcast_16,broadcast_17,broadcast_18,broadcast_19,...,broadcast_34412,broadcast_34447,broadcast_34453,broadcast_34464,broadcast_34475,broadcast_34476,broadcast_34490,broadcast_34503,broadcast_34514,broadcast_34519
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12290,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12291,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12292,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
# Scale Numerical Features
scaler = MinMaxScaler()
scaler

In [14]:
df['rating_scaled'] = scaler.fit_transform(df[['rating']])
df['rating_scaled'] 

0        0.924370
1        0.911164
2        0.909964
3        0.900360
4        0.899160
           ...   
12289    0.297719
12290    0.313325
12291    0.385354
12292    0.397359
12293    0.454982
Name: rating_scaled, Length: 12064, dtype: float64

In [15]:
df['community_members'] = scaler.fit_transform(df[['members']])
df['community_members'] 

0        0.197867
1        0.782769
2        0.112683
3        0.664323
4        0.149180
           ...   
12289    0.000196
12290    0.000169
12291    0.000204
12292    0.000161
12293    0.000128
Name: community_members, Length: 12064, dtype: float64

In [16]:
# Combine Features
features = np.hstack((
    genres_encoded,
    broadcast_encoded.values,
    df[['rating_scaled', 'community_members']].values
))
features

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 9.24369748e-01, 1.97866664e-01],
       [1.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 9.11164466e-01, 7.82768603e-01],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 9.09963986e-01, 1.12683141e-01],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 3.85354142e-01, 2.04161139e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 3.97358944e-01, 1.60764569e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 4.54981993e-01, 1.28217141e-04]])

In [17]:
# Compute Cosine Similarity
cosine_sim = cosine_similarity(features)
cosine_sim

array([[1.        , 0.26770919, 0.11961814, ..., 0.10011446, 0.10300553,
        0.1166201 ],
       [0.26770919, 1.        , 0.31929191, ..., 0.07798897, 0.08023463,
        0.0908323 ],
       [0.11961814, 0.31929191, 1.        , ..., 0.08046391, 0.08278845,
        0.0937319 ],
       ...,
       [0.10011446, 0.07798897, 0.08046391, ..., 1.        , 0.53554191,
        0.53974677],
       [0.10300553, 0.08023463, 0.08278845, ..., 0.53554191, 1.        ,
        0.54107319],
       [0.1166201 , 0.0908323 , 0.0937319 , ..., 0.53974677, 0.54107319,
        1.        ]])

In [18]:
# Recommendation Funct
def recommend_anime(type, top_n=5):
    idx = df[df['type'] == type].index[0]
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i[0] for i in sim_scores[1:top_n+1]]
    return df.iloc[top_indices][['type', 'rating', 'genre', 'anime_id']]


In [19]:
 recommend_anime

<function __main__.recommend_anime(type, top_n=5)>

In [20]:
# Handle missing values
missing=df.dropna(subset=["name", "genre", "rating"], inplace=True)
missing

In [21]:
# Extract necessary featres
df['genre'] = df['genre'].fillna('Unknown')  # Fill missing genres
df['genre']

0                   [Drama, Romance, School, Supernatural]
1        [Action, Adventure, Drama, Fantasy, Magic, Mil...
2        [Action, Comedy, Historical, Parody, Samurai, ...
3                                       [Sci-Fi, Thriller]
4        [Action, Comedy, Historical, Parody, Samurai, ...
                               ...                        
12289                                             [Hentai]
12290                                             [Hentai]
12291                                             [Hentai]
12292                                             [Hentai]
12293                                             [Hentai]
Name: genre, Length: 12064, dtype: object

In [22]:
df['rating'] = df['rating'].fillna(df['rating'].mean())  # Fill missing ratings with mean
df['rating']

0        9.37
1        9.26
2        9.25
3        9.17
4        9.16
         ... 
12289    4.15
12290    4.28
12291    4.88
12292    4.98
12293    5.46
Name: rating, Length: 12064, dtype: float64

#  2. Feature Extraction

In [23]:
# creat a data frame
df=pd.DataFrame(df)
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,rating_scaled,community_members
0,32281,Kimi no Na wa.,"[Drama, Romance, School, Supernatural]",Movie,1,9.37,200630,0.924370,0.197867
1,5114,Fullmetal Alchemist: Brotherhood,"[Action, Adventure, Drama, Fantasy, Magic, Mil...",TV,64,9.26,793665,0.911164,0.782769
2,28977,Gintama°,"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.25,114262,0.909964,0.112683
3,9253,Steins;Gate,"[Sci-Fi, Thriller]",TV,24,9.17,673572,0.900360,0.664323
4,9969,Gintama&#039;,"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.16,151266,0.899160,0.149180
...,...,...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,[Hentai],OVA,1,4.15,211,0.297719,0.000196
12290,5543,Under World,[Hentai],OVA,1,4.28,183,0.313325,0.000169
12291,5621,Violence Gekiga David no Hoshi,[Hentai],OVA,4,4.88,219,0.385354,0.000204
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,[Hentai],OVA,1,4.98,175,0.397359,0.000161


In [24]:
# One-hot encoding for the 'genre' column
df_genres = df['genre'].str.get_dummies(sep=',')
df = pd.concat([df, df_genres], axis=1)

In [25]:
df_genres

Unnamed: 0,'Adventure','Adventure'],'Cars','Cars'],'Comedy','Comedy'],'Dementia','Dementia'],'Demons','Demons'],...,['Slice of Life',['Slice of Life'],['Space'],['Sports'],['Super Power',['Supernatural'],['Thriller'],['Vampire'],['Yaoi'],['unkown']
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,rating_scaled,community_members,'Adventure',...,['Slice of Life',['Slice of Life'],['Space'],['Sports'],['Super Power',['Supernatural'],['Thriller'],['Vampire'],['Yaoi'],['unkown']
0,32281,Kimi no Na wa.,"[Drama, Romance, School, Supernatural]",Movie,1,9.37,200630,0.924370,0.197867,0,...,0,0,0,0,0,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,"[Action, Adventure, Drama, Fantasy, Magic, Mil...",TV,64,9.26,793665,0.911164,0.782769,1,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.25,114262,0.909964,0.112683,0,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,"[Sci-Fi, Thriller]",TV,24,9.17,673572,0.900360,0.664323,0,...,0,0,0,0,0,0,0,0,0,0
4,9969,Gintama&#039;,"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.16,151266,0.899160,0.149180,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,[Hentai],OVA,1,4.15,211,0.297719,0.000196,0,...,0,0,0,0,0,0,0,0,0,0
12290,5543,Under World,[Hentai],OVA,1,4.28,183,0.313325,0.000169,0,...,0,0,0,0,0,0,0,0,0,0
12291,5621,Violence Gekiga David no Hoshi,[Hentai],OVA,4,4.88,219,0.385354,0.000204,0,...,0,0,0,0,0,0,0,0,0,0
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,[Hentai],OVA,1,4.98,175,0.397359,0.000161,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
scaler = MinMaxScaler()
scaler

In [28]:
# Normalize rating and number of episodes
scaler.fit_transform(df[['rating','members']])

array([[9.24369748e-01, 1.97866664e-01],
       [9.11164466e-01, 7.82768603e-01],
       [9.09963986e-01, 1.12683141e-01],
       ...,
       [3.85354142e-01, 2.04161139e-04],
       [3.97358944e-01, 1.60764569e-04],
       [4.54981993e-01, 1.28217141e-04]])

In [29]:
# Encode genres using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf

In [30]:
tfidf_matrix = tfidf.fit_transform(df['name'])
tfidf_matrix

<12064x11857 sparse matrix of type '<class 'numpy.float64'>'
	with 42169 stored elements in Compressed Sparse Row format>

In [31]:
 #Combine features (genres + ratings + episodes)
feature_matrix = pd.concat([pd.DataFrame(tfidf_matrix.toarray()), df[['rating', 'episodes']].reset_index(drop=True)], axis=1)
feature_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11849,11850,11851,11852,11853,11854,11855,11856,rating,episodes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.37,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.26,64
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.25,51
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.17,24
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.16,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.15,1
12060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.28,1
12061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.88,4
12062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.98,1


# 3. Recommendation System

##### 1.Compute Cosine Similarity:

In [32]:
scaler = StandardScaler()
scaler

In [33]:
# Prepare the features to be used for similarity (genres, rating, and episodes)
features = df[['rating', 'episodes']].join(df[df_genres.columns])
features

Unnamed: 0,rating,episodes,'Adventure','Adventure'],'Cars','Cars'],'Comedy','Comedy'],'Dementia','Dementia'],...,['Slice of Life',['Slice of Life'],['Space'],['Sports'],['Super Power',['Supernatural'],['Thriller'],['Vampire'],['Yaoi'],['unkown']
0,9.37,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9.26,64,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9.25,51,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9.17,24,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9.16,51,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,4.15,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,4.28,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,4.88,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,4.98,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
cosine_sim

array([[1.        , 0.26770919, 0.11961814, ..., 0.10011446, 0.10300553,
        0.1166201 ],
       [0.26770919, 1.        , 0.31929191, ..., 0.07798897, 0.08023463,
        0.0908323 ],
       [0.11961814, 0.31929191, 1.        , ..., 0.08046391, 0.08278845,
        0.0937319 ],
       ...,
       [0.10011446, 0.07798897, 0.08046391, ..., 1.        , 0.53554191,
        0.53974677],
       [0.10300553, 0.08023463, 0.08278845, ..., 0.53554191, 1.        ,
        0.54107319],
       [0.1166201 , 0.0908323 , 0.0937319 , ..., 0.53974677, 0.54107319,
        1.        ]])

##### 2.Recommend anime based on cosine similarity

In [35]:
def recommend_anime(target_anime_id, cosine_sim, top_n=5):
    # Get the index of the target anime
    target_idx = df[df['anime_id'] == target_anime_id].index[0]
    
    # Get the cosine similarity scores for the target anime
    sim_scores = list(enumerate(cosine_sim[target_idx]))
    
    # Sort the anime by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices of the top N most similar anime
    top_similar_anime = sim_scores[1:top_n+1]  # Excluding the target anime itself (index 0)
    
    # Get the anime titles for the top N similar anime
    recommended_anime = [df.iloc[i[0]]['title'] for i in top_similar_anime]
    
    return recommended_anime

In [36]:
recommend_anime

<function __main__.recommend_anime(target_anime_id, cosine_sim, top_n=5)>

In [37]:
top_n=5
top_n

5

# 4. Evaluation

#### 1.Train-test split:

In [38]:
# Splitting data into train and test (simplified, actual split depends on availability of user ratings)
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, test

(       anime_id                                               name  \
 1374      31553                        Charlotte: Tsuyoi Monotachi   
 3118       1925       Urusei Yatsura Movie 6: Itsudatte My Darling   
 11559     10392                                           Pet Life   
 3780       8754                Tales of the Abyss Special Fan Disc   
 11152      5097  Hatsu Inu 2 The Animation: Strange Kind of Wom...   
 ...         ...                                                ...   
 12184      3566              Hika Ryoujoku: Wana ni Hamatta Futari   
 5191       5272                        Tondemo Nezumi Daikatsuyaku   
 5390       1262                           Macross II: Lovers Again   
 860       22819                                     Aikatsu! Movie   
 7270       2364                          Virus: Virus Buster Serge   
 
                                                    genre     type episodes  \
 1374                               [School, Super Power]  Special 

#### 2.Evaluation metrics:

In [39]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Generate recommendations for the test set (for example, by comparing recommended with actual ratings)
# (Note: This is a simplified example, real-world evaluation is more complex)
y_true = df['anime_id']
y_pred = [recommend_anime(anime_id, type) for anime_id in y_true]  # Predicted anime IDs


In [40]:
y_true

0        32281
1         5114
2        28977
3         9253
4         9969
         ...  
12289     9316
12290     5543
12291     5621
12292     6133
12293    26081
Name: anime_id, Length: 12064, dtype: int64

In [41]:
# Calculate precision, recall, F1 score
precision =(y_true)
precision

0        32281
1         5114
2        28977
3         9253
4         9969
         ...  
12289     9316
12290     5543
12291     5621
12292     6133
12293    26081
Name: anime_id, Length: 12064, dtype: int64

In [42]:
recall = (y_true,)
recall

(0        32281
 1         5114
 2        28977
 3         9253
 4         9969
          ...  
 12289     9316
 12290     5543
 12291     5621
 12292     6133
 12293    26081
 Name: anime_id, Length: 12064, dtype: int64,)

In [43]:
f1 = (y_true)
f1

0        32281
1         5114
2        28977
3         9253
4         9969
         ...  
12289     9316
12290     5543
12291     5621
12292     6133
12293    26081
Name: anime_id, Length: 12064, dtype: int64

### Adjusting Similarity Thresholds

In [44]:
def recommend_with_threshold(target_anime_id, cosine_sim, threshold=0.8):
    # Get the index of the target anime
    target_idx = df[df['anime_id'] == target_anime_id].index[0]
    
    # Get the cosine similarity scores for the target anime
    sim_scores = list(enumerate(cosine_sim[target_idx]))
    
    # Filter anime by the threshold similarity score
    filtered_sim_scores = [x for x in sim_scores if x[1] >= threshold]
    
    # Get the indices of the filtered anime
    recommended_anime = [df.iloc[i[0]]['title'] for i in filtered_sim_scores]
    
    return recommended_anime

In [45]:
recommend_with_threshold

<function __main__.recommend_with_threshold(target_anime_id, cosine_sim, threshold=0.8)>

In [46]:
cosine_sim

array([[1.        , 0.26770919, 0.11961814, ..., 0.10011446, 0.10300553,
        0.1166201 ],
       [0.26770919, 1.        , 0.31929191, ..., 0.07798897, 0.08023463,
        0.0908323 ],
       [0.11961814, 0.31929191, 1.        , ..., 0.08046391, 0.08278845,
        0.0937319 ],
       ...,
       [0.10011446, 0.07798897, 0.08046391, ..., 1.        , 0.53554191,
        0.53974677],
       [0.10300553, 0.08023463, 0.08278845, ..., 0.53554191, 1.        ,
        0.54107319],
       [0.1166201 , 0.0908323 , 0.0937319 , ..., 0.53974677, 0.54107319,
        1.        ]])

In [47]:
threshold=0.85
threshold

0.85

In [48]:
def recommend_with_threshold(target_anime_id, cosine_sim, threshold=0.8):
    
    # Get the index of the target anime
    target_idx = df[df['anime_id'] == target_anime_id].index[0]
    sim_scores = list(enumerate(cosine_sim[target_idx]))
    filtered_sim_scores = [x for x in sim_scores if x[1] >= threshold]
    recommended_anime = [df.iloc[i[0]]['title'] for i in filtered_sim_scores]
    return recommended_anime

In [49]:
recommend_with_threshold

<function __main__.recommend_with_threshold(target_anime_id, cosine_sim, threshold=0.8)>

In [50]:
cosine_sim

array([[1.        , 0.26770919, 0.11961814, ..., 0.10011446, 0.10300553,
        0.1166201 ],
       [0.26770919, 1.        , 0.31929191, ..., 0.07798897, 0.08023463,
        0.0908323 ],
       [0.11961814, 0.31929191, 1.        , ..., 0.08046391, 0.08278845,
        0.0937319 ],
       ...,
       [0.10011446, 0.07798897, 0.08046391, ..., 1.        , 0.53554191,
        0.53974677],
       [0.10300553, 0.08023463, 0.08278845, ..., 0.53554191, 1.        ,
        0.54107319],
       [0.1166201 , 0.0908323 , 0.0937319 , ..., 0.53974677, 0.54107319,
        1.        ]])

## Interview Questions:

### 1. Can you explain the difference between user-based and item-based collaborative filtering?

In [51]:
### User-Based Collaborative Filtering

#### Focus: 
##### Compares users to identify similar preferences.
#### Key Idea:
##### Users who have shown similar behavior (e.g., rated items similarly or purchased similar products) are likely to have similar preferences in the future.
#### Approach:
##### 1.Calculate similarity between users based on their interactions (e.g., using cosine similarity, Pearson correlation, etc.).
##### 2.Recommend items that similar users have interacted with but the current user has not.
#### Example: 
##### If User A and User B both like movies X and Y, and User B likes movie Z, the system may recommend movie Z to User A.
#### Strengths:
##### 1.Effective in domains with many overlapping user preferences
#####  2.Leverages group behavior to predict individual preferences.
#### Weaknesses:
#####  1.Struggles with sparse data (when users have interacted with only a few items)
#####  2.Computationally expensive for large datasets as it involves comparing all users.


In [52]:
## Item-Based Collaborative Filtering

#### Focus:
##### Compares items to identify similar characteristics.
#### Key Idea: 
##### Items that are rated or interacted with similarly by users are likely to be similar in nature.
#### Approach:
#####   1.Calculate similarity between items based on user interactions (e.g., ratings or purchase history). 
#####   2.Recommend items similar to those the user has already interacted with
#### Example: 
##### If many users who bought product A also bought product B, and a new user buys product A, the system may recommend product B.
#### Strengths:
#####   1.Works well in domains where user behavior is sparse, as it focuses on items rather than users.
#####   2.More stable over time since item relationships are less dynamic than user relationships.
#####   3.Efficient for large datasets due to the relatively smaller number of items compared to users.
#### Weaknesses:
#####   1.Struggles with cold start for new items, as there’s no historical data to determine similarity.
#####   2.May not capture unique, individual preferences as effectively as user-based approaches.
                    

## 2.What is collaborative filtering, and how does it work?

##### Collaborative filtering is a technique used in recommendation systems that suggests items to users based on their past interactions and the behavior of other users. It operates under the assumption that users who have had similar preferences in the past will continue to do so in the future.

### Types of Collaborative Filtering
#### `1.Memory-Based Collaborative Filtering
#####  Directly uses the user-item interaction matrix.
#####  Divided into two subtypes:
######   User-Based: Finds similar users and recommends items they liked.
######   Item-Based: Finds similar items and recommends those similar to items the user interacted with.
#### How It Works:
##### Measure similarity (e.g., using cosine similarity, Pearson correlation).
##### Make predictions based on neighbors (similar users or items).


#### 2. Model-Based Collaborative Filtering
##### Uses machine learning techniques to model user-item interactions.
##### Builds a predictive model from historical data, such as using
######    Neural Networks (e.g., Autoencoders)
######    Clustering or classification models
#### How It Works:
##### Learns latent factors (hidden relationships) from the user-item matrix.
##### Predicts ratings or interactions for new user-item pairs.


### Steps in Collaborative Filtering
#### 1.Data Collection:
######  Gather interaction data (e.g., ratings, purchases, views) to form a user-item matrix.
#### 2.Similarity Calculation:
######  Compute similarity between users (user-based) or items (item-based) using metrics like:
#### 3.Prediction:
######   Predict missing interactions in the user-item matrix by leveraging the interactions of similar users or items.
#### 4.Recommendation:
######   Recommend top-ranked items with the highest predicted ratings or interaction likelihood.


#### Advantages
##### Domain Independence: Does not require knowledge of item characteristics.
##### Adaptability: Learns from user behavior dynamically.
##### Diversity: Can uncover unexpected recommendations through collaborative patterns.