In [5]:
#loding dataset
import pandas as pd
file_path = r"D:\DS\Recommendation System\anime.csv"
anime_df = pd.read_csv(file_path)
anime_df.head()


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [6]:
#handling missing value
missing_values = anime_df.isnull().sum()
print(missing_values)

anime_df = anime_df.dropna()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [7]:
#getting the basic information of the data
anime_df.info()

#gettinf statistics summary
anime_df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 12017 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12017 non-null  int64  
 1   name      12017 non-null  object 
 2   genre     12017 non-null  object 
 3   type      12017 non-null  object 
 4   episodes  12017 non-null  object 
 5   rating    12017 non-null  float64
 6   members   12017 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 751.1+ KB


Unnamed: 0,anime_id,rating,members
count,12017.0,12017.0,12017.0
mean,13638.001165,6.478264,18348.88
std,11231.076675,1.023857,55372.5
min,1.0,1.67,12.0
25%,3391.0,5.89,225.0
50%,9959.0,6.57,1552.0
75%,23729.0,7.18,9588.0
max,34519.0,10.0,1013917.0


In [8]:
#Feature Extraction 
#Selecting the feature to use for computing similarilty 
#converting categorical features to nymerical
#converting genre into one-hot encoded features
genres_one_hot = anime_df['genre'].str.get_dummies(sep=',')
anime_df = pd.concat([anime_df, genres_one_hot], axis=1)

In [9]:
#recommendation system
#compute cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

features = genres_one_hot.columns.tolist() + ['rating']
cosine_sim = cosine_similarity(anime_df[features])

In [18]:
def recommend_anime(name, cosine_sim = cosine_sim, df=anime_df, num_recommendations=10):
    #getting the index of the targeted anime 
    idx= df[df['name'] == name].index[0]
    #getting similarity scores for all anime
    sim_scores= list(enumerate(cosine_sim[idx]))
    #sorting anime based on similarity scores
    sim_scores= sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #getting the indices of the top recommendations
    sim_scores = sim_scores[1:num_recommendations+1]
    #getting the titles of the recommended anime
    anime_indices = [i[0] for i in sim_scores]
    return df['name'].iloc[anime_indices]

#example anime similar to Naruto
recommendations = recommend_anime('Naruto')
print(recommendations)

615                                    Naruto: Shippuuden
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
486                              Boruto: Naruto the Movie
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
1573    Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...
2458                 Naruto Shippuuden: Sunny Side Battle
2997    Naruto Soyokazeden Movie: Naruto to Mashin to ...
175                                Katekyo Hitman Reborn!
206                                         Dragon Ball Z
Name: name, dtype: object


In [20]:
from sklearn.model_selection import train_test_split

# Split the dataset
train_df, test_df = train_test_split(anime_df, test_size=0.2, random_state=42)


In [27]:
def evaluate_recommendations(target_name, true_names, k=10):
    recommendations = recommend_anime(target_name, num_recommendations=k)
    true_set = set(true_names)
    recommended_set = set(recommendations)
    
    # Calculate precision, recall, and F1-score
    precision = len(true_set & recommended_set) / len(recommended_set) if len(recommended_set) > 0 else 0
    recall = len(true_set & recommended_set) / len(true_set) if len(true_set) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1

# Example: Evaluate recommendations for 'Naruto'
precision, recall, f1 = evaluate_recommendations('Naruto', ['Bleach', 'One Piece', 'Fairy Tail'])
print(f"Precision: {precision}, Recall: {recall}, F1-score: {f1}")

Precision: 0.0, Recall: 0.0, F1-score: 0


**Analysis of Recommendation System Performance**

1. Current Performance : Good at identifying related titles but initial evaluation was flawed showing zero precision and recall
2. Refined Evaluation : Use a relevant true set to improve precision and recall metics
3. Improvements:
   a. Diversify recommendations: include varied geners and themes
   b. Feature engineering : Add user rating, reviewa and metadata
   c. Hybrid Approach : combine with content-based filtering
   d.Cold Start Solution : use demographic data ot intital surveys
   e. Scalability : optimize for handling large datasets


**Interview Questions:**
1. Can you explain the difference between user-based and item-based collaborative filtering?
2. What is collaborative filtering, and how does it work?

**Difference Between User-Based and Item-Based Collaborative filtering**

***User-Based:***
1. Concepts: recommends items based on similar users preferences
2. Example : if user A and B have similar tastes, B will get recommendations frim A's Liked items
3. Pros: Personalized can discover new items
4. Cons: Scalability issues cold start problem for new users

***Item-Based***
1. Concepts: Recommends items similar to what a user has liked
2. Example: if a user liked"The Matrix", they'll get recommendations for similar movies
3. pros: More scalable effective for large datasets
4. Cons: May miss novel recommendations needs substantial data

***Collaborative filtering***
1. concept : recommends items based on user prefernces by collecting data from multiple users
2. How it works:
   a. Data Collection : gather user interactions with items
   b. Similarity calculation : find similaritirs between users or items
   c.Prediction : recommend item liked by similar users or similar items
   