In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [5]:
# Load the dataset
df = pd.read_csv("C:\\Data science\\Assignments\\anime.csv")

In [7]:
print(df.head())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [9]:
print(df.isnull().sum())

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [34]:
# Handle missing values

# Fill missing values in numerical columns with 0
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].fillna(0)

# Fill missing values in categorical columns with an empty string
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna('')

In [36]:
# Feature Extraction

# Combine all relevant text features into a single feature
df['features'] = df['genre'] + ' ' + df['type'] + ' ' + df['name']

In [38]:
# Convert the combined text features into numerical representations using TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['features'])

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [20]:
# Recommendation System

def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the anime that matches the title
    idx = df.index[df['name'] == title].tolist()[0]

    # Get the pairwise similarity scores of all anime with that anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the anime based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar anime
    sim_scores = sim_scores[1:11]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar anime
    return df['name'].iloc[anime_indices]

# Example: Recommend anime similar to a given anime title
recommended_anime = get_recommendations('Naruto')
print("\nAnime similar to 'Naruto':")
print(recommended_anime)




Anime similar to 'Naruto':
719                            The Last: Naruto the Movie
615                                    Naruto: Shippuuden
1343                                          Naruto x UT
486                              Boruto: Naruto the Movie
2458                 Naruto Shippuuden: Sunny Side Battle
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
784            Naruto: Shippuuden Movie 6 - Road to Ninja
1930                                    Dragon Ball Super
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
2416    Naruto: Honoo no Chuunin Shiken! Naruto vs. Ko...
Name: name, dtype: object


In [30]:
# Evaluation

# Split the dataset into training and testing sets
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Generate recommendations for the test set
test['predicted_recommendations'] = test['name'].apply(lambda x: ', '.join(get_recommendations(x)))

print("\nEvaluation Placeholder:")


Evaluation Placeholder:


In [31]:
# Evaluation metrics (assuming we have true positive, false positive, etc.)
# This is a placeholder for actual evaluation, which depends on the available data
precision = precision_score(test['name'], test['predicted_recommendations'], average='macro')
recall = recall_score(test['name'], test['predicted_recommendations'], average='macro')
f1 = f1_score(test['name'], test['predicted_recommendations'], average='macro')

print("\nEvaluation Metrics:")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Evaluation Metrics:
Precision: 0.0
Recall: 0.0
F1-Score: 0.0


In [None]:
### Interview Questions and Answers

#### 1. Can you explain the difference between user-based and item-based collaborative filtering?

**Answer:**

- **User-Based Collaborative Filtering**:
  - User-based collaborative filtering finds similar users based on their past behaviors (such as ratings or interactions with items). The idea is that if two users have similar preferences, they will likely like the same items. For a target user, recommendations are generated based on the preferences of users who are similar to them.
  - Example: If User A and User B both like the same movies, and User A likes a new movie, then User B is likely to enjoy that movie as well.

- **Item-Based Collaborative Filtering**:
  - Item-based collaborative filtering, on the other hand, focuses on finding similarities between items. The recommendation is generated based on the similarity of items. If a user likes an item, similar items are recommended to them.
  - Example: If a user likes a specific book, the system recommends books that are similar to it based on past interactions.

#### 2. What is collaborative filtering, and how does it work?

**Answer:**

- **Collaborative Filtering**:
  - Collaborative filtering is a method used by recommendation systems to predict what items a user might like based on the preferences of many users. It leverages the collective behavior of users (such as ratings, likes, or clicks) to make recommendations.

- **How It Works**:
  - Collaborative filtering works by identifying patterns and relationships in user behavior. There are two main types:
    1. **User-Based Collaborative Filtering**: 
       - Finds similarities between users. For a given user, the system recommends items that similar users have liked.
       - Steps:
         1. Calculate the similarity between users.
         2. Select a set of similar users.
         3. Recommend items that these users have liked.
       
    2. **Item-Based Collaborative Filtering**: 
       - Finds similarities between items. For a given item, the system recommends other items that are similar.
       - Steps:
         1. Calculate the similarity between items.
         2. For a given user, recommend items similar to those the user has liked.
         
- **Advantages**:
  - Can provide recommendations without needing detailed information about items (content), only user behavior is required.
  
- **Challenges**:
  - **Cold Start Problem**: Difficulty in recommending items to new users or for new items that have no ratings.
  - **Sparsity**: When user-item interactions are sparse, finding reliable similar users or items becomes challenging.

