In [9]:
import pandas as pd
import numpy as np

# Correct file path with filename
file_path = "D:\\Assignments questions\\Recommendation System\\anime.csv"
anime_df = pd.read_csv(file_path)

anime_df.isnull().sum()



anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [10]:
# 1. Handle missing values
anime_df['genre'] = anime_df['genre'].fillna("Unknown")
anime_df['type'] = anime_df['type'].fillna("Unknown")
anime_df['rating'] = anime_df['rating'].fillna(anime_df['rating'].mean())

# 2. Convert episodes column to numeric
anime_df['episodes'] = anime_df['episodes'].replace('Unknown', np.nan)
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'], errors='coerce')
anime_df['episodes'] = anime_df['episodes'].fillna(anime_df['episodes'].median())

# 3. Verify cleaning
print("Missing values after cleaning:")
print(anime_df.isnull().sum())

print("\nEpisodes column type:", anime_df['episodes'].dtype)
print("\nCleaned dataset preview:")
print(anime_df.head())

Missing values after cleaning:
anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

Episodes column type: float64

Cleaned dataset preview:
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type  episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie       1.0    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV      64.0    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV      51.0    9.25   
3                                   Sci-Fi, Thriller     TV      24.0    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV      51.0    9.16   

   members  
0   20

In [11]:
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

# ---- 1. Handle genres ----
# Split genres into list
anime_df['genre'] = anime_df['genre'].apply(lambda x: x.split(", ") if isinstance(x, str) else [])

mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(mlb.fit_transform(anime_df['genre']), columns=mlb.classes_, index=anime_df.index)

# ---- 2. Handle type (optional) ----
type_encoded = pd.get_dummies(anime_df['type'], prefix="type")

# ---- 3. Normalize numerical features ----
scaler = MinMaxScaler()
numeric_scaled = scaler.fit_transform(anime_df[['rating','members']])
numeric_df = pd.DataFrame(numeric_scaled, columns=['rating','members'], index=anime_df.index)

# ---- 4. Combine all features ----
features_df = pd.concat([genre_encoded, type_encoded, numeric_df], axis=1)

print("Final feature matrix shape:", features_df.shape)
print(features_df.head())


Final feature matrix shape: (12294, 53)
   Action  Adventure  Cars  Comedy  Dementia  Demons  Drama  Ecchi  Fantasy  \
0       0          0     0       0         0       0      1      0        0   
1       1          1     0       0         0       0      1      0        1   
2       1          0     0       1         0       0      0      0        0   
3       0          0     0       0         0       0      0      0        0   
4       1          0     0       1         0       0      0      0        0   

   Game  ...  Yuri  type_Movie  type_Music  type_ONA  type_OVA  type_Special  \
0     0  ...     0        True       False     False     False         False   
1     0  ...     0       False       False     False     False         False   
2     0  ...     0       False       False     False     False         False   
3     0  ...     0       False       False     False     False         False   
4     0  ...     0       False       False     False     False         False   

   t

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(features_df)

def recommend_anime(title, n=5):
    # Check if anime exists
    if title not in anime_df['name'].values:
        return f"Anime '{title}' not found."

    # Find index of given anime
    idx = anime_df[anime_df['name'] == title].index[0]

    # Get similarity scores (with index)
    scores = list(enumerate(cosine_sim[idx]))

    # Sort by similarity (skip itself at index 0)
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:n+1]

    # Collect recommendations
    result = [(anime_df.iloc[i]['name'], round(score, 3)) for i, score in scores]

    return result

# Example usage
print(recommend_anime("Naruto", n=5))


[('Naruto: Shippuuden', np.float64(0.998)), ('Katekyo Hitman Reborn!', np.float64(0.912)), ('Dragon Ball Z', np.float64(0.873)), ('Bleach', np.float64(0.856)), ('Dragon Ball Kai', np.float64(0.856))]


In [17]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(anime_df, test_size=0.2, random_state=42)
print("Training set size:", len(train_df))
print("Testing set size:", len(test_df))


Training set size: 9835
Testing set size: 2459


In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_recommender(test_df, n=5, sample_size=50):
    y_true, y_pred = [], []

    # Sample some test anime (not too many for speed)
    for title in test_df['name'].sample(sample_size, random_state=42):
        recs = recommend_anime(title, n=n)   # our recommender
        rec_titles = [r[0] for r in recs]

        # True relevant = anime sharing at least 1 genre
        target_genres = set(anime_df.loc[anime_df['name'] == title, 'genre'].values[0])
        true_relevant = anime_df[anime_df['genre'].apply(lambda g: len(set(g) & target_genres) > 0)]['name'].values

        # Build binary relevance vectors
        y_true_row = [1 if anime in true_relevant else 0 for anime in rec_titles]
        y_pred_row = [1] * len(rec_titles)   # all recs are predicted relevant

        y_true.extend(y_true_row)
        y_pred.extend(y_pred_row)

    # Compute metrics
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    return precision, recall, f1

precision, recall, f1 = evaluate_recommender(test_df, n=5)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}")


Precision: 1.00, Recall: 1.00, F1-score: 1.00


In [20]:
'''1.User-based CF → Finds users who are similar to you, then recommends items they liked.
Example: If User A and User B have similar tastes, recommend to A what B liked.
Item-based CF → Finds items similar to the ones you liked, then recommends them.
Example: If you liked Naruto, recommend Bleach because people who liked Naruto also liked Bleach.'''

'User-based CF → Finds users who are similar to you, then recommends items they liked.\nExample: If User A and User B have similar tastes, recommend to A what B liked.\nItem-based CF → Finds items similar to the ones you liked, then recommends them.\nExample: If you liked Naruto, recommend Bleach because people who liked Naruto also liked Bleach.'

In [None]:
'''2.Collaborative Filtering (CF) = Making recommendations based on the behavior of many users (ratings, clicks, purchases).
It works by finding patterns of similarity between users or items → “people who are similar like similar things.”'''