In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import random

# Load dataset
anime_df = pd.read_csv(r"C:\Users\atanu\OneDrive\Desktop\data science assignment\Recommendation System\anime.csv")

# Preprocessing
anime_df.dropna(subset=['genre', 'rating'], inplace=True)  # Drop rows with missing genres or ratings
anime_df.fillna({'episodes': 0}, inplace=True)

# Feature Extraction
# Use 'genre' as categorical feature
tfidf = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")  # Split genres properly
genre_matrix = tfidf.fit_transform(anime_df['genre'])

# Normalize numerical features (rating and members)
scaler = MinMaxScaler()
numeric_features = scaler.fit_transform(anime_df[['rating', 'members']])

# Combine genre and numeric features
from scipy.sparse import hstack
combined_features = hstack([genre_matrix, numeric_features])

# Cosine similarity matrix
cos_sim = cosine_similarity(combined_features)

# Recommendation Function
def recommend_anime(anime_name, top_n=5, threshold=0.2):
    if anime_name not in anime_df['name'].values:
        return f"Anime '{anime_name}' not found in dataset."
    
    idx = anime_df[anime_df['name'] == anime_name].index[0]
    similarity_scores = list(enumerate(cos_sim[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    recommendations = []
    for i, score in similarity_scores[1:]:
        if score > threshold:
            recommendations.append((anime_df.iloc[i]['name'], score))
        if len(recommendations) >= top_n:
            break
    return recommendations

# Example usage
print("Recommendations for 'Naruto':")
recs = recommend_anime('Naruto')
for title, score in recs:
    print(f" - {title} (score: {score:.2f})")

# Evaluation
# Simulate test set by randomly sampling anime titles
train_set, test_set = train_test_split(anime_df['name'].values, test_size=0.1, random_state=42)

y_true = []
y_pred = []

for anime in random.sample(list(test_set),100):
    recommendations = recommend_anime(anime_df['name'], top_n=5)
    recommended_titles = [r[0] for r in recommendations]
    actual = anime_df[anime_df['name'] == anime]['genre'].values[0].split(', ') if anime in anime_df['name'].values else []
    
    for rec in recommended_titles:
        if rec in anime_df['name'].values:
            rec_genre = anime_df[anime_df['name'] == rec]['genre'].values[0].split(', ')
            intersection = set(rec_genre).intersection(set(actual))
            y_true.append(1 if intersection else 0)
            y_pred.append(1)

# Evaluation Metrics
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print("\nEvaluation Results:")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-Score: {f1:.3f}")

Recommendations for 'Naruto':
 - Naruto: Shippuuden (score: 1.00)
 - Dragon Ball Z (score: 0.94)
 - Dragon Ball (score: 0.92)
 - Naruto: Shippuuden Movie 4 - The Lost Tower (score: 0.91)
 - Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono (score: 0.91)

Evaluation Results:
Precision: 0.470
Recall: 1.000
F1-Score: 0.639
