In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score

print("Loading the anime dataset...")
anime_df = pd.read_csv("anime.csv")

print("Initial data shape:", anime_df.shape)
anime_df.dropna(subset=['genre', 'rating'], inplace=True)
print("Shape after dropping rows with missing values:", anime_df.shape)


anime_df['episodes'] = anime_df['episodes'].replace('Unknown', 1)
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'])

print("\nData Exploration")
print(anime_df.info())
print("\nSample of the processed data:")
print(anime_df[['name', 'genre', 'rating', 'members', 'episodes']].head())

print("\nFeature Extraction")
print("Processing genres...")
genres_df = anime_df['genre'].str.split(', ').apply(lambda x: ' '.join(x))
genre_vectorizer = CountVectorizer()
genre_matrix = genre_vectorizer.fit_transform(genres_df)


print("Scaling numerical features...")
numerical_features = ['rating', 'members', 'episodes']
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(anime_df[numerical_features])
scaled_features_df = pd.DataFrame(scaled_features, columns=numerical_features, index=anime_df.index)


print("Combining all features...")
feature_matrix = np.hstack((genre_matrix.toarray(), scaled_features_df))
print("Final feature matrix shape:", feature_matrix.shape)


print("\nBuilding Recommendation System")
print("Calculating cosine similarity matrix...")
cosine_sim = cosine_similarity(feature_matrix)
print("Cosine similarity matrix shape:", cosine_sim.shape)

def recommend_anime(anime_title, num_recommendations=10):
    """
    Recommends a list of anime similar to the input anime based on cosine similarity.
    """
    if anime_title not in anime_df['name'].values:
        print(f"'{anime_title}' not found in the dataset.")
        return []
    
    idx = anime_df.index[anime_df['name'] == anime_title].tolist()[0]
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    sim_scores = sim_scores[1:num_recommendations+1]
    
    anime_indices = [i[0] for i in sim_scores]
    
    recommended_anime = anime_df.loc[anime_indices, ['name', 'genre', 'rating']]
    return recommended_anime

print("\nSample Recommendations")
target_anime = 'Death Note'
recommendations = recommend_anime(target_anime, num_recommendations=5)
print(f"Recommendations for '{target_anime}':\n")
print(recommendations)

def recommend_by_threshold(anime_title, threshold=0.75):
    """
    Recommends anime with a similarity score above a given threshold.
    """
    if anime_title not in anime_df['name'].values:
        print(f"'{anime_title}' not found in the dataset.")
        return []

    idx = anime_df.index[anime_df['name'] == anime_title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Filter for scores above the threshold and sort
    sim_scores = [score for score in sim_scores if score[1] > threshold]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    anime_indices = [i[0] for i in sim_scores]
    
    recommended_anime = anime_df.loc[anime_indices, ['name', 'genre', 'rating']]
    return recommended_anime.iloc[1:] # Exclude the target anime itself

print("\n--- Recommendations with a Similarity Threshold of 0.75 ---")
recs_threshold = recommend_by_threshold('Code Geass: Hangyaku no Lelouch R2', threshold=0.75)
print(recs_threshold)





test_set_titles = [
    'Steins;Gate',
    'Fullmetal Alchemist: Brotherhood',
    'Hunter x Hunter (2011)',
    'Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou'
]


def recommend_anime(anime_title, num_recommendations=10):
    """
    Retrieves recommendations based on cosine similarity scores.
    """
    if anime_title not in anime_df['name'].values:
        return []
    
    idx = anime_df.index[anime_df['name'] == anime_title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    
    anime_indices = [i[0] for i in sim_scores]
    return anime_df.loc[anime_indices, 'name'].tolist()


target_anime = test_set_titles[0]
recommendations = recommend_anime(target_anime, num_recommendations=10)


y_true = []
y_pred = []

for rec in recommendations:
    if rec in test_set_titles[1:]:
        y_pred.append(1)  
    else:
        y_pred.append(0)  
    y_true.append(1 if rec in test_set_titles[1:] else 0)


precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)


print("\nEvaluation of Simulated Test")
print(f"Target Anime: {target_anime}")
print(f"Simulated Ground Truth (Rest of Test Set): {test_set_titles[1:]}")
print(f"Model Recommendations: {recommendations}")
print("\nPerformance Metrics")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")




print(" --------------Interview Question Answers:-----------------")
print(f" 1Q: User-based collaborative filtering finds similar users and recommends items they liked, item-based collaborative filtering finds similar items and recommends items similar to what the user already liked.")
print(f" 2Q: Collaborative filtering is a recommendation technique that uses the collective preferences of a group of users to predict what a user will like, working by finding similarities between either users or items to generate recommendations.")

Loading the anime dataset...
Initial data shape: (12294, 7)
Shape after dropping rows with missing values: (12017, 7)

Data Exploration
<class 'pandas.core.frame.DataFrame'>
Index: 12017 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12017 non-null  int64  
 1   name      12017 non-null  object 
 2   genre     12017 non-null  object 
 3   type      12017 non-null  object 
 4   episodes  12017 non-null  int64  
 5   rating    12017 non-null  float64
 6   members   12017 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 751.1+ KB
None

Sample of the processed data:
                               name  \
0                    Kimi no Na wa.   
1  Fullmetal Alchemist: Brotherhood   
2                          Gintama°   
3                       Steins;Gate   
4                     Gintama&#039;   

                                               genre  rating  members  \
0 