In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

print("Step 1: Data Preprocessing...")
file_path = "anime.csv"
anime_df = pd.read_csv(file_path)


anime_df.dropna(subset=['genre'], inplace=True)

anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)

anime_df['episodes'] = pd.to_numeric(anime_df['episodes'], errors='coerce').fillna(0).astype(int)

print("\nDataset Information:")
anime_df.info()
print("\nFirst 5 rows of the dataset:")
print(anime_df.head())

print("\nStep 2: Feature Extraction...")

anime_df['genre_list'] = anime_df['genre'].str.split(',').apply(lambda x: [g.strip() for g in x])

mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(anime_df['genre_list'])
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_)

numerical_features = anime_df[['rating', 'members', 'episodes']].values
scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(numerical_features)
numerical_df = pd.DataFrame(numerical_scaled, columns=['rating_scaled', 'members_scaled', 'episodes_scaled'])

features_df = pd.concat([genre_df.reset_index(drop=True), numerical_df.reset_index(drop=True)], axis=1)

print("\nStep 3: Building the Recommendation System...")

cosine_sim_matrix = cosine_similarity(features_df)
print(f"Cosine similarity matrix shape: {cosine_sim_matrix.shape}")

def recommend_anime(anime_title, sim_matrix, df, threshold=0.9, top_n=10):
    """
    Recommends a list of similar anime based on cosine similarity.
    
    Args:
        anime_title (str): The title of the target anime.
        sim_matrix (np.ndarray): The cosine similarity matrix.
        df (pd.DataFrame): The original anime dataframe.
        threshold (float): The minimum similarity score to consider a recommendation.
        top_n (int): The maximum number of recommendations to return.
        
    Returns:
        pd.DataFrame: A DataFrame of recommended anime with their similarity scores.
    """
    if anime_title not in df['name'].values:
        print(f"'{anime_title}' not found in the dataset.")
        return pd.DataFrame(columns=['name', 'similarity_score'])

    target_idx = df[df['name'] == anime_title].index[0]

    sim_scores = sim_matrix[target_idx]

    sim_df = pd.DataFrame({'name': df['name'], 'similarity_score': sim_scores})
    
    recommended_df = sim_df[sim_df['similarity_score'] >= threshold]
    recommended_df = recommended_df[recommended_df['name'] != anime_title]
    recommended_df = recommended_df.sort_values(by='similarity_score', ascending=False)
    
    return recommended_df.head(top_n)

target_anime = 'Fullmetal Alchemist: Brotherhood'
print(f"\nRecommendations for '{target_anime}':")
recommendations = recommend_anime(target_anime, cosine_sim_matrix, anime_df, threshold=0.8, top_n=10)
print(recommendations)

print("\nRecommendations with a lower threshold (0.6):")
recommendations_low_threshold = recommend_anime(target_anime, cosine_sim_matrix, anime_df, threshold=0.6, top_n=10)
print(recommendations_low_threshold)


print("\nStep 4: Evaluation...")

target_anime_eval = 'Cowboy Bebop'
ground_truth_relevant_anime = [
    'Samurai Champloo',  
    'Trigun',            
    'Ghost in the Shell', 
    'Darker than Black',  
]

predicted_recommendations_df = recommend_anime(target_anime_eval, cosine_sim_matrix, anime_df, threshold=0.6, top_n=20)
predicted_recommendations_list = predicted_recommendations_df['name'].tolist()

ground_truth_set = set(ground_truth_relevant_anime)
predicted_set = set(predicted_recommendations_list)

true_positives = len(ground_truth_set.intersection(predicted_set))
false_positives = len(predicted_set.difference(ground_truth_set))
false_negatives = len(ground_truth_set.difference(predicted_set))

precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print(f"\nEvaluation Metrics for '{target_anime_eval}':")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Step 1: Data Preprocessing...

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
Index: 12232 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12232 non-null  int64  
 1   name      12232 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12210 non-null  object 
 4   episodes  12232 non-null  int64  
 5   rating    12232 non-null  float64
 6   members   12232 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 764.5+ KB

First 5 rows of the dataset:
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type  episodes  rating  \
0               Drama, R

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)


Cosine similarity matrix shape: (12232, 12232)

Recommendations for 'Fullmetal Alchemist: Brotherhood':
                                                  name  similarity_score
200                                Fullmetal Alchemist          0.940303
1558     Fullmetal Alchemist: The Sacred Star of Milos          0.909676
402          Fullmetal Alchemist: Brotherhood Specials          0.904959
268                       Magi: The Labyrinth of Magic          0.858158
101                         Magi: The Kingdom of Magic          0.853412
795                     Densetsu no Yuusha no Densetsu          0.842569
290                        Magi: Sinbad no Bouken (TV)          0.837816
461                             Magi: Sinbad no Bouken          0.835301
879                Tales of Vesperia: The First Strike          0.832303
1991  Tsubasa Chronicle: Tori Kago no Kuni no Himegimi          0.831379

Recommendations with a lower threshold (0.6):
                                              