In [1]:
import pandas as pd

In [2]:
import warnings

# Blanket suppression: every warning category is ignored from here on.
warnings.simplefilter("ignore")

# Load the dataset and handle missing values

In [3]:
# Read the anime catalogue from a CSV path relative to the working directory
ANIME_CSV = "anime.csv"
anime_df = pd.read_csv(ANIME_CSV)

In [4]:
# Handle missing values.
# The filled columns are assigned back rather than calling
# fillna(inplace=True) on a column selection: the in-place form acts on an
# intermediate object, is deprecated, and does not update anime_df under
# pandas Copy-on-Write.
anime_df['genre'] = anime_df['genre'].fillna('Unknown')
anime_df['type'] = anime_df['type'].fillna('Unknown')
# Missing ratings are imputed with the mean of the known ratings
anime_df['rating'] = anime_df['rating'].fillna(anime_df['rating'].mean())
# 'Unknown' episode counts become 0 so the column can be cast to integers
anime_df['episodes'] = anime_df['episodes'].replace('Unknown', 0).astype(int)

In [5]:
# Explore the dataset: list each column's non-null count and dtype, which also
# shows whether any missing values remain after the clean-up step
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12294 non-null  object 
 4   episodes  12294 non-null  int32  
 5   rating    12294 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int32(1), int64(2), object(3)
memory usage: 624.4+ KB


In [6]:
# Preview the first five rows; a bare last expression lets the notebook render
# the frame as a table instead of wrapped plain text
anime_df.head()

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type  episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie         1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV        64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV        51    9.25   
3                                   Sci-Fi, Thriller     TV        24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV        51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


# Feature Extraction

In [7]:
# Build one text field per anime: the genre list, a space, then the media type
anime_df['combined_features'] = anime_df['genre'].str.cat(anime_df['type'], sep=' ')

In [8]:
# Convert text data into numerical vectors using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# TF-IDF vectorizer configured to drop scikit-learn's built-in English stop words.
# NOTE(review): tokenization is left at the library default, so multi-word or
# hyphenated genre names are presumably split into separate terms — confirm
# that this is intended.
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer

In [10]:
# Learn the vocabulary and IDF weights from every row of combined_features and
# encode each row as a sparse TF-IDF vector (one matrix row per anime)
tfidf_matrix = vectorizer.fit_transform(anime_df['combined_features'])
tfidf_matrix

<12294x52 sparse matrix of type '<class 'numpy.float64'>'
	with 52284 stored elements in Compressed Sparse Row format>

In [11]:
# Min-max scale 'rating' so the lowest rating maps to 0 and the highest to 1
rating_min = anime_df['rating'].min()
rating_span = anime_df['rating'].max() - rating_min
anime_df['normalized_rating'] = (anime_df['rating'] - rating_min) / rating_span

In [12]:
# Inspect the scaled rating column
anime_df['normalized_rating']

0        0.924370
1        0.911164
2        0.909964
3        0.900360
4        0.899160
           ...   
12289    0.297719
12290    0.313325
12291    0.385354
12292    0.397359
12293    0.454982
Name: normalized_rating, Length: 12294, dtype: float64

# Recommendation System

In [13]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors

In [14]:
# Fit a brute-force nearest-neighbour model over all TF-IDF rows, using the
# cosine metric (smaller distance = more similar feature text)
nn_model = NearestNeighbors(metric='cosine', algorithm='brute')
nn_model.fit(tfidf_matrix)

In [15]:
# Locate the query anime by exact title match
title = 'Fullmetal Alchemist: Brotherhood'
# Collect row positions (not index labels) of matching titles, because idx is
# used afterwards to select a row of tfidf_matrix.
match_positions = (anime_df['name'] == title).to_numpy().nonzero()[0]
if match_positions.size == 0:
    # Fail with a readable message instead of an opaque IndexError
    raise ValueError(f"Anime title not found in dataset: {title!r}")
# When several rows share the title, the first one is used
idx = int(match_positions[0])

In [16]:
# Ask for 11 neighbours of the query row: 10 recommendations plus one extra
# slot, since the query row is itself part of the fitted data
query_vector = tfidf_matrix[idx]
distances, indices = nn_model.kneighbors(query_vector, n_neighbors=11)

In [17]:
# Keep neighbours whose cosine distance is within the threshold.
# The query row is excluded by position rather than by assuming it is the
# first neighbour: rows with identical feature text also sit at distance 0,
# and the order among such ties is not guaranteed.
threshold = 0.5
recommended_indices = [
    i
    for i, d in zip(indices.flatten(), distances.flatten())
    if i != idx and d <= threshold
][:10]  # at most 10 recommendations, matching the 10 + 1 neighbours requested

In [18]:
# Select the recommended rows by position and keep only their titles
recommendations = anime_df.iloc[recommended_indices]['name']

# Show the recommended titles
print(recommendations)

200                                Fullmetal Alchemist
1558     Fullmetal Alchemist: The Sacred Star of Milos
402          Fullmetal Alchemist: Brotherhood Specials
4264                                    Tide-Line Blue
795                     Densetsu no Yuusha no Densetsu
290                        Magi: Sinbad no Bouken (TV)
101                         Magi: The Kingdom of Magic
10953            Chain Chronicle: Haecceitas no Hikari
268                       Magi: The Labyrinth of Magic
879                Tales of Vesperia: The First Strike
Name: name, dtype: object


# Evaluation

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [20]:
# Split the dataset into training and testing sets: 20% of the rows are held
# out, and the fixed random_state makes the split repeatable across runs
train_data, test_data = train_test_split(anime_df, test_size=0.2, random_state=42)

In [21]:
# Encode the training rows with the already-fitted vectorizer and refit the
# neighbour model on them.
# Note: this refits the same nn_model object used for the recommendations
# earlier, so from here on it only knows the training rows. The vectorizer
# itself was fitted on all rows, including the held-out ones.
train_features = train_data['combined_features']
tfidf_train = vectorizer.transform(train_features)
nn_model.fit(tfidf_train)

In [22]:
# Query the model with the first 50 held-out rows.
# Each test row's position in anime_df is taken from its own index label
# instead of searching anime_df by title: the title search rescans the whole
# frame for every row and, when two rows share a title, silently returns the
# first one rather than the held-out row.
indices = anime_df.index.get_indexer(test_data.index[:50]).tolist()
distances, neighbors = nn_model.kneighbors(tfidf_matrix[indices], n_neighbors=10 + 1)

In [23]:
# Translate neighbour positions into titles.
# nn_model was last fitted on the rows of train_data, so the positions that
# kneighbors returns index into train_data, not anime_df. The queries are
# held-out test rows that are absent from the fitted data, so there is no
# self-match to drop: keep the 10 closest neighbours.
recommended = [train_data['name'].iloc[neighbor[:10]].values for neighbor in neighbors]

In [24]:
# Wrap each of the first 50 test titles in its own one-element list
held_out_titles = test_data['name'].iloc[:50]
actual = [[name] for name in held_out_titles]

In [25]:
# Concatenate the per-query lists into two flat sequences
actual_flat = []
for titles in actual:
    actual_flat.extend(titles)

predicted_flat = []
for titles in recommended:
    predicted_flat.extend(titles)

In [26]:
# Cut both sequences down to the length of the shorter one.
# Entries are paired purely by position, so any surplus items at the end of
# the longer sequence are discarded.
min_len = min(map(len, (actual_flat, predicted_flat)))
actual_flat, predicted_flat = actual_flat[:min_len], predicted_flat[:min_len]


In [27]:
# Placeholder scores (a real evaluation needs ground-truth relevance labels).
# The "labels" compared here are title strings, so a position only counts as
# correct when the predicted title is exactly the held-out title.
micro_opts = {'average': 'micro', 'zero_division': 1}
precision = precision_score(actual_flat, predicted_flat, **micro_opts)
recall = recall_score(actual_flat, predicted_flat, **micro_opts)
f1 = f1_score(actual_flat, predicted_flat, **micro_opts)

In [28]:
# Report the three scores, one per line
for label, score in (("Precision:", precision), ("Recall:", recall), ("F1 Score:", f1)):
    print(label, score)

Precision: 0.0
Recall: 0.0
F1 Score: 0.0
