In [34]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [35]:
!pip install pandas scikit-learn




In [36]:
# Load the anime table from the CSV file.
anime_csv_path = '/content/anime.csv'
df = pd.read_csv(anime_csv_path)

In [37]:
# Print the first rows of the table as plain text.
first_rows = df.head()
print(first_rows)

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [38]:
# Show the DataFrame as this cell's output; the notebook renders a truncated view of it.
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [39]:
# Count the missing entries in every column and print the per-column totals
missing_per_column = df.isna().sum()
print(missing_per_column)

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [40]:
# Data preprocessing - 'genre' is free text, so turn it into a numeric TF-IDF matrix.
# Missing genres are replaced by an empty string first so every row can be vectorized.
df['genre'] = df['genre'].fillna('')
tfidf = TfidfVectorizer(stop_words='english')
genre_matrix = tfidf.fit_transform(df['genre'])

In [41]:
# Normalize numerical features: 'rating', 'episodes'
# Only the scaler object is created here; it is fitted and applied to those
# two columns in a later cell.
scaler = MinMaxScaler()

In [42]:
# Impute missing ratings with the mean of the observed ratings.
df['rating'] = df['rating'].fillna(df['rating'].mean())

# Parse episode counts as numbers. 'Unknown' (and any other non-numeric
# placeholder) is coerced to NaN and then counted as 0 episodes.
# Deliberately no integer cast: if this cell is executed a second time the
# column already holds scaled fractions, and casting them to int would
# truncate them to 0/1 and silently corrupt the feature.
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce').fillna(0)

# Rescale both numeric columns with the MinMaxScaler created in the previous cell.
df[['rating', 'episodes']] = scaler.fit_transform(df[['rating', 'episodes']])

In [43]:
# Build one feature table: the dense TF-IDF genre columns followed by the
# 'rating' and 'episodes' columns, aligned row by row on a fresh 0..n-1 index.
genre_features = pd.DataFrame(genre_matrix.toarray())
numeric_features = df[['rating', 'episodes']].reset_index(drop=True)
feature_matrix = pd.concat([genre_features, numeric_features], axis=1)


In [46]:
# Assuming the actual anime title column is 'name' (replace 'name' with the actual column name if different)
def recommend_anime(title, df, cosine_sim, top_n=10):
    """Return the names of the anime most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['name']``. When several rows match,
        the first matching row is used as the query.
    df : pandas.DataFrame
        Table with a 'name' column. Row position ``i`` of ``df`` must
        correspond to row/column ``i`` of ``cosine_sim``.
    cosine_sim : array-like
        Square similarity matrix, indexed by row position.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Names of the recommended anime, most similar first. The query row
        itself is never part of the result.

    Raises
    ------
    IndexError
        If no row of ``df['name']`` equals ``title``.
    """
    # Work with row *positions* (not index labels) so that the lookup into
    # cosine_sim and the final .iloc agree even when df has a non-default index.
    matches = np.flatnonzero((df['name'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError("No anime named {!r} found in df['name']".format(title))
    target_pos = int(matches[0])

    # Similarity of the query anime to every anime, as a flat float array.
    scores = np.asarray(cosine_sim[target_pos], dtype=float)

    # Rank positions by descending similarity; the stable sort keeps the
    # original row order among ties.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query explicitly instead of assuming it is ranked first:
    # another anime with an identical feature vector can tie with it.
    ranked = ranked[ranked != target_pos]

    # Return the most similar anime titles
    return df['name'].iloc[ranked[:max(top_n, 0)]]

# Pairwise cosine similarity between the feature vectors of all anime.
# No visible cell defines `cosine_sim`, so it is computed here to keep the
# notebook runnable from a fresh kernel.
# NOTE(review): assumes the similarity was originally built from
# `feature_matrix` (genre TF-IDF + rating + episodes) - confirm.
cosine_sim = cosine_similarity(feature_matrix)

# Example: Recommend anime similar to 'Naruto'
recommendations = recommend_anime('Naruto', df, cosine_sim, top_n=5)
print(recommendations)


1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
615                                    Naruto: Shippuuden
486                              Boruto: Naruto the Movie
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
Name: name, dtype: object


In [48]:
# Hold out 20% of the rows as a test set with a fixed seed so the split is
# repeatable (useful if you have user ratings for evaluation)
holdout_fraction = 0.2
split_seed = 42
train_df, test_df = train_test_split(
    df,
    test_size=holdout_fraction,
    random_state=split_seed,
)
